diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index 4ebf304e23d58..2e503c867403b 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -68,7 +68,7 @@ function compute-projects-to-test() {
       done
     ;;
     clang)
-      for p in clang-tools-extra compiler-rt flang libc lldb openmp cross-project-tests; do
+      for p in clang-tools-extra compiler-rt flang lldb cross-project-tests; do
         echo $p
       done
     ;;
@@ -224,7 +224,7 @@ fi
 # needs while letting them run on the infrastructure provided by LLVM.
 
 # Figure out which projects need to be built on each platform
-all_projects="bolt clang-tools-extra compiler-rt cross-project-tests flang libc libclc lld lldb llvm mlir openmp polly pstl"
+all_projects="bolt clang clang-tools-extra compiler-rt cross-project-tests flang libc libclc lld lldb llvm mlir openmp polly pstl"
 modified_projects="$(keep-modified-projects ${all_projects})"
 
 linux_projects_to_test=$(exclude-linux $(compute-projects-to-test ${modified_projects}))
diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index 9a580c6d6984e..a0428336d300f 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -629,6 +629,8 @@ backend:DirectX:
   - '**/*DirectX*/**'
   - '**/*DXIL*/**'
   - '**/*dxil*/**'
+  - '**/*DXContainer*'
+  - '**/*DXContainer*/**'
 
 backend:SPIR-V:
   - clang/lib/Driver/ToolChains/SPIRV.*
@@ -933,3 +935,6 @@ openmp:libomp:
 
 openmp:libomptarget:
   - any: ['openmp/**', '!openmp/runtime/**']
+
+bazel:
+  - utils/bazel/**
diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml
index 18a27392cc2c8..0a228c41f354e 100644
--- a/.github/workflows/llvm-project-tests.yml
+++ b/.github/workflows/llvm-project-tests.yml
@@ -118,6 +118,11 @@ jobs:
           else
             builddir="$(pwd)"/build
           fi
+          if [ "${{ runner.os }}" == "macOS" ]; then
+            # Workaround test failure on some lld tests on MacOS
+            # https://github.com/llvm/llvm-project/issues/81967
+            extra_cmake_args="-DLLVM_DISABLE_ASSEMBLY_FILES=ON"
+          fi
           echo "llvm-builddir=$builddir" >> "$GITHUB_OUTPUT"
           cmake -G Ninja \
                 -B "$builddir" \
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index c432c206c47ea..0c46fda313960 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -42,6 +42,7 @@ jobs:
       BASELINE_REF: ${{ steps.vars.outputs.BASELINE_REF }}
       ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }}
       BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }}
+      BASELINE_VERSION_MINOR: ${{ steps.vars.outputs.BASELINE_VERSION_MINOR }}
       LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }}
       LLVM_VERSION_MINOR: ${{ steps.version.outputs.LLVM_VERSION_MINOR }}
       LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }}
@@ -58,7 +59,14 @@ jobs:
       - name: Setup Variables
         id: vars
         run: |
-          if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 ] || [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then
+          # C++ ABI:
+          # 18.1.0 we aren't doing ABI checks.
+          # 18.1.1 We want to check 18.1.0.
+          # C ABI:
+          # 18.1.0 We want to check 17.0.x
+          # 18.1.1 We want to check 18.1.0
+          echo "BASELINE_VERSION_MINOR=1" >> "$GITHUB_OUTPUT"
+          if [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then
             {
               echo "BASELINE_VERSION_MAJOR=$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1))"
               echo "ABI_HEADERS=llvm-c"
@@ -82,7 +90,7 @@ jobs:
         include:
           - name: build-baseline
             llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}
-            ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.0.0
+            ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MINOR }}.0
             repo: llvm/llvm-project
           - name: build-latest
             llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }}
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index a03a29602c7f9..078f070ffe54c 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -56,7 +56,7 @@ jobs:
       - name: Install clang-format
         uses: aminya/setup-cpp@v1
         with:
-          clangformat: 17.0.1
+          clangformat: 18.1.1
 
       - name: Setup Python env
         uses: actions/setup-python@v5
diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h
index 4debf93633547..06084819ec0b3 100644
--- a/bolt/include/bolt/Core/DIEBuilder.h
+++ b/bolt/include/bolt/Core/DIEBuilder.h
@@ -209,7 +209,13 @@ class DIEBuilder {
   void updateReferences();
 
   /// Update the Offset and Size of DIE, populate DebugNames table.
-  uint32_t finalizeDIEs(DWARFUnit &CU, DIE &Die, uint32_t &CurOffset);
+  /// Along with current CU, and DIE being processed and the new DIE offset to
+  /// be updated, it takes in Parents vector that can be empty if this DIE has
+  /// no parents.
+  uint32_t
+  finalizeDIEs(DWARFUnit &CU, DIE &Die,
+               std::vector<std::optional<BOLTDWARF5AccelTableData *>> &Parents,
+               uint32_t &CurOffset);
 
   void registerUnit(DWARFUnit &DU, bool NeedSort);
 
diff --git a/bolt/include/bolt/Core/DebugNames.h b/bolt/include/bolt/Core/DebugNames.h
index 84c448aa1354c..1f17f1ae4d139 100644
--- a/bolt/include/bolt/Core/DebugNames.h
+++ b/bolt/include/bolt/Core/DebugNames.h
@@ -36,6 +36,9 @@ class BOLTDWARF5AccelTableData : public DWARF5AccelTableData {
   bool isTU() const { return DWARF5AccelTableData::isTU(); }
   std::optional<unsigned> getSecondUnitID() const { return SecondUnitID; }
 
+  void setPatchOffset(uint64_t PatchOffset) { OffsetVal = PatchOffset; }
+  uint64_t getPatchOffset() const { return std::get<uint64_t>(OffsetVal); }
+
 private:
   std::optional<unsigned> SecondUnitID;
 };
@@ -49,10 +52,12 @@ class DWARF5AcceleratorTable {
       Abbrev->~DebugNamesAbbrev();
   }
   /// Add DWARF5 Accelerator table entry.
-  /// Input is DWARFUnit being processed, DIE that belongs to it, and potential
-  /// SkeletonCU if the Unit comes from a DWO section.
-  void addAccelTableEntry(DWARFUnit &Unit, const DIE &Die,
-                          const std::optional<uint64_t> &DWOID);
+  /// Input is DWARFUnit being processed, DIE that belongs to it, potential
+  /// DWOID if the Unit comes from a DWO section, and potential parent entry.
+  std::optional<BOLTDWARF5AccelTableData *>
+  addAccelTableEntry(DWARFUnit &Unit, const DIE &Die,
+                     const std::optional<uint64_t> &DWOID,
+                     std::optional<BOLTDWARF5AccelTableData *> &Parent);
   /// Set current unit being processed.
   void setCurrentUnit(DWARFUnit &Unit, const uint64_t UnitStartOffset);
   /// Emit Accelerator table.
@@ -121,6 +126,8 @@ class DWARF5AcceleratorTable {
   llvm::DenseMap<llvm::hash_code, uint64_t> StrCacheToOffsetMap;
   // Contains DWO ID to CUList Index.
   llvm::DenseMap<uint64_t, uint32_t> CUOffsetsToPatch;
+  // Contains a map of Entry ID to Entry relative offset.
+  llvm::DenseMap<uint64_t, uint32_t> EntryRelativeOffsets;
   /// Adds Unit to either CUList, LocalTUList or ForeignTUList.
   /// Input Unit being processed, and DWO ID if Unit is being processed comes
   /// from a DWO section.
@@ -143,7 +150,7 @@ class DWARF5AcceleratorTable {
   /// Write Entries.
   void writeEntries();
   /// Write an Entry.
-  void writeEntry(const BOLTDWARF5AccelTableData &Entry);
+  void writeEntry(BOLTDWARF5AccelTableData &Entry);
   /// Write augmentation_string for BOLT.
   void writeAugmentationString();
   /// Emit out Header for DWARF5 Accelerator table.
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 6bb76d1b917db..96b58f5416234 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -487,10 +487,9 @@ class MCPlusBuilder {
     llvm_unreachable("not implemented");
   }
 
-  virtual bool createDirectCall(MCInst &Inst, const MCSymbol *Target,
+  virtual void createDirectCall(MCInst &Inst, const MCSymbol *Target,
                                 MCContext *Ctx, bool IsTailCall) {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   virtual MCPhysReg getX86R11() const { llvm_unreachable("not implemented"); }
@@ -1534,15 +1533,13 @@ class MCPlusBuilder {
   }
 
   /// Create a no-op instruction.
-  virtual bool createNoop(MCInst &Inst) const {
+  virtual void createNoop(MCInst &Inst) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Create a return instruction.
-  virtual bool createReturn(MCInst &Inst) const {
+  virtual void createReturn(MCInst &Inst) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Store \p Target absolute address to \p RegName
@@ -1556,32 +1553,30 @@ class MCPlusBuilder {
 
   /// Creates a new unconditional branch instruction in Inst and set its operand
   /// to TBB.
-  ///
-  /// Returns true on success.
-  virtual bool createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
+  virtual void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
                                   MCContext *Ctx) const {
     llvm_unreachable("not implemented");
-    return false;
+  }
+
+  /// Create a version of unconditional jump that has the largest span for a
+  /// single instruction with direct target.
+  virtual void createLongUncondBranch(MCInst &Inst, const MCSymbol *Target,
+                                      MCContext *Ctx) const {
+    llvm_unreachable("not implemented");
   }
 
   /// Creates a new call instruction in Inst and sets its operand to
   /// Target.
-  ///
-  /// Returns true on success.
-  virtual bool createCall(MCInst &Inst, const MCSymbol *Target,
+  virtual void createCall(MCInst &Inst, const MCSymbol *Target,
                           MCContext *Ctx) {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Creates a new tail call instruction in Inst and sets its operand to
   /// Target.
-  ///
-  /// Returns true on success.
-  virtual bool createTailCall(MCInst &Inst, const MCSymbol *Target,
+  virtual void createTailCall(MCInst &Inst, const MCSymbol *Target,
                               MCContext *Ctx) {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   virtual void createLongTailCall(InstructionListType &Seq,
@@ -1590,43 +1585,36 @@ class MCPlusBuilder {
   }
 
   /// Creates a trap instruction in Inst.
-  ///
-  /// Returns true on success.
-  virtual bool createTrap(MCInst &Inst) const {
+  virtual void createTrap(MCInst &Inst) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Creates an instruction to bump the stack pointer just like a call.
-  virtual bool createStackPointerIncrement(MCInst &Inst, int Size = 8,
+  virtual void createStackPointerIncrement(MCInst &Inst, int Size = 8,
                                            bool NoFlagsClobber = false) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Creates an instruction to move the stack pointer just like a ret.
-  virtual bool createStackPointerDecrement(MCInst &Inst, int Size = 8,
+  virtual void createStackPointerDecrement(MCInst &Inst, int Size = 8,
                                            bool NoFlagsClobber = false) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Create a store instruction using \p StackReg as the base register
   /// and \p Offset as the displacement.
-  virtual bool createSaveToStack(MCInst &Inst, const MCPhysReg &StackReg,
+  virtual void createSaveToStack(MCInst &Inst, const MCPhysReg &StackReg,
                                  int Offset, const MCPhysReg &SrcReg,
                                  int Size) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
-  virtual bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
+  virtual void createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
                           const MCPhysReg &IndexReg, int64_t Offset,
                           const MCExpr *OffsetExpr,
                           const MCPhysReg &AddrSegmentReg,
                           const MCPhysReg &DstReg, int Size) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   virtual InstructionListType createLoadImmediate(const MCPhysReg Dest,
@@ -1636,32 +1624,27 @@ class MCPlusBuilder {
 
   /// Create a fragment of code (sequence of instructions) that load a 32-bit
   /// address from memory, zero-extends it to 64 and jump to it (indirect jump).
-  virtual bool
+  virtual void
   createIJmp32Frag(SmallVectorImpl<MCInst> &Insts, const MCOperand &BaseReg,
                    const MCOperand &Scale, const MCOperand &IndexReg,
                    const MCOperand &Offset, const MCOperand &TmpReg) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Create a load instruction using \p StackReg as the base register
   /// and \p Offset as the displacement.
-  virtual bool createRestoreFromStack(MCInst &Inst, const MCPhysReg &StackReg,
+  virtual void createRestoreFromStack(MCInst &Inst, const MCPhysReg &StackReg,
                                       int Offset, const MCPhysReg &DstReg,
                                       int Size) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Creates a call frame pseudo instruction. A single operand identifies which
   /// MCCFIInstruction this MCInst is referring to.
-  ///
-  /// Returns true on success.
-  virtual bool createCFI(MCInst &Inst, int64_t Offset) const {
+  virtual void createCFI(MCInst &Inst, int64_t Offset) const {
     Inst.clear();
     Inst.setOpcode(TargetOpcode::CFI_INSTRUCTION);
     Inst.addOperand(MCOperand::createImm(Offset));
-    return true;
   }
 
   /// Create an inline version of memcpy(dest, src, 1).
@@ -1699,6 +1682,12 @@ class MCPlusBuilder {
     return Inst.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
   }
 
+  /// Create a conditional branch with a target-specific conditional code \p CC.
+  virtual void createCondBranch(MCInst &Inst, const MCSymbol *Target,
+                                unsigned CC, MCContext *Ctx) const {
+    llvm_unreachable("not implemented");
+  }
+
   /// Reverses the branch condition in Inst and update its taken target to TBB.
   ///
   /// Returns true on success.
diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp
index 42287978a8576..0cf8a5e8c2c3d 100644
--- a/bolt/lib/Core/DIEBuilder.cpp
+++ b/bolt/lib/Core/DIEBuilder.cpp
@@ -377,20 +377,32 @@ getUnitForOffset(DIEBuilder &Builder, DWARFContext &DWCtx,
   return nullptr;
 }
 
-uint32_t DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
-                                  uint32_t &CurOffset) {
+uint32_t DIEBuilder::finalizeDIEs(
+    DWARFUnit &CU, DIE &Die,
+    std::vector<std::optional<BOLTDWARF5AccelTableData *>> &Parents,
+    uint32_t &CurOffset) {
   getState().DWARFDieAddressesParsed.erase(Die.getOffset());
   uint32_t CurSize = 0;
   Die.setOffset(CurOffset);
-  DebugNamesTable.addAccelTableEntry(
-      CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt);
+  std::optional<BOLTDWARF5AccelTableData *> NameEntry =
+      DebugNamesTable.addAccelTableEntry(
+          CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt,
+          Parents.back());
+  // It is possible that an indexed debugging information entry has a parent
+  // that is not indexed (for example, if its parent does not have a name
+  // attribute). In such a case, a parent attribute may point to a nameless
+  // index entry (that is, one that cannot be reached from any entry in the name
+  // table), or it may point to the nearest ancestor that does have an index
+  // entry.
+  if (NameEntry)
+    Parents.push_back(std::move(NameEntry));
   for (DIEValue &Val : Die.values())
     CurSize += Val.sizeOf(CU.getFormParams());
   CurSize += getULEB128Size(Die.getAbbrevNumber());
   CurOffset += CurSize;
 
   for (DIE &Child : Die.children()) {
-    uint32_t ChildSize = finalizeDIEs(CU, Child, CurOffset);
+    uint32_t ChildSize = finalizeDIEs(CU, Child, Parents, CurOffset);
     CurSize += ChildSize;
   }
   // for children end mark.
@@ -400,6 +412,8 @@ uint32_t DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
   }
 
   Die.setSize(CurSize);
+  if (NameEntry)
+    Parents.pop_back();
 
   return CurSize;
 }
@@ -410,7 +424,9 @@ void DIEBuilder::finish() {
     uint32_t HeaderSize = CU.getHeaderSize();
     uint32_t CurOffset = HeaderSize;
     DebugNamesTable.setCurrentUnit(CU, UnitStartOffset);
-    finalizeDIEs(CU, *UnitDIE, CurOffset);
+    std::vector<std::optional<BOLTDWARF5AccelTableData *>> Parents;
+    Parents.push_back(std::nullopt);
+    finalizeDIEs(CU, *UnitDIE, Parents, CurOffset);
 
     DWARFUnitInfo &CurUnitInfo = getUnitInfoByDwarfUnit(CU);
     CurUnitInfo.UnitOffset = UnitStartOffset;
diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp
index 384e63695dfdc..23a29f52513c0 100644
--- a/bolt/lib/Core/DebugNames.cpp
+++ b/bolt/lib/Core/DebugNames.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/LEB128.h"
 #include <cstdint>
+#include <optional>
 
 namespace llvm {
 namespace bolt {
@@ -163,10 +164,16 @@ static uint64_t getNameOffset(BinaryContext &BC, DWARFUnit &Unit,
                                    Index * DwarfOffsetByteSize);
 }
 
-void DWARF5AcceleratorTable::addAccelTableEntry(
-    DWARFUnit &Unit, const DIE &Die, const std::optional<uint64_t> &DWOID) {
+static uint64_t getEntryID(const BOLTDWARF5AccelTableData &Entry) {
+  return reinterpret_cast<uint64_t>(&Entry);
+}
+
+std::optional<BOLTDWARF5AccelTableData *>
+DWARF5AcceleratorTable::addAccelTableEntry(
+    DWARFUnit &Unit, const DIE &Die, const std::optional<uint64_t> &DWOID,
+    std::optional<BOLTDWARF5AccelTableData *> &Parent) {
   if (Unit.getVersion() < 5 || !NeedToCreate)
-    return;
+    return std::nullopt;
   std::string NameToUse = "";
   auto canProcess = [&](const DIE &Die) -> bool {
     switch (Die.getTag()) {
@@ -217,7 +224,7 @@ void DWARF5AcceleratorTable::addAccelTableEntry(
   };
 
   if (!canProcess(Die))
-    return;
+    return std::nullopt;
 
   // Addes a Unit to either CU, LocalTU or ForeignTU list the first time we
   // encounter it.
@@ -227,10 +234,10 @@ void DWARF5AcceleratorTable::addAccelTableEntry(
     addUnit(Unit, DWOID);
   }
 
-  auto addEntry = [&](DIEValue ValName) -> void {
+  auto getName = [&](DIEValue ValName) -> std::optional<std::string> {
     if ((!ValName || ValName.getForm() == dwarf::DW_FORM_string) &&
         NameToUse.empty())
-      return;
+      return std::nullopt;
     std::string Name = "";
     uint64_t NameIndexOffset = 0;
     if (NameToUse.empty()) {
@@ -260,7 +267,16 @@ void DWARF5AcceleratorTable::addAccelTableEntry(
       // This the same hash function used in DWARF5AccelTableData.
       It.HashValue = caseFoldingDjbHash(Name);
     }
+    return Name;
+  };
+
+  auto addEntry =
+      [&](DIEValue ValName) -> std::optional<BOLTDWARF5AccelTableData *> {
+    std::optional<std::string> Name = getName(ValName);
+    if (!Name)
+      return std::nullopt;
 
+    auto &It = Entries[*Name];
     bool IsTU = false;
     uint32_t DieTag = 0;
     uint32_t UnitID = getUnitID(Unit, IsTU, DieTag);
@@ -270,18 +286,53 @@ void DWARF5AcceleratorTable::addAccelTableEntry(
       if (Iter == CUOffsetsToPatch.end())
         BC.errs() << "BOLT-WARNING: [internal-dwarf-warning]: Could not find "
                      "DWO ID in CU offsets for second Unit Index "
-                  << Name << ". For DIE at offset: "
+                  << *Name << ". For DIE at offset: "
                   << Twine::utohexstr(CurrentUnitOffset + Die.getOffset())
                   << ".\n";
       SecondIndex = Iter->second;
     }
+    std::optional<uint64_t> ParentOffset =
+        (Parent ? std::optional<uint64_t>(getEntryID(**Parent)) : std::nullopt);
+    // This will be populated later in writeEntry.
+    // This way only parent entries get tracked.
+    // Keeping memory footprint down.
+    if (ParentOffset)
+      EntryRelativeOffsets.insert({*ParentOffset, 0});
     It.Values.push_back(new (Allocator) BOLTDWARF5AccelTableData(
-        Die.getOffset(), std::nullopt, DieTag, UnitID, IsTU, SecondIndex));
+        Die.getOffset(), ParentOffset, DieTag, UnitID, IsTU, SecondIndex));
+    return It.Values.back();
   };
 
-  addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_name));
-  addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_linkage_name));
-  return;
+  // Minor optimization not to add entry twice for DW_TAG_namespace if it has no
+  // DW_AT_name.
+  if (!(Die.getTag() == dwarf::DW_TAG_namespace &&
+        !Die.findAttribute(dwarf::Attribute::DW_AT_name)))
+    addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_linkage_name));
+  // For the purposes of determining whether a debugging information entry has a
+  // particular attribute (such as DW_AT_name), if debugging information entry A
+  // has a DW_AT_specification or DW_AT_abstract_origin attribute pointing to
+  // another debugging information entry B, any attributes of B are considered
+  // to be part of A.
+  auto processReferencedDie = [&](const dwarf::Attribute &Attr)
+      -> std::optional<BOLTDWARF5AccelTableData *> {
+    const DIEValue Value = Die.findAttribute(Attr);
+    if (!Value)
+      return std::nullopt;
+    const DIEEntry &DIEENtry = Value.getDIEEntry();
+    DIE &EntryDie = DIEENtry.getEntry();
+    addEntry(EntryDie.findAttribute(dwarf::Attribute::DW_AT_linkage_name));
+    return addEntry(EntryDie.findAttribute(dwarf::Attribute::DW_AT_name));
+  };
+
+  if (std::optional<BOLTDWARF5AccelTableData *> Entry =
+          processReferencedDie(dwarf::Attribute::DW_AT_abstract_origin))
+    return *Entry;
+  if (std::optional<BOLTDWARF5AccelTableData *> Entry =
+          processReferencedDie(dwarf::Attribute::DW_AT_specification))
+    return *Entry;
+
+  return addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_name));
+  ;
 }
 
 /// Algorithm from llvm implementation.
@@ -382,6 +433,11 @@ void DWARF5AcceleratorTable::populateAbbrevsMap() {
         if (SecondEntryRet)
           Abbrev.addAttribute(SecondEntryRet->Encoding);
         Abbrev.addAttribute({dwarf::DW_IDX_die_offset, dwarf::DW_FORM_ref4});
+        if (std::optional<uint64_t> Offset = Value->getParentDieOffset())
+          Abbrev.addAttribute({dwarf::DW_IDX_parent, dwarf::DW_FORM_ref4});
+        else
+          Abbrev.addAttribute(
+              {dwarf::DW_IDX_parent, dwarf::DW_FORM_flag_present});
         FoldingSetNodeID ID;
         Abbrev.Profile(ID);
         void *InsertPos;
@@ -401,7 +457,11 @@ void DWARF5AcceleratorTable::populateAbbrevsMap() {
   }
 }
 
-void DWARF5AcceleratorTable::writeEntry(const BOLTDWARF5AccelTableData &Entry) {
+void DWARF5AcceleratorTable::writeEntry(BOLTDWARF5AccelTableData &Entry) {
+  const uint64_t EntryID = getEntryID(Entry);
+  if (EntryRelativeOffsets.find(EntryID) != EntryRelativeOffsets.end())
+    EntryRelativeOffsets[EntryID] = EntriesBuffer->size();
+
   const std::optional<DWARF5AccelTable::UnitIndexAndEncoding> EntryRet =
       getIndexForEntry(Entry);
   // For forgeign type (FTU) units that need to refer to the FTU and to the CU.
@@ -456,6 +516,17 @@ void DWARF5AcceleratorTable::writeEntry(const BOLTDWARF5AccelTableData &Entry) {
                              llvm::endianness::little);
       break;
     }
+    case dwarf::DW_IDX_parent: {
+      assert(
+          (AttrEnc.Form == dwarf::DW_FORM_ref4 && Entry.getParentDieOffset()) ||
+          AttrEnc.Form == dwarf::DW_FORM_flag_present);
+      if (std::optional<uint64_t> ParentOffset = Entry.getParentDieOffset()) {
+        Entry.setPatchOffset(EntriesBuffer->size());
+        support::endian::write(*Entriestream, static_cast<uint32_t>(UINT32_MAX),
+                               llvm::endianness::little);
+      }
+      break;
+    }
     }
   }
 }
@@ -464,13 +535,34 @@ void DWARF5AcceleratorTable::writeEntries() {
   for (auto &Bucket : getBuckets()) {
     for (DWARF5AcceleratorTable::HashData *Hash : Bucket) {
       Hash->EntryOffset = EntriesBuffer->size();
-      for (const BOLTDWARF5AccelTableData *Value : Hash->Values) {
+      for (BOLTDWARF5AccelTableData *Value : Hash->Values) {
         writeEntry(*Value);
       }
       support::endian::write(*Entriestream, static_cast<uint8_t>(0),
                              llvm::endianness::little);
     }
   }
+  // Patching parent offsets.
+  for (auto &Bucket : getBuckets()) {
+    for (DWARF5AcceleratorTable::HashData *Hash : Bucket) {
+      for (BOLTDWARF5AccelTableData *Entry : Hash->Values) {
+        std::optional<uint64_t> ParentOffset = Entry->getParentDieOffset();
+        if (!ParentOffset)
+          continue;
+        if (const auto Iter = EntryRelativeOffsets.find(*ParentOffset);
+            Iter != EntryRelativeOffsets.end()) {
+          const uint64_t PatchOffset = Entry->getPatchOffset();
+          uint32_t *Ptr = reinterpret_cast<uint32_t *>(
+              &EntriesBuffer.get()->data()[PatchOffset]);
+          *Ptr = Iter->second;
+        } else {
+          BC.errs() << "BOLT-WARNING: [internal-dwarf-warning]: Could not find "
+                       "entry with offset "
+                    << *ParentOffset << "\n";
+        }
+      }
+    }
+  }
 }
 
 void DWARF5AcceleratorTable::writeAugmentationString() {
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d2850b03e2e13..bf1c2ddd37dd2 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1056,10 +1056,9 @@ void Peepholes::addTailcallTraps(BinaryFunction &Function) {
     MCInst *Inst = BB.getLastNonPseudoInstr();
     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
       MCInst Trap;
-      if (MIB->createTrap(Trap)) {
-        BB.addInstruction(Trap);
-        ++TailCallTraps;
-      }
+      MIB->createTrap(Trap);
+      BB.addInstruction(Trap);
+      ++TailCallTraps;
     }
   }
 }
diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp
index 9a1f9d72623a3..c9706500758d1 100644
--- a/bolt/lib/Passes/ShrinkWrapping.cpp
+++ b/bolt/lib/Passes/ShrinkWrapping.cpp
@@ -680,16 +680,15 @@ void StackLayoutModifier::performChanges() {
       if (StackPtrReg != BC.MIB->getFramePointer())
         Adjustment = -Adjustment;
       if (IsLoad)
-        Success = BC.MIB->createRestoreFromStack(
-            Inst, StackPtrReg, StackOffset + Adjustment, Reg, Size);
+        BC.MIB->createRestoreFromStack(Inst, StackPtrReg,
+                                       StackOffset + Adjustment, Reg, Size);
       else if (IsStore)
-        Success = BC.MIB->createSaveToStack(
-            Inst, StackPtrReg, StackOffset + Adjustment, Reg, Size);
+        BC.MIB->createSaveToStack(Inst, StackPtrReg, StackOffset + Adjustment,
+                                  Reg, Size);
       LLVM_DEBUG({
         dbgs() << "Adjusted instruction: ";
         Inst.dump();
       });
-      assert(Success);
     }
   }
 }
@@ -1653,19 +1652,13 @@ Expected<MCInst> ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
   if (SPVal != StackPointerTracking::SUPERPOSITION &&
       SPVal != StackPointerTracking::EMPTY) {
     if (FIE.IsLoad) {
-      if (!BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getStackPointer(),
-                                          FIE.StackOffset - SPVal, FIE.RegOrImm,
-                                          FIE.Size)) {
-        return createFatalBOLTError(
-            "createRestoreFromStack: not supported on this platform\n");
-      }
-    } else {
-      if (!BC.MIB->createSaveToStack(NewInst, BC.MIB->getStackPointer(),
+      BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getStackPointer(),
                                      FIE.StackOffset - SPVal, FIE.RegOrImm,
-                                     FIE.Size)) {
-        return createFatalBOLTError(
-            "createSaveToStack: not supported on this platform\n");
-      }
+                                     FIE.Size);
+    } else {
+      BC.MIB->createSaveToStack(NewInst, BC.MIB->getStackPointer(),
+                                FIE.StackOffset - SPVal, FIE.RegOrImm,
+                                FIE.Size);
     }
     if (CreatePushOrPop)
       BC.MIB->changeToPushOrPop(NewInst);
@@ -1675,19 +1668,12 @@ Expected<MCInst> ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
          FPVal != StackPointerTracking::EMPTY);
 
   if (FIE.IsLoad) {
-    if (!BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getFramePointer(),
-                                        FIE.StackOffset - FPVal, FIE.RegOrImm,
-                                        FIE.Size)) {
-      return createFatalBOLTError(
-          "createRestoreFromStack: not supported on this platform\n");
-    }
-  } else {
-    if (!BC.MIB->createSaveToStack(NewInst, BC.MIB->getFramePointer(),
+    BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getFramePointer(),
                                    FIE.StackOffset - FPVal, FIE.RegOrImm,
-                                   FIE.Size)) {
-      return createFatalBOLTError(
-          "createSaveToStack: not supported on this platform\n");
-    }
+                                   FIE.Size);
+  } else {
+    BC.MIB->createSaveToStack(NewInst, BC.MIB->getFramePointer(),
+                              FIE.StackOffset - FPVal, FIE.RegOrImm, FIE.Size);
   }
   return NewInst;
 }
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 631ccaec6ae61..016962ff34d8d 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -705,6 +705,9 @@ void assignProfile(BinaryFunction &BF,
 
 bool YAMLProfileReader::inferStaleProfile(
     BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
+  if (!BF.hasCFG())
+    return false;
+
   LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                     << "\"" << BF.getPrintName() << "\"\n");
 
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 85c2397dcc5b2..8dc7b90a6e307 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -283,10 +283,9 @@ class DIEStreamer : public DwarfStreamer {
   DIEStreamer(DIEBuilder *DIEBldr, DWARFRewriter &Rewriter,
               DWARFLinkerBase::OutputFileType OutFileType,
               raw_pwrite_stream &OutFile,
-              std::function<StringRef(StringRef Input)> Translator,
               DWARFLinkerBase::MessageHandlerTy Warning)
-      : DwarfStreamer(OutFileType, OutFile, Translator, Warning),
-        DIEBldr(DIEBldr), Rewriter(Rewriter){};
+      : DwarfStreamer(OutFileType, OutFile, Warning), DIEBldr(DIEBldr),
+        Rewriter(Rewriter){};
 
   using DwarfStreamer::emitCompileUnitHeader;
 
@@ -469,7 +468,6 @@ createDIEStreamer(const Triple &TheTriple, raw_pwrite_stream &OutFile,
 
   std::unique_ptr<DIEStreamer> Streamer = std::make_unique<DIEStreamer>(
       &DIEBldr, Rewriter, DWARFLinkerBase::OutputFileType::Object, OutFile,
-      [](StringRef Input) -> StringRef { return Input; },
       [&](const Twine &Warning, StringRef Context, const DWARFDie *) {});
   Error Err = Streamer->init(TheTriple, Swift5ReflectionSegmentName);
   if (Err)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 9efb428a6e1ba..0ae9d3668b93b 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -1026,7 +1026,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return Code;
   }
 
-  bool createTailCall(MCInst &Inst, const MCSymbol *Target,
+  void createTailCall(MCInst &Inst, const MCSymbol *Target,
                       MCContext *Ctx) override {
     return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true);
   }
@@ -1036,11 +1036,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     createShortJmp(Seq, Target, Ctx, /*IsTailCall*/ true);
   }
 
-  bool createTrap(MCInst &Inst) const override {
+  void createTrap(MCInst &Inst) const override {
     Inst.clear();
     Inst.setOpcode(AArch64::BRK);
     Inst.addOperand(MCOperand::createImm(1));
-    return true;
   }
 
   bool convertJmpToTailCall(MCInst &Inst) override {
@@ -1068,16 +1067,15 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
            Inst.getOperand(0).getImm() == 0;
   }
 
-  bool createNoop(MCInst &Inst) const override {
+  void createNoop(MCInst &Inst) const override {
     Inst.setOpcode(AArch64::HINT);
     Inst.clear();
     Inst.addOperand(MCOperand::createImm(0));
-    return true;
   }
 
   bool mayStore(const MCInst &Inst) const override { return false; }
 
-  bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
+  void createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
                         bool IsTailCall) override {
     Inst.setOpcode(IsTailCall ? AArch64::B : AArch64::BL);
     Inst.clear();
@@ -1086,7 +1084,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
         *Ctx, 0)));
     if (IsTailCall)
       convertJmpToTailCall(Inst);
-    return true;
   }
 
   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
@@ -1293,14 +1290,13 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
-  bool createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
+  void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
                           MCContext *Ctx) const override {
     Inst.setOpcode(AArch64::B);
     Inst.clear();
     Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
         Inst, MCSymbolRefExpr::create(TBB, MCSymbolRefExpr::VK_None, *Ctx),
         *Ctx, 0)));
-    return true;
   }
 
   bool shouldRecordCodeRelocation(uint64_t RelType) const override {
@@ -1353,14 +1349,13 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return StringRef("\0\0\0\0", 4);
   }
 
-  bool createReturn(MCInst &Inst) const override {
+  void createReturn(MCInst &Inst) const override {
     Inst.setOpcode(AArch64::RET);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(AArch64::LR));
-    return true;
   }
 
-  bool createStackPointerIncrement(
+  void createStackPointerIncrement(
       MCInst &Inst, int Size,
       bool NoFlagsClobber = false /*unused for AArch64*/) const override {
     Inst.setOpcode(AArch64::SUBXri);
@@ -1369,10 +1364,9 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Inst.addOperand(MCOperand::createReg(AArch64::SP));
     Inst.addOperand(MCOperand::createImm(Size));
     Inst.addOperand(MCOperand::createImm(0));
-    return true;
   }
 
-  bool createStackPointerDecrement(
+  void createStackPointerDecrement(
       MCInst &Inst, int Size,
       bool NoFlagsClobber = false /*unused for AArch64*/) const override {
     Inst.setOpcode(AArch64::ADDXri);
@@ -1381,12 +1375,12 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Inst.addOperand(MCOperand::createReg(AArch64::SP));
     Inst.addOperand(MCOperand::createImm(Size));
     Inst.addOperand(MCOperand::createImm(0));
-    return true;
   }
 
   void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg,
                             int64_t Disp) const {
     Inst.setOpcode(AArch64::BR);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(MemBaseReg));
   }
 
diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index e19c070b08173..ab9623d5c51b2 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -219,46 +219,43 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
-  bool createReturn(MCInst &Inst) const override {
+  void createReturn(MCInst &Inst) const override {
     // TODO "c.jr ra" when RVC is enabled
     Inst.setOpcode(RISCV::JALR);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(RISCV::X0));
     Inst.addOperand(MCOperand::createReg(RISCV::X1));
     Inst.addOperand(MCOperand::createImm(0));
-    return true;
   }
 
-  bool createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
+  void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
                           MCContext *Ctx) const override {
     Inst.setOpcode(RISCV::JAL);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(RISCV::X0));
     Inst.addOperand(MCOperand::createExpr(
         MCSymbolRefExpr::create(TBB, MCSymbolRefExpr::VK_None, *Ctx)));
-    return true;
   }
 
   StringRef getTrapFillValue() const override {
     return StringRef("\0\0\0\0", 4);
   }
 
-  bool createCall(unsigned Opcode, MCInst &Inst, const MCSymbol *Target,
+  void createCall(unsigned Opcode, MCInst &Inst, const MCSymbol *Target,
                   MCContext *Ctx) {
     Inst.setOpcode(Opcode);
     Inst.clear();
     Inst.addOperand(MCOperand::createExpr(RISCVMCExpr::create(
         MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
         RISCVMCExpr::VK_RISCV_CALL, *Ctx)));
-    return true;
   }
 
-  bool createCall(MCInst &Inst, const MCSymbol *Target,
+  void createCall(MCInst &Inst, const MCSymbol *Target,
                   MCContext *Ctx) override {
     return createCall(RISCV::PseudoCALL, Inst, Target, Ctx);
   }
 
-  bool createTailCall(MCInst &Inst, const MCSymbol *Target,
+  void createTailCall(MCInst &Inst, const MCSymbol *Target,
                       MCContext *Ctx) override {
     return createCall(RISCV::PseudoTAIL, Inst, Target, Ctx);
   }
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 387cf481ba304..de55fbe51764d 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -2230,7 +2230,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
-  bool createStackPointerIncrement(MCInst &Inst, int Size,
+  void createStackPointerIncrement(MCInst &Inst, int Size,
                                    bool NoFlagsClobber) const override {
     if (NoFlagsClobber) {
       Inst.setOpcode(X86::LEA64r);
@@ -2241,17 +2241,16 @@ class X86MCPlusBuilder : public MCPlusBuilder {
       Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
       Inst.addOperand(MCOperand::createImm(-Size));           // Displacement
       Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
-      return true;
+      return;
     }
     Inst.setOpcode(X86::SUB64ri8);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(X86::RSP));
     Inst.addOperand(MCOperand::createReg(X86::RSP));
     Inst.addOperand(MCOperand::createImm(Size));
-    return true;
   }
 
-  bool createStackPointerDecrement(MCInst &Inst, int Size,
+  void createStackPointerDecrement(MCInst &Inst, int Size,
                                    bool NoFlagsClobber) const override {
     if (NoFlagsClobber) {
       Inst.setOpcode(X86::LEA64r);
@@ -2262,22 +2261,22 @@ class X86MCPlusBuilder : public MCPlusBuilder {
       Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
       Inst.addOperand(MCOperand::createImm(Size));            // Displacement
       Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
-      return true;
+      return;
     }
     Inst.setOpcode(X86::ADD64ri8);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(X86::RSP));
     Inst.addOperand(MCOperand::createReg(X86::RSP));
     Inst.addOperand(MCOperand::createImm(Size));
-    return true;
   }
 
-  bool createSaveToStack(MCInst &Inst, const MCPhysReg &StackReg, int Offset,
+  void createSaveToStack(MCInst &Inst, const MCPhysReg &StackReg, int Offset,
                          const MCPhysReg &SrcReg, int Size) const override {
     unsigned NewOpcode;
     switch (Size) {
     default:
-      return false;
+      llvm_unreachable("Invalid operand size");
+      return;
     case 2:      NewOpcode = X86::MOV16mr; break;
     case 4:      NewOpcode = X86::MOV32mr; break;
     case 8:      NewOpcode = X86::MOV64mr; break;
@@ -2290,10 +2289,9 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     Inst.addOperand(MCOperand::createImm(Offset));          // Displacement
     Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
     Inst.addOperand(MCOperand::createReg(SrcReg));
-    return true;
   }
 
-  bool createRestoreFromStack(MCInst &Inst, const MCPhysReg &StackReg,
+  void createRestoreFromStack(MCInst &Inst, const MCPhysReg &StackReg,
                               int Offset, const MCPhysReg &DstReg,
                               int Size) const override {
     return createLoad(Inst, StackReg, /*Scale=*/1, /*IndexReg=*/X86::NoRegister,
@@ -2301,14 +2299,15 @@ class X86MCPlusBuilder : public MCPlusBuilder {
                       DstReg, Size);
   }
 
-  bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
+  void createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
                   const MCPhysReg &IndexReg, int64_t Offset,
                   const MCExpr *OffsetExpr, const MCPhysReg &AddrSegmentReg,
                   const MCPhysReg &DstReg, int Size) const override {
     unsigned NewOpcode;
     switch (Size) {
     default:
-      return false;
+      llvm_unreachable("Invalid operand size");
+      return;
     case 2:      NewOpcode = X86::MOV16rm; break;
     case 4:      NewOpcode = X86::MOV32rm; break;
     case 8:      NewOpcode = X86::MOV64rm; break;
@@ -2324,7 +2323,6 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     else
       Inst.addOperand(MCOperand::createImm(Offset)); // Displacement
     Inst.addOperand(MCOperand::createReg(AddrSegmentReg)); // AddrSegmentReg
-    return true;
   }
 
   InstructionListType createLoadImmediate(const MCPhysReg Dest,
@@ -2338,7 +2336,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return Insts;
   }
 
-  bool createIJmp32Frag(SmallVectorImpl<MCInst> &Insts,
+  void createIJmp32Frag(SmallVectorImpl<MCInst> &Insts,
                         const MCOperand &BaseReg, const MCOperand &Scale,
                         const MCOperand &IndexReg, const MCOperand &Offset,
                         const MCOperand &TmpReg) const override {
@@ -2362,17 +2360,16 @@ class X86MCPlusBuilder : public MCPlusBuilder {
 
     Insts.push_back(Load);
     Insts.push_back(IJmp);
-    return true;
   }
 
-  bool createNoop(MCInst &Inst) const override {
+  void createNoop(MCInst &Inst) const override {
     Inst.setOpcode(X86::NOOP);
-    return true;
+    Inst.clear();
   }
 
-  bool createReturn(MCInst &Inst) const override {
+  void createReturn(MCInst &Inst) const override {
     Inst.setOpcode(X86::RET64);
-    return true;
+    Inst.clear();
   }
 
   InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
@@ -2729,23 +2726,31 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return FoundOne;
   }
 
-  bool createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
+  void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
                           MCContext *Ctx) const override {
     Inst.setOpcode(X86::JMP_1);
+    Inst.clear();
     Inst.addOperand(MCOperand::createExpr(
         MCSymbolRefExpr::create(TBB, MCSymbolRefExpr::VK_None, *Ctx)));
-    return true;
   }
 
-  bool createCall(MCInst &Inst, const MCSymbol *Target,
+  void createLongUncondBranch(MCInst &Inst, const MCSymbol *Target,
+                              MCContext *Ctx) const override {
+    Inst.setOpcode(X86::JMP_4);
+    Inst.clear();
+    Inst.addOperand(MCOperand::createExpr(
+        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
+  }
+
+  void createCall(MCInst &Inst, const MCSymbol *Target,
                   MCContext *Ctx) override {
     Inst.setOpcode(X86::CALL64pcrel32);
+    Inst.clear();
     Inst.addOperand(MCOperand::createExpr(
         MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
-    return true;
   }
 
-  bool createTailCall(MCInst &Inst, const MCSymbol *Target,
+  void createTailCall(MCInst &Inst, const MCSymbol *Target,
                       MCContext *Ctx) override {
     return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true);
   }
@@ -2757,10 +2762,18 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     createDirectCall(Seq.back(), Target, Ctx, /*IsTailCall*/ true);
   }
 
-  bool createTrap(MCInst &Inst) const override {
+  void createTrap(MCInst &Inst) const override {
     Inst.clear();
     Inst.setOpcode(X86::TRAP);
-    return true;
+  }
+
+  void createCondBranch(MCInst &Inst, const MCSymbol *Target, unsigned CC,
+                        MCContext *Ctx) const override {
+    Inst.setOpcode(X86::JCC_1);
+    Inst.clear();
+    Inst.addOperand(MCOperand::createExpr(
+        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
+    Inst.addOperand(MCOperand::createImm(CC));
   }
 
   bool reverseBranchCondition(MCInst &Inst, const MCSymbol *TBB,
@@ -2862,7 +2875,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     Inst.setOpcode(X86::LFENCE);
   }
 
-  bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
+  void createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
                         bool IsTailCall) override {
     Inst.clear();
     Inst.setOpcode(IsTailCall ? X86::JMP_4 : X86::CALL64pcrel32);
@@ -2870,7 +2883,6 @@ class X86MCPlusBuilder : public MCPlusBuilder {
         MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
     if (IsTailCall)
       setTailCall(Inst);
-    return true;
   }
 
   void createShortJmp(InstructionListType &Seq, const MCSymbol *Target,
@@ -3066,6 +3078,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
   void createSwap(MCInst &Inst, MCPhysReg Source, MCPhysReg MemBaseReg,
                   int64_t Disp) const {
     Inst.setOpcode(X86::XCHG64rm);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(Source));
     Inst.addOperand(MCOperand::createReg(Source));
     Inst.addOperand(MCOperand::createReg(MemBaseReg));      // BaseReg
@@ -3078,6 +3091,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
   void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg,
                             int64_t Disp) const {
     Inst.setOpcode(X86::JMP64m);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(MemBaseReg));      // BaseReg
     Inst.addOperand(MCOperand::createImm(1));               // ScaleAmt
     Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
@@ -3540,9 +3554,10 @@ class X86MCPlusBuilder : public MCPlusBuilder {
   }
 
 private:
-  bool createMove(MCInst &Inst, const MCSymbol *Src, unsigned Reg,
+  void createMove(MCInst &Inst, const MCSymbol *Src, unsigned Reg,
                   MCContext *Ctx) const {
     Inst.setOpcode(X86::MOV64rm);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(Reg));
     Inst.addOperand(MCOperand::createReg(X86::RIP));        // BaseReg
     Inst.addOperand(MCOperand::createImm(1));               // ScaleAmt
@@ -3551,13 +3566,12 @@ class X86MCPlusBuilder : public MCPlusBuilder {
         MCSymbolRefExpr::create(Src, MCSymbolRefExpr::VK_None,
                                 *Ctx)));                    // Displacement
     Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
-
-    return true;
   }
 
-  bool createLea(MCInst &Inst, const MCSymbol *Src, unsigned Reg,
+  void createLea(MCInst &Inst, const MCSymbol *Src, unsigned Reg,
                  MCContext *Ctx) const {
     Inst.setOpcode(X86::LEA64r);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(Reg));
     Inst.addOperand(MCOperand::createReg(X86::RIP));        // BaseReg
     Inst.addOperand(MCOperand::createImm(1));               // ScaleAmt
@@ -3566,7 +3580,6 @@ class X86MCPlusBuilder : public MCPlusBuilder {
         MCSymbolRefExpr::create(Src, MCSymbolRefExpr::VK_None,
                                 *Ctx)));                    // Displacement
     Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
-    return true;
   }
 };
 
diff --git a/bolt/test/X86/dwarf5-debug-names-dw-at-specification.s b/bolt/test/X86/dwarf5-debug-names-dw-at-specification.s
new file mode 100644
index 0000000000000..ec5b425625ce9
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names-dw-at-specification.s
@@ -0,0 +1,694 @@
+# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s   -o %tmain.o
+# RUN: %clang %cflags -gdwarf-5 %tmain.o -o %tmain.exe
+# RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections
+# RUN: llvm-dwarfdump --debug-info -r 0 --debug-names %tmain.exe.bolt > %tlog.txt
+# RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s
+
+## Tests that BOLT correctly generates entries in .debug_names with DW_AT_specification.
+
+# BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+# BOLT:       Name Index @ 0x0
+# BOLT-NEXT:    Header {
+# BOLT-NEXT:      Length: 0x10F
+# BOLT-NEXT:      Format: DWARF32
+# BOLT-NEXT:      Version: 5
+# BOLT-NEXT:      CU count: 1
+# BOLT-NEXT:      Local TU count: 0
+# BOLT-NEXT:      Foreign TU count: 0
+# BOLT-NEXT:      Bucket count: 9
+# BOLT-NEXT:      Name count: 9
+# BOLT-NEXT:      Abbreviations table size: 0x21
+# BOLT-NEXT:      Augmentation: 'BOLT'
+# BOLT-NEXT:    }
+# BOLT-NEXT:    Compilation Unit offsets [
+# BOLT-NEXT:      CU[0]: [[OFFSET1]]
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Abbreviations [
+# BOLT-NEXT:      Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+# BOLT-NEXT:        Tag: DW_TAG_variable
+# BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+# BOLT-NEXT:        DW_IDX_parent: DW_FORM_flag_present
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+# BOLT-NEXT:        Tag: DW_TAG_structure_type
+# BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+# BOLT-NEXT:        DW_IDX_parent: DW_FORM_flag_present
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+# BOLT-NEXT:        Tag: DW_TAG_base_type
+# BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+# BOLT-NEXT:        DW_IDX_parent: DW_FORM_flag_present
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+# BOLT-NEXT:        Tag: DW_TAG_subprogram
+# BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+# BOLT-NEXT:        DW_IDX_parent: DW_FORM_flag_present
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 0 [
+# BOLT-NEXT:      Name 1 {
+# BOLT-NEXT:        Hash: 0x5D3CA9E0
+# BOLT-NEXT:        String: {{.+}} "_ZN1A15fully_specifiedE"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV1]]
+# BOLT-NEXT:          Tag: DW_TAG_variable
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000024
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Name 2 {
+# BOLT-NEXT:        Hash: 0x7C9DFC37
+# BOLT-NEXT:        String: {{.+}} "smem"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV1]]
+# BOLT-NEXT:          Tag: DW_TAG_variable
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000057
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 1 [
+# BOLT-NEXT:      Name 3 {
+# BOLT-NEXT:        Hash: 0x2B606
+# BOLT-NEXT:        String: {{.+}} "A"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV2]]
+# BOLT-NEXT:          Tag: DW_TAG_structure_type
+# BOLT-NEXT:          DW_IDX_die_offset: 0x0000002d
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 2 [
+# BOLT-NEXT:      Name 4 {
+# BOLT-NEXT:        Hash: 0xB888030
+# BOLT-NEXT:        String: {{.+}} "int"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV3]]
+# BOLT-NEXT:          Tag: DW_TAG_base_type
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000044
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 3 [
+# BOLT-NEXT:      EMPTY
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 4 [
+# BOLT-NEXT:      EMPTY
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 5 [
+# BOLT-NEXT:      EMPTY
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 6 [
+# BOLT-NEXT:      EMPTY
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 7 [
+# BOLT-NEXT:      Name 5 {
+# BOLT-NEXT:        Hash: 0x65788E1C
+# BOLT-NEXT:        String: {{.+}} "fully_specified"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV1]]
+# BOLT-NEXT:          Tag: DW_TAG_variable
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000024
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Name 6 {
+# BOLT-NEXT:        Hash: 0x7C9A7F6A
+# BOLT-NEXT:        String: {{.+}} "main"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV4]]
+# BOLT-NEXT:          Tag: DW_TAG_subprogram
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000070
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 8 [
+# BOLT-NEXT:      Name 7 {
+# BOLT-NEXT:        Hash: 0xCEF4CFB
+# BOLT-NEXT:        String: {{.+}} "__ARRAY_SIZE_TYPE__"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV3]]
+# BOLT-NEXT:          Tag: DW_TAG_base_type
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000053
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Name 8 {
+# BOLT-NEXT:        Hash: 0x48684B69
+# BOLT-NEXT:        String: {{.+}} "_ZN1A4smemE"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV1]]
+# BOLT-NEXT:          Tag: DW_TAG_variable
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000057
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Name 9 {
+# BOLT-NEXT:        Hash: 0x7C952063
+# BOLT-NEXT:        String: {{.+}} "char"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV3]]
+# BOLT-NEXT:          Tag: DW_TAG_base_type
+# BOLT-NEXT:          DW_IDX_die_offset: 0x0000009e
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:  }
+
+# clang++ main.cpp -O2 -g2 -gdwarf-5 -gpubnames -S
+# struct A {
+#   static int fully_specified;
+#   static int smem[];
+# };
+#
+# int A::fully_specified;
+# int A::smem[] = { 0, 1, 2, 3 };
+# int main(int argc, char *argv[]) {
+#   return 0;
+# }
+	.text
+	.file	"main.cpp"
+	.file	0 "/specification" "main.cpp" md5 0x6c1b1c014d300f2e0efd26584acae1a9
+	.globl	main                            # -- Begin function main
+	.p2align	4, 0x90
+	.type	main,@function
+main:                                   # @main
+.Lfunc_begin0:
+	.cfi_startproc
+# %bb.0:                                # %entry
+	#DEBUG_VALUE: main:argc <- $edi
+	#DEBUG_VALUE: main:argv <- $rsi
+	.loc	0 9 3 prologue_end              # main.cpp:9:3
+	xorl	%eax, %eax
+	retq
+.Ltmp0:
+.Lfunc_end0:
+	.size	main, .Lfunc_end0-main
+	.cfi_endproc
+                                        # -- End function
+	.type	_ZN1A15fully_specifiedE,@object # @_ZN1A15fully_specifiedE
+	.bss
+	.globl	_ZN1A15fully_specifiedE
+	.p2align	2, 0x0
+_ZN1A15fully_specifiedE:
+	.long	0                               # 0x0
+	.size	_ZN1A15fully_specifiedE, 4
+
+	.type	_ZN1A4smemE,@object             # @_ZN1A4smemE
+	.data
+	.globl	_ZN1A4smemE
+	.p2align	4, 0x0
+_ZN1A4smemE:
+	.long	0                               # 0x0
+	.long	1                               # 0x1
+	.long	2                               # 0x2
+	.long	3                               # 0x3
+	.size	_ZN1A4smemE, 16
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	71                              # DW_AT_specification
+	.byte	19                              # DW_FORM_ref4
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	110                             # DW_AT_linkage_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	54                              # DW_AT_calling_convention
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	4                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	60                              # DW_AT_declaration
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	5                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	6                               # Abbreviation Code
+	.byte	1                               # DW_TAG_array_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	7                               # Abbreviation Code
+	.byte	33                              # DW_TAG_subrange_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	8                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	9                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	71                              # DW_AT_specification
+	.byte	19                              # DW_FORM_ref4
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	110                             # DW_AT_linkage_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	10                              # Abbreviation Code
+	.byte	33                              # DW_TAG_subrange_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	55                              # DW_AT_count
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	11                              # Abbreviation Code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	64                              # DW_AT_frame_base
+	.byte	24                              # DW_FORM_exprloc
+	.byte	122                             # DW_AT_call_all_calls
+	.byte	25                              # DW_FORM_flag_present
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	12                              # Abbreviation Code
+	.byte	5                               # DW_TAG_formal_parameter
+	.byte	0                               # DW_CHILDREN_no
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	13                              # Abbreviation Code
+	.byte	15                              # DW_TAG_pointer_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	1                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.byte	1                               # Abbrev [1] 0xc:0x96 DW_TAG_compile_unit
+	.byte	0                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	1                               # DW_AT_name
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.byte	2                               # DW_AT_comp_dir
+	.byte	2                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+	.byte	2                               # Abbrev [2] 0x23:0x9 DW_TAG_variable
+	.long	50                              # DW_AT_specification
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	0
+	.byte	8                               # DW_AT_linkage_name
+	.byte	3                               # Abbrev [3] 0x2c:0x17 DW_TAG_structure_type
+	.byte	5                               # DW_AT_calling_convention
+	.byte	7                               # DW_AT_name
+	.byte	1                               # DW_AT_byte_size
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.byte	4                               # Abbrev [4] 0x32:0x8 DW_TAG_variable
+	.byte	3                               # DW_AT_name
+	.long	67                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+                                        # DW_AT_external
+                                        # DW_AT_declaration
+	.byte	4                               # Abbrev [4] 0x3a:0x8 DW_TAG_variable
+	.byte	5                               # DW_AT_name
+	.long	71                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	3                               # DW_AT_decl_line
+                                        # DW_AT_external
+                                        # DW_AT_declaration
+	.byte	0                               # End Of Children Mark
+	.byte	5                               # Abbrev [5] 0x43:0x4 DW_TAG_base_type
+	.byte	4                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	6                               # Abbrev [6] 0x47:0xb DW_TAG_array_type
+	.long	67                              # DW_AT_type
+	.byte	7                               # Abbrev [7] 0x4c:0x5 DW_TAG_subrange_type
+	.long	82                              # DW_AT_type
+	.byte	0                               # End Of Children Mark
+	.byte	8                               # Abbrev [8] 0x52:0x4 DW_TAG_base_type
+	.byte	6                               # DW_AT_name
+	.byte	8                               # DW_AT_byte_size
+	.byte	7                               # DW_AT_encoding
+	.byte	9                               # Abbrev [9] 0x56:0xd DW_TAG_variable
+	.long	58                              # DW_AT_specification
+	.long	99                              # DW_AT_type
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	1
+	.byte	9                               # DW_AT_linkage_name
+	.byte	6                               # Abbrev [6] 0x63:0xc DW_TAG_array_type
+	.long	67                              # DW_AT_type
+	.byte	10                              # Abbrev [10] 0x68:0x6 DW_TAG_subrange_type
+	.long	82                              # DW_AT_type
+	.byte	4                               # DW_AT_count
+	.byte	0                               # End Of Children Mark
+	.byte	11                              # Abbrev [11] 0x6f:0x24 DW_TAG_subprogram
+	.byte	2                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.byte	1                               # DW_AT_frame_base
+	.byte	87
+                                        # DW_AT_call_all_calls
+	.byte	10                              # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	8                               # DW_AT_decl_line
+	.long	67                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	12                              # Abbrev [12] 0x7e:0xa DW_TAG_formal_parameter
+	.byte	1                               # DW_AT_location
+	.byte	85
+	.byte	11                              # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	8                               # DW_AT_decl_line
+	.long	67                              # DW_AT_type
+	.byte	12                              # Abbrev [12] 0x88:0xa DW_TAG_formal_parameter
+	.byte	1                               # DW_AT_location
+	.byte	84
+	.byte	12                              # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	8                               # DW_AT_decl_line
+	.long	147                             # DW_AT_type
+	.byte	0                               # End Of Children Mark
+	.byte	13                              # Abbrev [13] 0x93:0x5 DW_TAG_pointer_type
+	.long	152                             # DW_AT_type
+	.byte	13                              # Abbrev [13] 0x98:0x5 DW_TAG_pointer_type
+	.long	157                             # DW_AT_type
+	.byte	5                               # Abbrev [5] 0x9d:0x4 DW_TAG_base_type
+	.byte	13                              # DW_AT_name
+	.byte	6                               # DW_AT_encoding
+	.byte	1                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.section	.debug_str_offsets,"",@progbits
+	.long	60                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)" # string offset=0
+.Linfo_string1:
+	.asciz	"main.cpp"                      # string offset=104
+.Linfo_string2:
+	.asciz	"/specification" # string offset=113
+.Linfo_string3:
+	.asciz	"A"                             # string offset=165
+.Linfo_string4:
+	.asciz	"fully_specified"               # string offset=167
+.Linfo_string5:
+	.asciz	"int"                           # string offset=183
+.Linfo_string6:
+	.asciz	"smem"                          # string offset=187
+.Linfo_string7:
+	.asciz	"__ARRAY_SIZE_TYPE__"           # string offset=192
+.Linfo_string8:
+	.asciz	"_ZN1A15fully_specifiedE"       # string offset=212
+.Linfo_string9:
+	.asciz	"_ZN1A4smemE"                   # string offset=236
+.Linfo_string10:
+	.asciz	"main"                          # string offset=248
+.Linfo_string11:
+	.asciz	"argc"                          # string offset=253
+.Linfo_string12:
+	.asciz	"argv"                          # string offset=258
+.Linfo_string13:
+	.asciz	"char"                          # string offset=263
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Linfo_string0
+	.long	.Linfo_string1
+	.long	.Linfo_string2
+	.long	.Linfo_string4
+	.long	.Linfo_string5
+	.long	.Linfo_string6
+	.long	.Linfo_string7
+	.long	.Linfo_string3
+	.long	.Linfo_string8
+	.long	.Linfo_string9
+	.long	.Linfo_string10
+	.long	.Linfo_string11
+	.long	.Linfo_string12
+	.long	.Linfo_string13
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	_ZN1A15fully_specifiedE
+	.quad	_ZN1A4smemE
+	.quad	.Lfunc_begin0
+.Ldebug_addr_end0:
+	.section	.debug_names,"",@progbits
+	.long	.Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+	.short	5                               # Header: version
+	.short	0                               # Header: padding
+	.long	1                               # Header: compilation unit count
+	.long	0                               # Header: local type unit count
+	.long	0                               # Header: foreign type unit count
+	.long	9                               # Header: bucket count
+	.long	9                               # Header: name count
+	.long	.Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+	.long	8                               # Header: augmentation string size
+	.ascii	"LLVM0700"                      # Header: augmentation string
+	.long	.Lcu_begin0                     # Compilation unit 0
+	.long	1                               # Bucket 0
+	.long	3                               # Bucket 1
+	.long	4                               # Bucket 2
+	.long	0                               # Bucket 3
+	.long	0                               # Bucket 4
+	.long	0                               # Bucket 5
+	.long	0                               # Bucket 6
+	.long	5                               # Bucket 7
+	.long	7                               # Bucket 8
+	.long	1564256736                      # Hash in Bucket 0
+	.long	2090728503                      # Hash in Bucket 0
+	.long	177670                          # Hash in Bucket 1
+	.long	193495088                       # Hash in Bucket 2
+	.long	1702399516                      # Hash in Bucket 7
+	.long	2090499946                      # Hash in Bucket 7
+	.long	217009403                       # Hash in Bucket 8
+	.long	1214794601                      # Hash in Bucket 8
+	.long	2090147939                      # Hash in Bucket 8
+	.long	.Linfo_string8                  # String in Bucket 0: _ZN1A15fully_specifiedE
+	.long	.Linfo_string6                  # String in Bucket 0: smem
+	.long	.Linfo_string3                  # String in Bucket 1: A
+	.long	.Linfo_string5                  # String in Bucket 2: int
+	.long	.Linfo_string4                  # String in Bucket 7: fully_specified
+	.long	.Linfo_string10                 # String in Bucket 7: main
+	.long	.Linfo_string7                  # String in Bucket 8: __ARRAY_SIZE_TYPE__
+	.long	.Linfo_string9                  # String in Bucket 8: _ZN1A4smemE
+	.long	.Linfo_string13                 # String in Bucket 8: char
+	.long	.Lnames4-.Lnames_entries0       # Offset in Bucket 0
+	.long	.Lnames5-.Lnames_entries0       # Offset in Bucket 0
+	.long	.Lnames0-.Lnames_entries0       # Offset in Bucket 1
+	.long	.Lnames1-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames3-.Lnames_entries0       # Offset in Bucket 7
+	.long	.Lnames7-.Lnames_entries0       # Offset in Bucket 7
+	.long	.Lnames2-.Lnames_entries0       # Offset in Bucket 8
+	.long	.Lnames6-.Lnames_entries0       # Offset in Bucket 8
+	.long	.Lnames8-.Lnames_entries0       # Offset in Bucket 8
+.Lnames_abbrev_start0:
+	.byte	1                               # Abbrev code
+	.byte	52                              # DW_TAG_variable
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	2                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	3                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	4                               # Abbrev code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames4:
+.L3:
+	.byte	1                               # Abbreviation code
+	.long	35                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: _ZN1A15fully_specifiedE
+.Lnames5:
+.L4:
+	.byte	1                               # Abbreviation code
+	.long	86                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: smem
+.Lnames0:
+.L6:
+	.byte	2                               # Abbreviation code
+	.long	44                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: A
+.Lnames1:
+.L5:
+	.byte	3                               # Abbreviation code
+	.long	67                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: int
+.Lnames3:
+	.byte	1                               # Abbreviation code
+	.long	35                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: fully_specified
+.Lnames7:
+.L0:
+	.byte	4                               # Abbreviation code
+	.long	111                             # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: main
+.Lnames2:
+.L2:
+	.byte	3                               # Abbreviation code
+	.long	82                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: __ARRAY_SIZE_TYPE__
+.Lnames6:
+	.byte	1                               # Abbreviation code
+	.long	86                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: _ZN1A4smemE
+.Lnames8:
+.L1:
+	.byte	3                               # Abbreviation code
+	.long	157                             # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: char
+	.p2align	2, 0x0
+.Lnames_end0:
+	.ident	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test b/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test
index b8789a6ad2300..b7d8fda63315f 100644
--- a/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test
+++ b/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test
@@ -14,15 +14,15 @@
 ; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x103
+; BOLT-NEXT:     Length: 0x155
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 2
 ; BOLT-NEXT:     Local TU count: 0
 ; BOLT-NEXT:     Foreign TU count: 0
-; BOLT-NEXT:     Bucket count: 8
-; BOLT-NEXT:     Name count: 8
-; BOLT-NEXT:     Abbreviations table size: 0x19
+; BOLT-NEXT:     Bucket count: 10
+; BOLT-NEXT:     Name count: 10
+; BOLT-NEXT:     Abbreviations table size: 0x29
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -31,124 +31,183 @@
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Abbreviations [
 ; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
-; BOLT-NEXT:       Tag: DW_TAG_base_type
+; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
-; BOLT-NEXT:       Tag: DW_TAG_subprogram
-; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:          Tag: DW_TAG_inlined_subroutine
+; BOLT-NEXT:          DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:          DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:          DW_IDX_parent: DW_FORM_ref4
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_base_type
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
-; BOLT-NEXT:     Name 1 {
-; BOLT-NEXT:       Hash: 0xB888030
-; BOLT-NEXT:       String: {{.+}} "int"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
-; BOLT-NEXT:         Tag: DW_TAG_base_type
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT:       Name 1 {
+; BOLT-NEXT:         Hash: 0x2124BB36
+; BOLT-NEXT:         String: {{.+}} "useFoo"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000037
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV2]]
+; BOLT-NEXT:           Tag: DW_TAG_inlined_subroutine
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000081
+; BOLT-NEXT:           DW_IDX_parent: Entry @ 0x14b
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
-; BOLT-NEXT:         Tag: DW_TAG_base_type
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x0000007f
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 1 [
+; BOLT-NEXT:       EMPTY
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 2 [
+; BOLT-NEXT:       Name 2 {
+; BOLT-NEXT:         Hash: 0xFDE4B5D2
+; BOLT-NEXT:         String: {{.+}} "fooVar"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV3]]
+; BOLT-NEXT:           Tag: DW_TAG_variable
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 1 [
-; BOLT-NEXT:     Name 2 {
-; BOLT-NEXT:       Hash: 0xB887389
-; BOLT-NEXT:       String: {{.+}} "foo"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 3 [
+; BOLT-NEXT:       EMPTY
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 4 [
+; BOLT-NEXT:       EMPTY
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 5 [
+; BOLT-NEXT:       EMPTY
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 6 [
+; BOLT-NEXT:       Name 3 {
+; BOLT-NEXT:         Hash: 0xB88B3D2
+; BOLT-NEXT:         String: {{.+}} "use"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:     Name 3 {
-; BOLT-NEXT:       Hash: 0x8C06E589
-; BOLT-NEXT:       String: {{.+}} "_Z3usePiS_"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:       Name 4 {
+; BOLT-NEXT:         Hash: 0x69A7307E
+; BOLT-NEXT:         String: {{.+}} "_Z6useFooPi"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000037
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV2]]
+; BOLT-NEXT:           Tag: DW_TAG_inlined_subroutine
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000081
+; BOLT-NEXT:           DW_IDX_parent: Entry @ 0x14b
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 2 [
-; BOLT-NEXT:     Name 4 {
-; BOLT-NEXT:       Hash: 0xB88B3D2
-; BOLT-NEXT:       String: {{.+}} "use"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:       Name 5 {
+; BOLT-NEXT:         Hash: 0x7C9A7F6A
+; BOLT-NEXT:         String: {{.+}} "main"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000049
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:     Name 5 {
-; BOLT-NEXT:       Hash: 0x7C9A7F6A
-; BOLT-NEXT:       String: {{.+}} "main"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000049
+; BOLT-NEXT:       Name 6 {
+; BOLT-NEXT:         Hash: 0xB5063CFE
+; BOLT-NEXT:         String: {{.+}} "_Z3fooi"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:     Name 6 {
-; BOLT-NEXT:       Hash: 0xFDE4B5D2
-; BOLT-NEXT:       String: {{.+}} "fooVar"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV3]]
-; BOLT-NEXT:         Tag: DW_TAG_variable
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 7 [
+; BOLT-NEXT:       Name 7 {
+; BOLT-NEXT:         Hash: 0x8C06E589
+; BOLT-NEXT:         String: {{.+}} "_Z3usePiS_"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 3 [
-; BOLT-NEXT:     Name 7 {
-; BOLT-NEXT:       Hash: 0x7C952063
-; BOLT-NEXT:       String: {{.+}} "char"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
-; BOLT-NEXT:         Tag: DW_TAG_base_type
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000092
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 8 [
+; BOLT-NEXT:       Name 8 {
+; BOLT-NEXT:         Hash: 0xB888030
+; BOLT-NEXT:         String: {{.+}} "int"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV4]]
+; BOLT-NEXT:           Tag: DW_TAG_base_type
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
+; BOLT-NEXT:         Entry @ 0x{{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV4]]
+; BOLT-NEXT:           Tag: DW_TAG_base_type
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x0000007f
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 4 [
-; BOLT-NEXT:     EMPTY
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 5 [
-; BOLT-NEXT:     EMPTY
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 6 [
-; BOLT-NEXT:     Name 8 {
-; BOLT-NEXT:       Hash: 0xB5063CFE
-; BOLT-NEXT:       String: {{.+}} "_Z3fooi"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 9 [
+; BOLT-NEXT:       Name 9 {
+; BOLT-NEXT:         Hash: 0xB887389
+; BOLT-NEXT:         String: {{.+}} "foo"
+; BOLT-NEXT:         Entry @ 0x14b {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 7 [
-; BOLT-NEXT:     EMPTY
-; BOLT-NEXT:   ]
-; BOLT-NEXT: }
+; BOLT-NEXT:       Name 10 {
+; BOLT-NEXT:         Hash: 0x7C952063
+; BOLT-NEXT:         String: {{.+}} "char"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV4]]
+; BOLT-NEXT:           Tag: DW_TAG_base_type
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000092
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
+; BOLT-NEXT:       }
+; BOLT-NEXT:     ]
+; BOLT-NEXT:   }
diff --git a/bolt/test/X86/dwarf5-debug-names.test b/bolt/test/X86/dwarf5-debug-names.test
index de0e88d7a36ae..ef11ee0c479b7 100644
--- a/bolt/test/X86/dwarf5-debug-names.test
+++ b/bolt/test/X86/dwarf5-debug-names.test
@@ -11,7 +11,7 @@
 ; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x1C2
+; BOLT-NEXT:     Length: 0x1D4
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 2
@@ -19,7 +19,7 @@
 ; BOLT-NEXT:     Foreign TU count: 0
 ; BOLT-NEXT:     Bucket count: 14
 ; BOLT-NEXT:     Name count: 15
-; BOLT-NEXT:     Abbreviations table size: 0x29
+; BOLT-NEXT:     Abbreviations table size: 0x3D
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -31,27 +31,38 @@
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_namespace
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV5:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
+; BOLT-NEXT:       Abbreviation [[ABBREV6:0x[0-9a-f]*]] {
+; BOLT-NEXT:         Tag: DW_TAG_structure_type
+; BOLT-NEXT:         DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:         DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:         DW_IDX_parent: DW_FORM_ref4
+; BOLT-NEXT:       }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
 ; BOLT-NEXT:     Name 1 {
@@ -62,6 +73,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000002f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -74,6 +86,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000eb
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -81,17 +94,12 @@
 ; BOLT-NEXT:     Name 3 {
 ; BOLT-NEXT:       Hash: 0x8CFC710C
 ; BOLT-NEXT:       String: {{.+}} "(anonymous namespace)"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_namespace
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000061
-; BOLT-NEXT:       }
-; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:       Entry @ [[ENTRY:0x[0-9a-f]*]] {
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_namespace
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000061
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 4 {
@@ -102,6 +110,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -120,6 +129,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000c9
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 6 {
@@ -130,6 +140,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 7 {
@@ -140,12 +151,14 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000009f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV4]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000c5
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -158,6 +171,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 9 {
@@ -168,6 +182,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -180,6 +195,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -192,6 +208,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -207,12 +224,14 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000002f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV4]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005d
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 13 {
@@ -223,12 +242,14 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000078
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000104
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 14 {
@@ -239,6 +260,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000073
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -250,10 +272,11 @@
 ; BOLT-NEXT:       Hash: 0x59796A
 ; BOLT-NEXT:       String: {{.+}} "t1"
 ; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Abbrev: [[ABBREV6]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000062
+; BOLT-NEXT:         DW_IDX_parent: Entry @ [[ENTRY]]
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test b/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test
index 3be327b780e52..b0b6707a4ea5c 100644
--- a/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test
@@ -17,7 +17,7 @@
 ; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x148
+; BOLT-NEXT:     Length: 0x14E
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 2
@@ -25,7 +25,7 @@
 ; BOLT-NEXT:     Foreign TU count: 0
 ; BOLT-NEXT:     Bucket count: 11
 ; BOLT-NEXT:     Name count: 11
-; BOLT-NEXT:     Abbreviations table size: 0x19
+; BOLT-NEXT:     Abbreviations table size: 0x1F
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -37,16 +37,19 @@
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -58,6 +61,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -68,6 +72,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000008c
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -80,6 +85,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 4 {
@@ -90,6 +96,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -102,6 +109,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -114,6 +122,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -129,6 +138,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -141,12 +151,14 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -159,6 +171,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 10 {
@@ -169,6 +182,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000057
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -187,6 +201,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-df-debug-names.test b/bolt/test/X86/dwarf5-df-debug-names.test
index 0352d0ff72082..d53f2bd13620f 100644
--- a/bolt/test/X86/dwarf5-df-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-debug-names.test
@@ -14,136 +14,148 @@
 
 ; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
 ; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
-: BOLT:       Name Index @ 0x0 {
-: BOLT-NEXT:   Header {
-: BOLT-NEXT:     Length: 0xF4
-: BOLT-NEXT:     Format: DWARF32
-: BOLT-NEXT:     Version: 5
-: BOLT-NEXT:     CU count: 2
-: BOLT-NEXT:     Local TU count: 0
-: BOLT-NEXT:     Foreign TU count: 0
-: BOLT-NEXT:     Bucket count: 7
-: BOLT-NEXT:     Name count: 7
-: BOLT-NEXT:     Abbreviations table size: 0x21
-: BOLT-NEXT:     Augmentation: 'BOLT'
-: BOLT-NEXT:   }
-: BOLT-NEXT:   Compilation Unit offsets [
-: BOLT-NEXT:     CU[0]: [[OFFSET]]
-: BOLT-NEXT:     CU[1]: [[OFFSET1]]
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Abbreviations [
-: BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
-: BOLT-NEXT:       Tag: DW_TAG_structure_type
-: BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-: BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
-: BOLT-NEXT:       Tag: DW_TAG_base_type
-: BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-: BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
-: BOLT-NEXT:       Tag: DW_TAG_variable
-: BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-: BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
-: BOLT-NEXT:       Tag: DW_TAG_subprogram
-: BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-: BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 0 [
-: BOLT-NEXT:     EMPTY
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 1 [
-: BOLT-NEXT:     Name 1 {
-: BOLT-NEXT:       Hash: 0x7C96E4DB
-: BOLT-NEXT:       String: {{.+}} "Foo2"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV1]]
-: BOLT-NEXT:         Tag: DW_TAG_structure_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000068
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 2 [
-: BOLT-NEXT:     Name 2 {
-: BOLT-NEXT:       Hash: 0xBA564846
-: BOLT-NEXT:       String: {{.+}} "Foo2Int"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV1]]
-: BOLT-NEXT:         Tag: DW_TAG_structure_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 3 [
-: BOLT-NEXT:     Name 3 {
-: BOLT-NEXT:       Hash: 0xB888030
-: BOLT-NEXT:       String: {{.+}} "int"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV2]]
-: BOLT-NEXT:         Tag: DW_TAG_base_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000043
-: BOLT-NEXT:       }
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV2]]
-: BOLT-NEXT:         Tag: DW_TAG_base_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Name 4 {
-: BOLT-NEXT:       Hash: 0xF73809C
-: BOLT-NEXT:       String: {{.+}} "Foo2a"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV1]]
-: BOLT-NEXT:         Tag: DW_TAG_structure_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000078
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Name 5 {
-: BOLT-NEXT:       Hash: 0x7C96CB76
-: BOLT-NEXT:       String: {{.+}} "fint"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV3]]
-: BOLT-NEXT:         Tag: DW_TAG_variable
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-: BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Name 6 {
-: BOLT-NEXT:       Hash: 0x7C9A7F6A
-: BOLT-NEXT:       String: {{.+}} "main"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV4]]
-: BOLT-NEXT:         Tag: DW_TAG_subprogram
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 4 [
-: BOLT-NEXT:     EMPTY
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 5 [
-: BOLT-NEXT:     Name 7 {
-: BOLT-NEXT:       Hash: 0x7C952063
-: BOLT-NEXT:       String: {{.+}} "char"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV2]]
-: BOLT-NEXT:         Tag: DW_TAG_base_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 6 [
-: BOLT-NEXT:     EMPTY
-: BOLT-NEXT:   ]
-: BOLT-NEXT: }
+; BOLT:       Name Index @ 0x0 {
+; BOLT-NEXT:   Header {
+; BOLT-NEXT:     Length: 0xFC
+; BOLT-NEXT:     Format: DWARF32
+; BOLT-NEXT:     Version: 5
+; BOLT-NEXT:     CU count: 2
+; BOLT-NEXT:     Local TU count: 0
+; BOLT-NEXT:     Foreign TU count: 0
+; BOLT-NEXT:     Bucket count: 7
+; BOLT-NEXT:     Name count: 7
+; BOLT-NEXT:     Abbreviations table size: 0x29
+; BOLT-NEXT:     Augmentation: 'BOLT'
+; BOLT-NEXT:   }
+; BOLT-NEXT:   Compilation Unit offsets [
+; BOLT-NEXT:     CU[0]: [[OFFSET]]
+; BOLT-NEXT:     CU[1]: [[OFFSET1]]
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Abbreviations [
+; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_structure_type
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_base_type
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_variable
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_subprogram
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 0 [
+; BOLT-NEXT:     EMPTY
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 1 [
+; BOLT-NEXT:     Name 1 {
+; BOLT-NEXT:       Hash: 0x7C96E4DB
+; BOLT-NEXT:       String: {{.+}} "Foo2"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Tag: DW_TAG_structure_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000068
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 2 [
+; BOLT-NEXT:     Name 2 {
+; BOLT-NEXT:       Hash: 0xBA564846
+; BOLT-NEXT:       String: {{.+}} "Foo2Int"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Tag: DW_TAG_structure_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 3 [
+; BOLT-NEXT:     Name 3 {
+; BOLT-NEXT:       Hash: 0xB888030
+; BOLT-NEXT:       String: {{.+}} "int"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV2]]
+; BOLT-NEXT:         Tag: DW_TAG_base_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000043
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV2]]
+; BOLT-NEXT:         Tag: DW_TAG_base_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Name 4 {
+; BOLT-NEXT:       Hash: 0xF73809C
+; BOLT-NEXT:       String: {{.+}} "Foo2a"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Tag: DW_TAG_structure_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000078
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Name 5 {
+; BOLT-NEXT:       Hash: 0x7C96CB76
+; BOLT-NEXT:       String: {{.+}} "fint"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV3]]
+; BOLT-NEXT:         Tag: DW_TAG_variable
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Name 6 {
+; BOLT-NEXT:       Hash: 0x7C9A7F6A
+; BOLT-NEXT:       String: {{.+}} "main"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV4]]
+; BOLT-NEXT:         Tag: DW_TAG_subprogram
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 4 [
+; BOLT-NEXT:     EMPTY
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 5 [
+; BOLT-NEXT:     Name 7 {
+; BOLT-NEXT:       Hash: 0x7C952063
+; BOLT-NEXT:       String: {{.+}} "char"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV2]]
+; BOLT-NEXT:         Tag: DW_TAG_base_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 6 [
+; BOLT-NEXT:     EMPTY
+; BOLT-NEXT:   ]
+; BOLT-NEXT: }
diff --git a/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test b/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
index 8a8a4b118b8c0..3a0260fd0bfb9 100644
--- a/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
+++ b/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
@@ -31,6 +31,7 @@
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: 0x02
 ; BOLT-NEXT:       DW_IDX_die_offset: 0x0000001e
+; BOLT-NEXT:       DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   }
 ; BOLT:        Name 6 {
@@ -42,6 +43,7 @@
 ; BOLT-NEXT:       DW_IDX_type_unit: 0x02
 ; BOLT-NEXT:       DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:       DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:       DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   }
 ; BOLT:        Name 7 {
@@ -52,5 +54,6 @@
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:       DW_IDX_die_offset: 0x00000023
+; BOLT-NEXT:       DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   }
diff --git a/bolt/test/X86/dwarf5-df-one-cu-debug-names.test b/bolt/test/X86/dwarf5-df-one-cu-debug-names.test
index 246ce7efb3bad..bf1cad81040bd 100644
--- a/bolt/test/X86/dwarf5-df-one-cu-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-one-cu-debug-names.test
@@ -13,7 +13,7 @@
 ; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0xA9
+; BOLT-NEXT:     Length: 0xAF
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 1
@@ -21,7 +21,7 @@
 ; BOLT-NEXT:     Foreign TU count: 0
 ; BOLT-NEXT:     Bucket count: 5
 ; BOLT-NEXT:     Name count: 5
-; BOLT-NEXT:     Abbreviations table size: 0x13
+; BOLT-NEXT:     Abbreviations table size: 0x19
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -31,14 +31,17 @@
 ; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -52,6 +55,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000068
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -61,6 +65,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -75,6 +80,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -86,6 +92,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000078
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 5 {
@@ -95,6 +102,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-df-types-debug-names.test b/bolt/test/X86/dwarf5-df-types-debug-names.test
index fdb962b80c751..f5a2c9c10353e 100644
--- a/bolt/test/X86/dwarf5-df-types-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-types-debug-names.test
@@ -25,7 +25,7 @@
 
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x174
+; BOLT-NEXT:     Length: 0x17E
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 2
@@ -33,7 +33,7 @@
 ; BOLT-NEXT:     Foreign TU count: 4
 ; BOLT-NEXT:     Bucket count: 9
 ; BOLT-NEXT:     Name count: 9
-; BOLT-NEXT:     Abbreviations table size: 0x2D
+; BOLT-NEXT:     Abbreviations table size: 0x37
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -52,27 +52,32 @@
 ; BOLT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:         DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -88,6 +93,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -98,6 +104,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 3 {
@@ -108,6 +115,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -120,6 +128,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: 0x5
@@ -127,12 +136,14 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x02
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -145,6 +156,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 6 {
@@ -156,6 +168,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV]]
@@ -163,6 +176,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x03
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 7 {
@@ -174,6 +188,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x02
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -195,6 +210,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -208,6 +224,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000036
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: 0x5
@@ -215,6 +232,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000048
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: 0x5
@@ -222,12 +240,14 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x03
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000048
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test b/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test
index 1c77583f50883..2a489d86f5d7d 100644
--- a/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test
@@ -17,7 +17,7 @@
 ; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0xD1
+; BOLT-NEXT:     Length: 0xD9
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 1
@@ -25,7 +25,7 @@
 ; BOLT-NEXT:     Foreign TU count: 2
 ; BOLT-NEXT:     Bucket count: 5
 ; BOLT-NEXT:     Name count: 5
-; BOLT-NEXT:     Abbreviations table size: 0x1D
+; BOLT-NEXT:     Abbreviations table size: 0x25
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -40,19 +40,23 @@
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -67,6 +71,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -76,6 +81,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -90,6 +96,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -102,6 +109,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 5 {
@@ -112,17 +120,20 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000036
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV4]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000048
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-one-cu-debug-names.test b/bolt/test/X86/dwarf5-one-cu-debug-names.test
index e7754521d61cb..74d6b3041c933 100644
--- a/bolt/test/X86/dwarf5-one-cu-debug-names.test
+++ b/bolt/test/X86/dwarf5-one-cu-debug-names.test
@@ -9,7 +9,7 @@
 ; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x13E
+; BOLT-NEXT:     Length: 0x14F
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 1
@@ -17,7 +17,7 @@
 ; BOLT-NEXT:     Foreign TU count: 0
 ; BOLT-NEXT:     Bucket count: 11
 ; BOLT-NEXT:     Name count: 11
-; BOLT-NEXT:     Abbreviations table size: 0x1F
+; BOLT-NEXT:     Abbreviations table size: 0x31
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -27,22 +27,32 @@
 ; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT:        Tag: DW_TAG_structure_type
+; BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:        DW_IDX_parent: DW_FORM_ref4
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
-; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT:     Abbreviation [[ABBREV5:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
-; BOLT-NEXT:     Abbreviation [[ABBREV5:0x[0-9a-f]*]] {
+; BOLT-NEXT:     Abbreviation [[ABBREV6:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_namespace
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -53,6 +63,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000104
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -62,6 +73,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000c5
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -73,6 +85,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000c9
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 4 {
@@ -82,6 +95,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -93,6 +107,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000eb
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -107,18 +122,20 @@
 ; BOLT-NEXT:       Hash: 0x59796A
 ; BOLT-NEXT:       String: {{.+}} "t1"
 ; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000062
+; BOLT-NEXT:         DW_IDX_parent: Entry @ 0x14d
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 7 {
 ; BOLT-NEXT:       Hash: 0x5979AC
 ; BOLT-NEXT:       String: {{.+}} "v1"
 ; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV3]]
+; BOLT-NEXT:         Abbrev: [[ABBREV4]]
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -130,6 +147,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005d
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -141,15 +159,17 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000002f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 10 {
 ; BOLT-NEXT:       Hash: 0x7C9A7F6A
 ; BOLT-NEXT:       String: {{.+}} "main"
 ; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV4]]
+; BOLT-NEXT:         Abbrev: [[ABBREV5]]
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000073
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -157,15 +177,11 @@
 ; BOLT-NEXT:     Name 11 {
 ; BOLT-NEXT:       Hash: 0x8CFC710C
 ; BOLT-NEXT:       String: {{.+}} "(anonymous namespace)"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV5]]
-; BOLT-NEXT:         Tag: DW_TAG_namespace
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000061
-; BOLT-NEXT:       }
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV5]]
+; BOLT-NEXT:       Entry @ 0x14d {
+; BOLT-NEXT:         Abbrev: [[ABBREV6]]
 ; BOLT-NEXT:         Tag: DW_TAG_namespace
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000061
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-types-debug-names.test b/bolt/test/X86/dwarf5-types-debug-names.test
index 6a26477d4cdb5..94624298e289d 100644
--- a/bolt/test/X86/dwarf5-types-debug-names.test
+++ b/bolt/test/X86/dwarf5-types-debug-names.test
@@ -14,7 +14,7 @@
 
 ; BOLT: Name Index @ 0x0 {
 ; BOLT:   Header {
-; BOLT:     Length: 0xE1
+; BOLT:     Length: 0xE9
 ; BOLT:     Format: DWARF32
 ; BOLT:     Version: 5
 ; BOLT:     CU count: 2
@@ -22,7 +22,7 @@
 ; BOLT:     Foreign TU count: 0
 ; BOLT:     Bucket count: 6
 ; BOLT:     Name count: 6
-; BOLT:     Abbreviations table size: 0x21
+; BOLT:     Abbreviations table size: 0x29
 ; BOLT:     Augmentation: 'BOLT'
 ; BOLT:   }
 ; BOLT:   Compilation Unit offsets [
@@ -37,21 +37,25 @@
 ; BOLT:       Tag: DW_TAG_structure_type
 ; BOLT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT:     }
 ; BOLT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT:       Tag: DW_TAG_subprogram
 ; BOLT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT:     }
 ; BOLT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT:       Tag: DW_TAG_base_type
 ; BOLT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT:     }
 ; BOLT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT:       Tag: DW_TAG_base_type
 ; BOLT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT:     }
 ; BOLT:   ]
 ; BOLT:   Bucket 0 [
@@ -63,6 +67,7 @@
 ; BOLT:         Tag: DW_TAG_structure_type
 ; BOLT:         DW_IDX_type_unit: 0x00
 ; BOLT:         DW_IDX_die_offset: 0x00000023
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -75,6 +80,7 @@
 ; BOLT:         Tag: DW_TAG_subprogram
 ; BOLT:         DW_IDX_compile_unit: 0x01
 ; BOLT:         DW_IDX_die_offset: 0x00000024
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -87,6 +93,7 @@
 ; BOLT:         Tag: DW_TAG_base_type
 ; BOLT:         DW_IDX_compile_unit: 0x01
 ; BOLT:         DW_IDX_die_offset: 0x00000040
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -99,6 +106,7 @@
 ; BOLT:         Tag: DW_TAG_subprogram
 ; BOLT:         DW_IDX_compile_unit: 0x01
 ; BOLT:         DW_IDX_die_offset: 0x00000024
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -111,6 +119,7 @@
 ; BOLT:         Tag: DW_TAG_subprogram
 ; BOLT:         DW_IDX_compile_unit: 0x00
 ; BOLT:         DW_IDX_die_offset: 0x00000024
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -123,6 +132,7 @@
 ; BOLT:         Tag: DW_TAG_base_type
 ; BOLT:         DW_IDX_type_unit: 0x00
 ; BOLT:         DW_IDX_die_offset: 0x00000038
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
diff --git a/bolt/test/X86/dwarf5-types-one-cu-debug-names.test b/bolt/test/X86/dwarf5-types-one-cu-debug-names.test
index 00a1319a55786..02c0ef81351f1 100644
--- a/bolt/test/X86/dwarf5-types-one-cu-debug-names.test
+++ b/bolt/test/X86/dwarf5-types-one-cu-debug-names.test
@@ -11,7 +11,7 @@
 
 ; BOLT:Name Index @ 0x0 {
 ; BOLT-NEXT:  Header {
-; BOLT-NEXT:    Length: 0xA3
+; BOLT-NEXT:    Length: 0xAB
 ; BOLT-NEXT:    Format: DWARF32
 ; BOLT-NEXT:    Version: 5
 ; BOLT-NEXT:    CU count: 1
@@ -19,7 +19,7 @@
 ; BOLT-NEXT:    Foreign TU count: 0
 ; BOLT-NEXT:    Bucket count: 4
 ; BOLT-NEXT:    Name count: 4
-; BOLT-NEXT:    Abbreviations table size: 0x1D
+; BOLT-NEXT:    Abbreviations table size: 0x25
 ; BOLT-NEXT:    Augmentation: 'BOLT'
 ; BOLT-NEXT:  }
 ; BOLT-NEXT:  Compilation Unit offsets [
@@ -32,20 +32,24 @@
 ; BOLT-NEXT:    Abbreviation [[ABBREV:0x[0-9a-f]*]] {
 ; BOLT-NEXT:      Tag: DW_TAG_base_type
 ; BOLT-NEXT:      DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:      DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:    Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT-NEXT:      Tag: DW_TAG_structure_type
 ; BOLT-NEXT:      DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:      DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:      DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:    Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:      Tag: DW_TAG_subprogram
 ; BOLT-NEXT:      DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:      DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:    Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:      Tag: DW_TAG_base_type
 ; BOLT-NEXT:      DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:      DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:      DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:  ]
 ; BOLT-NEXT:  Bucket 0 [
@@ -56,6 +60,7 @@
 ; BOLT-NEXT:        Abbrev: [[ABBREV]]
 ; BOLT-NEXT:        Tag: DW_TAG_base_type
 ; BOLT-NEXT:        DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT:        DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:      }
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:    Name 2 {
@@ -66,6 +71,7 @@
 ; BOLT-NEXT:        Tag: DW_TAG_structure_type
 ; BOLT-NEXT:        DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:        DW_IDX_die_offset: 0x00000023
+; BOLT-NEXT:        DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:      }
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:  ]
@@ -80,6 +86,7 @@
 ; BOLT-NEXT:        Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:        Tag: DW_TAG_subprogram
 ; BOLT-NEXT:        DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT:        DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:      }
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:  ]
@@ -92,6 +99,7 @@
 ; BOLT-NEXT:        Tag: DW_TAG_base_type
 ; BOLT-NEXT:        DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:        DW_IDX_die_offset: 0x00000038
+; BOLT-NEXT:        DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:      }
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:  ]
diff --git a/bolt/test/X86/reader-stale-yaml.test b/bolt/test/X86/reader-stale-yaml.test
index 533ed917c1aeb..f4a8865b1f9a4 100644
--- a/bolt/test/X86/reader-stale-yaml.test
+++ b/bolt/test/X86/reader-stale-yaml.test
@@ -14,6 +14,10 @@ RUN:   --profile-ignore-hash=1 --profile-use-dfs=0 --debug-only=bolt-prof 2>&1 |
 RUN: llvm-bolt %t.exe -o %t.null --b %p/Inputs/blarge_profile_stale.yaml \
 RUN:   --print-cfg --print-only=SolveCubic --infer-stale-profile=1 \
 RUN:   --profile-ignore-hash=1 --profile-use-dfs=0 --debug-only=bolt-prof 2>&1 | FileCheck %s -check-prefix=CHECK2
+# Testing skipped function
+RUN: llvm-bolt %t.exe -o %t.null --b %p/Inputs/blarge_profile_stale.yaml \
+RUN:   --print-cfg --print-only=usqrt --infer-stale-profile=1 --skip-funcs=usqrt \
+RUN:   --profile-ignore-hash=1 --profile-use-dfs=0
 
 CHECK0: BOLT-INFO: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile
 CHECK0: BOLT-WARNING: 2 (100.0% of all profiled) functions have invalid (possibly stale) profile
diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp
index 63448039c53e6..daf9f392b8228 100644
--- a/bolt/unittests/Core/MCPlusBuilder.cpp
+++ b/bolt/unittests/Core/MCPlusBuilder.cpp
@@ -134,9 +134,8 @@ TEST_P(MCPlusBuilderTester, ReplaceRegWithImm) {
 
 TEST_P(MCPlusBuilderTester, Annotation) {
   MCInst Inst;
-  bool Success = BC->MIB->createTailCall(Inst, BC->Ctx->createNamedTempSymbol(),
-                                         BC->Ctx.get());
-  ASSERT_TRUE(Success);
+  BC->MIB->createTailCall(Inst, BC->Ctx->createNamedTempSymbol(),
+                          BC->Ctx.get());
   MCSymbol *LPSymbol = BC->Ctx->createNamedTempSymbol("LP");
   uint64_t Value = INT32_MIN;
   // Test encodeAnnotationImm using this indirect way
@@ -151,9 +150,8 @@ TEST_P(MCPlusBuilderTester, Annotation) {
   // Large int64 should trigger an out of range assertion
   Value = 0x1FF'FFFF'FFFF'FFFFULL;
   Inst.clear();
-  Success = BC->MIB->createTailCall(Inst, BC->Ctx->createNamedTempSymbol(),
-                                    BC->Ctx.get());
-  ASSERT_TRUE(Success);
+  BC->MIB->createTailCall(Inst, BC->Ctx->createNamedTempSymbol(),
+                          BC->Ctx.get());
   ASSERT_DEATH(BC->MIB->addEHInfo(Inst, MCPlus::MCLandingPad(LPSymbol, Value)),
                "annotation value out of range");
 }
diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index e518a64abc52e..2931325d8b579 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -73,6 +73,7 @@
 #include "SuspiciousReallocUsageCheck.h"
 #include "SuspiciousSemicolonCheck.h"
 #include "SuspiciousStringCompareCheck.h"
+#include "SuspiciousStringviewDataUsageCheck.h"
 #include "SwappedArgumentsCheck.h"
 #include "SwitchMissingDefaultCaseCheck.h"
 #include "TerminatingContinueCheck.h"
@@ -218,6 +219,8 @@ class BugproneModule : public ClangTidyModule {
         "bugprone-suspicious-semicolon");
     CheckFactories.registerCheck<SuspiciousStringCompareCheck>(
         "bugprone-suspicious-string-compare");
+    CheckFactories.registerCheck<SuspiciousStringviewDataUsageCheck>(
+        "bugprone-suspicious-stringview-data-usage");
     CheckFactories.registerCheck<SwappedArgumentsCheck>(
         "bugprone-swapped-arguments");
     CheckFactories.registerCheck<TerminatingContinueCheck>(
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index 638fba03a4358..081ba67efe153 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -26,6 +26,7 @@ add_clang_library(clangTidyBugproneModule
   ImplicitWideningOfMultiplicationResultCheck.cpp
   InaccurateEraseCheck.cpp
   IncorrectEnableIfCheck.cpp
+  SuspiciousStringviewDataUsageCheck.cpp
   SwitchMissingDefaultCaseCheck.cpp
   IncDecInConditionsCheck.cpp
   IncorrectRoundingsCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp
new file mode 100644
index 0000000000000..8f4b0c5e0dced
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp
@@ -0,0 +1,97 @@
+//===--- SuspiciousStringviewDataUsageCheck.cpp - clang-tidy --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SuspiciousStringviewDataUsageCheck.h"
+#include "../utils/Matchers.h"
+#include "../utils/OptionsUtils.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+
+using namespace clang::ast_matchers;
+
+namespace clang::tidy::bugprone {
+
+SuspiciousStringviewDataUsageCheck::SuspiciousStringviewDataUsageCheck(
+    StringRef Name, ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      StringViewTypes(utils::options::parseStringList(Options.get(
+          "StringViewTypes", "::std::basic_string_view;::llvm::StringRef"))),
+      AllowedCallees(
+          utils::options::parseStringList(Options.get("AllowedCallees", ""))) {}
+
+void SuspiciousStringviewDataUsageCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "StringViewTypes",
+                utils::options::serializeStringList(StringViewTypes));
+  Options.store(Opts, "AllowedCallees",
+                utils::options::serializeStringList(AllowedCallees));
+}
+
+bool SuspiciousStringviewDataUsageCheck::isLanguageVersionSupported(
+    const LangOptions &LangOpts) const {
+  return LangOpts.CPlusPlus;
+}
+
+std::optional<TraversalKind>
+SuspiciousStringviewDataUsageCheck::getCheckTraversalKind() const {
+  return TK_AsIs;
+}
+
+void SuspiciousStringviewDataUsageCheck::registerMatchers(MatchFinder *Finder) {
+
+  auto AncestorCall = anyOf(
+      cxxConstructExpr(), callExpr(unless(cxxOperatorCallExpr())), lambdaExpr(),
+      initListExpr(
+          hasType(qualType(hasCanonicalType(hasDeclaration(recordDecl()))))));
+
+  auto DataMethod =
+      cxxMethodDecl(hasName("data"),
+                    ofClass(matchers::matchesAnyListedName(StringViewTypes)));
+
+  auto SizeCall = cxxMemberCallExpr(
+      callee(cxxMethodDecl(hasAnyName("size", "length"))),
+      on(ignoringParenImpCasts(
+          matchers::isStatementIdenticalToBoundNode("self"))));
+
+  auto DescendantSizeCall = expr(hasDescendant(
+      expr(SizeCall, hasAncestor(expr(AncestorCall).bind("ancestor-size")),
+           hasAncestor(expr(equalsBoundNode("parent"),
+                            equalsBoundNode("ancestor-size"))))));
+
+  Finder->addMatcher(
+      cxxMemberCallExpr(
+          on(ignoringParenImpCasts(expr().bind("self"))), callee(DataMethod),
+          expr().bind("data-call"),
+          hasParent(expr(anyOf(
+              invocation(
+                  expr().bind("parent"), unless(cxxOperatorCallExpr()),
+                  hasAnyArgument(
+                      ignoringParenImpCasts(equalsBoundNode("data-call"))),
+                  unless(hasAnyArgument(ignoringParenImpCasts(SizeCall))),
+                  unless(hasAnyArgument(DescendantSizeCall)),
+                  hasDeclaration(namedDecl(
+                      unless(matchers::matchesAnyListedName(AllowedCallees))))),
+              initListExpr(expr().bind("parent"),
+                           hasType(qualType(hasCanonicalType(hasDeclaration(
+                               recordDecl(unless(matchers::matchesAnyListedName(
+                                   AllowedCallees))))))),
+                           unless(DescendantSizeCall)))))),
+      this);
+}
+
+void SuspiciousStringviewDataUsageCheck::check(
+    const MatchFinder::MatchResult &Result) {
+  const auto *DataCallExpr =
+      Result.Nodes.getNodeAs<CXXMemberCallExpr>("data-call");
+  diag(DataCallExpr->getExprLoc(),
+       "result of a `data()` call may not be null terminated, provide size "
+       "information to the callee to prevent potential issues")
+      << DataCallExpr->getCallee()->getSourceRange();
+}
+
+} // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h
new file mode 100644
index 0000000000000..31eca0a48722f
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h
@@ -0,0 +1,38 @@
+//===--- SuspiciousStringviewDataUsageCheck.h - clang-tidy -------//C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSSTRINGVIEWDATAUSAGECHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSSTRINGVIEWDATAUSAGECHECK_H
+
+#include "../ClangTidyCheck.h"
+
+namespace clang::tidy::bugprone {
+
+/// Identifies suspicious usages of std::string_view::data() that could lead to
+/// reading out-of-bounds data due to inadequate or incorrect string null
+/// termination.
+///
+/// For the user-facing documentation see:
+/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.html
+class SuspiciousStringviewDataUsageCheck : public ClangTidyCheck {
+public:
+  SuspiciousStringviewDataUsageCheck(StringRef Name, ClangTidyContext *Context);
+  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+  bool isLanguageVersionSupported(const LangOptions &LangOpts) const override;
+  std::optional<TraversalKind> getCheckTraversalKind() const override;
+
+private:
+  std::vector<llvm::StringRef> StringViewTypes;
+  std::vector<llvm::StringRef> AllowedCallees;
+};
+
+} // namespace clang::tidy::bugprone
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSSTRINGVIEWDATAUSAGECHECK_H
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp
index 243fe47c2036b..73373147e96fc 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp
@@ -31,9 +31,16 @@ AST_MATCHER_P(FunctionDecl, isInstantiatedFrom, Matcher<FunctionDecl>,
                               Finder, Builder);
 }
 
-AST_MATCHER_P(CXXMethodDecl, isOperatorOverloading,
-              llvm::SmallVector<OverloadedOperatorKind>, Kinds) {
-  return llvm::is_contained(Kinds, Node.getOverloadedOperator());
+constexpr std::initializer_list<OverloadedOperatorKind>
+    AssignmentOverloadedOperatorKinds = {
+        OO_Equal,      OO_PlusEqual,     OO_MinusEqual,          OO_StarEqual,
+        OO_SlashEqual, OO_PercentEqual,  OO_CaretEqual,          OO_AmpEqual,
+        OO_PipeEqual,  OO_LessLessEqual, OO_GreaterGreaterEqual, OO_PlusPlus,
+        OO_MinusMinus};
+
+AST_MATCHER(FunctionDecl, isAssignmentOverloadedOperator) {
+  return llvm::is_contained(AssignmentOverloadedOperatorKinds,
+                            Node.getOverloadedOperator());
 }
 } // namespace
 
@@ -164,22 +171,18 @@ void UnusedReturnValueCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
 }
 
 void UnusedReturnValueCheck::registerMatchers(MatchFinder *Finder) {
-  auto MatchedDirectCallExpr = expr(
-      callExpr(
-          callee(functionDecl(
-              // Don't match void overloads of checked functions.
-              unless(returns(voidType())),
-              // Don't match copy or move assignment operator.
-              unless(cxxMethodDecl(isOperatorOverloading(
-                  {OO_Equal, OO_PlusEqual, OO_MinusEqual, OO_StarEqual,
-                   OO_SlashEqual, OO_PercentEqual, OO_CaretEqual, OO_AmpEqual,
-                   OO_PipeEqual, OO_LessLessEqual, OO_GreaterGreaterEqual}))),
-              anyOf(
-                  isInstantiatedFrom(
-                      matchers::matchesAnyListedName(CheckedFunctions)),
-                  returns(hasCanonicalType(hasDeclaration(namedDecl(
-                      matchers::matchesAnyListedName(CheckedReturnTypes)))))))))
-          .bind("match"));
+  auto MatchedDirectCallExpr =
+      expr(callExpr(callee(functionDecl(
+                        // Don't match copy or move assignment operator.
+                        unless(isAssignmentOverloadedOperator()),
+                        // Don't match void overloads of checked functions.
+                        unless(returns(voidType())),
+                        anyOf(isInstantiatedFrom(matchers::matchesAnyListedName(
+                                  CheckedFunctions)),
+                              returns(hasCanonicalType(hasDeclaration(
+                                  namedDecl(matchers::matchesAnyListedName(
+                                      CheckedReturnTypes)))))))))
+               .bind("match"));
 
   auto CheckCastToVoid =
       AllowCastToVoid ? castExpr(unless(hasCastKind(CK_ToVoid))) : castExpr();
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp
index 9215b833573af..9b4d2ef99e5bf 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp
@@ -19,6 +19,17 @@ using namespace clang::ast_matchers::internal;
 
 namespace clang::tidy::cppcoreguidelines {
 
+namespace {
+AST_MATCHER_P(LambdaExpr, hasCallOperator, Matcher<CXXMethodDecl>,
+              InnerMatcher) {
+  return InnerMatcher.matches(*Node.getCallOperator(), Finder, Builder);
+}
+
+AST_MATCHER_P(LambdaExpr, hasLambdaBody, Matcher<Stmt>, InnerMatcher) {
+  return InnerMatcher.matches(*Node.getBody(), Finder, Builder);
+}
+} // namespace
+
 void OwningMemoryCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
   Options.store(Opts, "LegacyResourceProducers", LegacyResourceProducers);
   Options.store(Opts, "LegacyResourceConsumers", LegacyResourceConsumers);
@@ -55,6 +66,8 @@ void OwningMemoryCheck::registerMatchers(MatchFinder *Finder) {
             CreatesLegacyOwner, LegacyOwnerCast);
 
   const auto ConsideredOwner = eachOf(IsOwnerType, CreatesOwner);
+  const auto ScopeDeclaration = anyOf(translationUnitDecl(), namespaceDecl(),
+                                      recordDecl(), functionDecl());
 
   // Find delete expressions that delete non-owners.
   Finder->addMatcher(
@@ -144,13 +157,51 @@ void OwningMemoryCheck::registerMatchers(MatchFinder *Finder) {
                              .bind("bad_owner_creation_parameter"))),
                      this);
 
+  auto IsNotInSubLambda = stmt(
+      hasAncestor(
+          stmt(anyOf(equalsBoundNode("body"), lambdaExpr())).bind("scope")),
+      hasAncestor(stmt(equalsBoundNode("scope"), equalsBoundNode("body"))));
+
   // Matching on functions, that return an owner/resource, but don't declare
   // their return type as owner.
   Finder->addMatcher(
-      functionDecl(hasDescendant(returnStmt(hasReturnValue(ConsideredOwner))
-                                     .bind("bad_owner_return")),
-                   unless(returns(qualType(hasDeclaration(OwnerDecl)))))
-          .bind("function_decl"),
+      functionDecl(
+          decl().bind("function_decl"),
+          hasBody(
+              stmt(stmt().bind("body"),
+                   hasDescendant(
+                       returnStmt(hasReturnValue(ConsideredOwner),
+                                  // Ignore sub-lambda expressions
+                                  IsNotInSubLambda,
+                                  // Ignore sub-functions
+                                  hasAncestor(functionDecl().bind("context")),
+                                  hasAncestor(functionDecl(
+                                      equalsBoundNode("context"),
+                                      equalsBoundNode("function_decl"))))
+                           .bind("bad_owner_return")))),
+          returns(qualType(unless(hasDeclaration(OwnerDecl))).bind("result"))),
+      this);
+
+  // Matching on lambdas, that return an owner/resource, but don't declare
+  // their return type as owner.
+  Finder->addMatcher(
+      lambdaExpr(
+          hasAncestor(decl(ScopeDeclaration).bind("scope-decl")),
+          hasLambdaBody(
+              stmt(stmt().bind("body"),
+                   hasDescendant(
+                       returnStmt(
+                           hasReturnValue(ConsideredOwner),
+                           // Ignore sub-lambdas
+                           IsNotInSubLambda,
+                           // Ignore sub-functions
+                           hasAncestor(decl(ScopeDeclaration).bind("context")),
+                           hasAncestor(decl(equalsBoundNode("context"),
+                                            equalsBoundNode("scope-decl"))))
+                           .bind("bad_owner_return")))),
+          hasCallOperator(returns(
+              qualType(unless(hasDeclaration(OwnerDecl))).bind("result"))))
+          .bind("lambda"),
       this);
 
   // Match on classes that have an owner as member, but don't declare a
@@ -329,7 +380,7 @@ bool OwningMemoryCheck::handleReturnValues(const BoundNodes &Nodes) {
   // Function return statements, that are owners/resources, but the function
   // declaration does not declare its return value as owner.
   const auto *BadReturnType = Nodes.getNodeAs<ReturnStmt>("bad_owner_return");
-  const auto *Function = Nodes.getNodeAs<FunctionDecl>("function_decl");
+  const auto *ResultType = Nodes.getNodeAs<QualType>("result");
 
   // Function return values, that should be owners but aren't.
   if (BadReturnType) {
@@ -338,8 +389,9 @@ bool OwningMemoryCheck::handleReturnValues(const BoundNodes &Nodes) {
     diag(BadReturnType->getBeginLoc(),
          "returning a newly created resource of "
          "type %0 or 'gsl::owner<>' from a "
-         "function whose return type is not 'gsl::owner<>'")
-        << Function->getReturnType() << BadReturnType->getSourceRange();
+         "%select{function|lambda}1 whose return type is not 'gsl::owner<>'")
+        << *ResultType << (Nodes.getNodeAs<Expr>("lambda") != nullptr)
+        << BadReturnType->getSourceRange();
 
     // FIXME: Rewrite the return type as 'gsl::owner<OriginalType>'
     return true;
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index 4f02950e7794c..74152c6034510 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -81,8 +81,7 @@ void fixGenericExprCastToBool(DiagnosticBuilder &Diag,
 
   const Expr *SubExpr = Cast->getSubExpr();
 
-  bool NeedInnerParens =
-      SubExpr != nullptr && utils::fixit::areParensNeededForStatement(*SubExpr);
+  bool NeedInnerParens = utils::fixit::areParensNeededForStatement(*SubExpr);
   bool NeedOuterParens =
       Parent != nullptr && utils::fixit::areParensNeededForStatement(*Parent);
 
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index 036eb9808ea08..9e321dce4c504 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -619,7 +619,8 @@ struct CodeCompletionBuilder {
     }
     // 'CompletionItemKind::Interface' matches template type aliases.
     if (Completion.Kind == CompletionItemKind::Interface ||
-        Completion.Kind == CompletionItemKind::Class) {
+        Completion.Kind == CompletionItemKind::Class ||
+        Completion.Kind == CompletionItemKind::Variable) {
       if (Snippet->front() != '<')
         return *Snippet; // Not an arg snippet?
 
diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
index 5721feecd58ea..49337bddf98d5 100644
--- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
@@ -2648,12 +2648,15 @@ TEST(CompletionTest, CompletionFunctionArgsDisabled) {
       class foo_class{};
       template <class T>
       using foo_alias = T**;
+      template <class T>
+      T foo_var = T{};
       void f() { foo_^ })cpp",
         {}, Opts);
     EXPECT_THAT(
         Results.Completions,
         UnorderedElementsAre(AllOf(named("foo_class"), snippetSuffix("<$0>")),
-                             AllOf(named("foo_alias"), snippetSuffix("<$0>"))));
+                             AllOf(named("foo_alias"), snippetSuffix("<$0>")),
+                             AllOf(named("foo_var"), snippetSuffix("<$0>"))));
   }
   {
     auto Results = completions(
diff --git a/clang-tools-extra/clangd/unittests/SelectionTests.cpp b/clang-tools-extra/clangd/unittests/SelectionTests.cpp
index 754e8c287c514..db516a1f62a35 100644
--- a/clang-tools-extra/clangd/unittests/SelectionTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SelectionTests.cpp
@@ -10,6 +10,7 @@
 #include "SourceCode.h"
 #include "TestTU.h"
 #include "support/TestTracer.h"
+#include "clang/AST/ASTConcept.h"
 #include "clang/AST/Decl.h"
 #include "llvm/Support/Casting.h"
 #include "gmock/gmock.h"
@@ -893,6 +894,33 @@ TEST(SelectionTest, DeclContextLambda) {
   EXPECT_TRUE(ST.commonAncestor()->getDeclContext().isFunctionOrMethod());
 }
 
+TEST(SelectionTest, UsingConcepts) {
+  llvm::Annotations Test(R"cpp(
+namespace ns {
+template <typename T>
+concept Foo = true;
+}
+
+using ns::Foo;
+
+template <Fo^o... T, Fo^o auto U>
+auto Func(Fo^o auto V) -> Fo^o decltype(auto) {
+  Fo^o auto W = V;
+  return W;
+}
+)cpp");
+  auto TU = TestTU::withCode(Test.code());
+  TU.ExtraArgs.emplace_back("-std=c++2c");
+  auto AST = TU.build();
+  for (auto Point : Test.points()) {
+    auto ST = SelectionTree::createRight(AST.getASTContext(), AST.getTokens(),
+                                         Point, Point);
+    auto *C = ST.commonAncestor()->ASTNode.get<ConceptReference>();
+    EXPECT_TRUE(C && C->getFoundDecl() &&
+                C->getFoundDecl()->getKind() == Decl::UsingShadow);
+  }
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp
index 42dd612eeeec4..656b62c9a1f4e 100644
--- a/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp
+++ b/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp
@@ -72,6 +72,22 @@ TEST_F(ExtractVariableTest, Test) {
     )cpp";
   EXPECT_UNAVAILABLE(NoCrashCasesC);
 
+  ExtraArgs = {"-xc"};
+  const char *NoCrashDesignator = R"cpp(
+    struct A {
+      struct {
+        int x;
+      };
+    };
+    struct B {
+      int y;
+    };
+    void foo(struct B *b) {
+      struct A a = {.x=b[[->]]y};
+    }
+  )cpp";
+  EXPECT_AVAILABLE(NoCrashDesignator);
+
   ExtraArgs = {"-xobjective-c"};
   const char *AvailableObjC = R"cpp(
     __attribute__((objc_root_class))
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 44680f79de6f5..a604e9276668a 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -110,6 +110,13 @@ New checks
   Detects error-prone Curiously Recurring Template Pattern usage, when the CRTP
   can be constructed outside itself and the derived class.
 
+- New :doc:`bugprone-suspicious-stringview-data-usage
+  <clang-tidy/checks/bugprone/suspicious-stringview-data-usage>` check.
+
+  Identifies suspicious usages of ``std::string_view::data()`` that could lead
+  to reading out-of-bounds data due to inadequate or incorrect string null
+  termination.
+
 - New :doc:`modernize-use-designated-initializers
   <clang-tidy/checks/modernize/use-designated-initializers>` check.
 
@@ -165,6 +172,10 @@ Changes in existing checks
   giving false positives for deleted functions and fix false negative when some
   parameters are forwarded, but other aren't.
 
+- Improved :doc:`cppcoreguidelines-owning-memory
+  <clang-tidy/checks/cppcoreguidelines/owning-memory>` check to properly handle
+  return type in lambdas and in nested functions.
+
 - Cleaned up :doc:`cppcoreguidelines-prefer-member-initializer
   <clang-tidy/checks/cppcoreguidelines/prefer-member-initializer>`
   by removing enforcement of rule `C.48
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst
new file mode 100644
index 0000000000000..9b38d83601810
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst
@@ -0,0 +1,58 @@
+.. title:: clang-tidy - bugprone-suspicious-stringview-data-usage
+
+bugprone-suspicious-stringview-data-usage
+=========================================
+
+Identifies suspicious usages of ``std::string_view::data()`` that could lead to
+reading out-of-bounds data due to inadequate or incorrect string null
+termination.
+
+It warns when the result of ``data()`` is passed to a constructor or function
+without also passing the corresponding result of ``size()`` or ``length()``
+member function. Such usage can lead to unintended behavior, particularly when
+assuming the data pointed to by ``data()`` is null-terminated.
+
+The absence of a ``c_str()`` method in ``std::string_view`` often leads
+developers to use ``data()`` as a substitute, especially when interfacing with
+C APIs that expect null-terminated strings. However, since ``data()`` does not
+guarantee null termination, this can result in unintended behavior if the API
+relies on proper null termination for correct string interpretation.
+
+In today's programming landscape, this scenario can occur when implicitly
+converting an ``std::string_view`` to an ``std::string``. Since the constructor
+in ``std::string`` designed for string-view-like objects is ``explicit``,
+attempting to pass an ``std::string_view`` to a function expecting an
+``std::string`` will result in a compilation error. As a workaround, developers
+may be tempted to utilize the ``.data()`` method to achieve compilation,
+introducing potential risks.
+
+For instance:
+
+.. code-block:: c++
+
+  void printString(const std::string& str) {
+    std::cout << "String: " << str << std::endl;
+  }
+
+  void something(std::string_view sv) {
+    printString(sv.data());
+  }
+
+In this example, directly passing ``sv`` to the ``printString`` function would
+lead to a compilation error due to the explicit nature of the ``std::string``
+constructor. Consequently, developers might opt for ``sv.data()`` to resolve the
+compilation error, albeit introducing potential hazards as discussed.
+
+.. option:: StringViewTypes
+
+  Option allows users to specify custom string view-like types for analysis. It
+  accepts a semicolon-separated list of type names or regular expressions
+  matching these types. Default value is:
+  `::std::basic_string_view;::llvm::StringRef`.
+
+.. option:: AllowedCallees
+
+  Specifies methods, functions, or classes where the result of ``.data()`` is
+  passed to. Allows to exclude such calls from the analysis. Accepts a
+  semicolon-separated list of names or regular expressions matching these
+  entities. Default value is: empty string.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index d03e7af688f00..79e81dd174e4f 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -139,6 +139,7 @@ Clang-Tidy Checks
    :doc:`bugprone-suspicious-realloc-usage <bugprone/suspicious-realloc-usage>`,
    :doc:`bugprone-suspicious-semicolon <bugprone/suspicious-semicolon>`, "Yes"
    :doc:`bugprone-suspicious-string-compare <bugprone/suspicious-string-compare>`, "Yes"
+   :doc:`bugprone-suspicious-stringview-data-usage <bugprone/suspicious-stringview-data-usage>`,
    :doc:`bugprone-swapped-arguments <bugprone/swapped-arguments>`, "Yes"
    :doc:`bugprone-switch-missing-default-case <bugprone/switch-missing-default-case>`,
    :doc:`bugprone-terminating-continue <bugprone/terminating-continue>`, "Yes"
diff --git a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp
index 5dc88157e13af..dac3f39e1a658 100644
--- a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp
+++ b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp
@@ -548,9 +548,7 @@ TEST(WalkAST, Concepts) {
   testWalk(Concept, "template<typename T> requires ^Foo<T> void func() {}");
   testWalk(Concept, "template<typename T> void func() requires ^Foo<T> {}");
   testWalk(Concept, "void func(^Foo auto x) {}");
-  // FIXME: Foo should be explicitly referenced.
-  testWalk("template<typename T> concept Foo = true;",
-           "void func() { ^Foo auto x = 1; }");
+  testWalk(Concept, "void func() { ^Foo auto x = 1; }");
 }
 
 TEST(WalkAST, FriendDecl) {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string
index f2e4159a22451..28e2b4a231e52 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string
+++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string
@@ -24,6 +24,7 @@ struct basic_string {
   basic_string();
   basic_string(const C *p, const A &a = A());
   basic_string(const C *p, size_type count);
+  basic_string(const C *b, const C *e);
 
   ~basic_string();
 
@@ -85,6 +86,12 @@ struct basic_string_view {
   const C *str;
   constexpr basic_string_view(const C* s) : str(s) {}
 
+  const C *data() const;
+
+  bool empty() const;
+  size_type size() const;
+  size_type length() const;
+
   size_type find(_Type v, size_type pos = 0) const;
   size_type find(C ch, size_type pos = 0) const;
   size_type find(const C* s, size_type pos, size_type count) const;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-stringview-data-usage.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-stringview-data-usage.cpp
new file mode 100644
index 0000000000000..8072d60a2f47a
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-stringview-data-usage.cpp
@@ -0,0 +1,56 @@
+// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-suspicious-stringview-data-usage %t -- -- -isystem %clang_tidy_headers
+#include <string>
+
+struct View {
+   const char* str;
+};
+
+struct Pair {
+   const char* begin;
+   const char* end;
+};
+
+struct ViewWithSize {
+   const char* str;
+   std::string_view::size_type size;
+};
+
+void something(const char*);
+void something(const char*, unsigned);
+void something(const char*, unsigned, const char*);
+void something_str(std::string, unsigned);
+
+void invalid(std::string_view sv, std::string_view sv2) {
+  std::string s(sv.data());
+// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  std::string si{sv.data()};
+// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  std::string_view s2(sv.data());
+// CHECK-MESSAGES: :[[@LINE-1]]:26: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  something(sv.data());
+// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  something(sv.data(), sv.size(), sv2.data());
+// CHECK-MESSAGES: :[[@LINE-1]]:39: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  something_str(sv.data(), sv.size());
+// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  View view{sv.data()};
+// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+}
+
+void valid(std::string_view sv) {
+  std::string s1(sv.data(), sv.data() + sv.size());
+  std::string s2(sv.data(), sv.data() + sv.length());
+  std::string s3(sv.data(), sv.size() + sv.data());
+  std::string s4(sv.data(), sv.length() + sv.data());
+  std::string s5(sv.data(), sv.size());
+  std::string s6(sv.data(), sv.length());
+  something(sv.data(), sv.size());
+  something(sv.data(), sv.length());
+  ViewWithSize view1{sv.data(), sv.size()};
+  ViewWithSize view2{sv.data(), sv.length()};
+  Pair view3{sv.data(), sv.data() + sv.size()};
+  Pair view4{sv.data(), sv.data() + sv.length()};
+  Pair view5{sv.data(), sv.size() + sv.data()};
+  Pair view6{sv.data(), sv.length() + sv.data()};
+  const char* str{sv.data()};
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-avoid-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-avoid-assignment.cpp
index b4a41004adf89..564c07a724ccd 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-avoid-assignment.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-avoid-assignment.cpp
@@ -3,23 +3,37 @@
 // RUN:  {bugprone-unused-return-value.CheckedFunctions: "::*"}}' \
 // RUN: --
 
-struct S {
-  S(){};
-  S(S const &);
-  S(S &&);
-  S &operator=(S const &);
-  S &operator=(S &&);
-  S &operator+=(S);
+struct S1 {
+  S1(){};
+  S1(S1 const &);
+  S1(S1 &&);
+  S1 &operator=(S1 const &);
+  S1 &operator=(S1 &&);
+  S1 &operator+=(S1);
+  S1 &operator++();
+  S1 &operator++(int);
+  S1 &operator--();
+  S1 &operator--(int);
 };
 
-S returnValue();
-S const &returnRef();
+struct S2 {
+  S2(){};
+  S2(S2 const &);
+  S2(S2 &&);
+};
+
+S2 &operator-=(S2&, int);
+S2 &operator++(S2 &);
+S2 &operator++(S2 &, int);
+
+S1 returnValue();
+S1 const &returnRef();
 
 void bar() {
   returnValue();
   // CHECK-MESSAGES: [[@LINE-1]]:3: warning: the value returned by this function should not be disregarded; neglecting it may lead to errors
 
-  S a{};
+  S1 a{};
   a = returnValue();
   a.operator=(returnValue());
 
@@ -27,4 +41,15 @@ void bar() {
   a.operator=(returnRef());
 
   a += returnRef();
+
+  a++;
+  ++a;
+  a--;
+  --a;
+
+  S2 b{};
+
+  b -= 1;
+  b++;
+  ++b;
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
index eb8ad1b8b8792..574efe7bd9147 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
@@ -395,3 +395,109 @@ namespace PR63994 {
     // CHECK-NOTES: [[@LINE-1]]:5: warning: returning a newly created resource of type 'A *' or 'gsl::owner<>' from a function whose return type is not 'gsl::owner<>'
   }
 }
+
+namespace PR59389 {
+  struct S {
+    S();
+    S(int);
+
+    int value = 1;
+  };
+
+  void testLambdaInFunctionNegative() {
+    const auto MakeS = []() -> ::gsl::owner<S*> {
+      return ::gsl::owner<S*>{new S{}};
+    };
+  }
+
+  void testLambdaInFunctionPositive() {
+    const auto MakeS = []() -> S* {
+      return ::gsl::owner<S*>{new S{}};
+      // CHECK-NOTES: [[@LINE-1]]:7: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>'
+    };
+  }
+
+  void testFunctionInFunctionNegative() {
+    struct C {
+      ::gsl::owner<S*> test() {
+        return ::gsl::owner<S*>{new S{}};
+      }
+    };
+  }
+
+  void testFunctionInFunctionPositive() {
+    struct C {
+      S* test() {
+        return ::gsl::owner<S*>{new S{}};
+        // CHECK-NOTES: [[@LINE-1]]:9: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a function whose return type is not 'gsl::owner<>'
+      }
+    };
+  }
+
+  ::gsl::owner<S*> testReverseLambdaNegative() {
+    const auto MakeI = [] -> int { return 5; };
+    return ::gsl::owner<S*>{new S(MakeI())};
+  }
+
+  S* testReverseLambdaPositive() {
+    const auto MakeI = [] -> int { return 5; };
+    return ::gsl::owner<S*>{new S(MakeI())};
+    // CHECK-NOTES: [[@LINE-1]]:5: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a function whose return type is not 'gsl::owner<>'
+  }
+
+  ::gsl::owner<S*> testReverseFunctionNegative() {
+    struct C {
+      int test() { return 5; }
+    };
+    return ::gsl::owner<S*>{new S(C().test())};
+  }
+
+  S* testReverseFunctionPositive() {
+    struct C {
+      int test() { return 5; }
+    };
+    return ::gsl::owner<S*>{new S(C().test())};
+    // CHECK-NOTES: [[@LINE-1]]:5: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a function whose return type is not 'gsl::owner<>'
+  }
+
+  void testLambdaInLambdaNegative() {
+    const auto MakeS = []() -> ::gsl::owner<S*> {
+      const auto MakeI = []() -> int { return 5; };
+      return ::gsl::owner<S*>{new S(MakeI())};
+    };
+  }
+
+  void testLambdaInLambdaPositive() {
+    const auto MakeS = []() -> S* {
+      const auto MakeI = []() -> int { return 5; };
+      return ::gsl::owner<S*>{new S(MakeI())};
+      // CHECK-NOTES: [[@LINE-1]]:7: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>'
+    };
+  }
+
+  void testLambdaInLambdaWithDoubleReturns() {
+    const auto MakeS = []() -> S* {
+      const auto MakeS2 = []() -> S* {
+        return ::gsl::owner<S*>{new S(1)};
+        // CHECK-NOTES: [[@LINE-1]]:9: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>' [cppcoreguidelines-owning-memory]
+      };
+      return ::gsl::owner<S*>{new S(2)};
+      // CHECK-NOTES: [[@LINE-1]]:7: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>'
+    };
+  }
+
+  void testReverseLambdaInLambdaNegative() {
+    const auto MakeI = []() -> int {
+      const auto MakeS = []() -> ::gsl::owner<S*> { return new S(); };
+      return 5;
+    };
+  }
+
+  void testReverseLambdaInLambdaPositive() {
+    const auto MakeI = []() -> int {
+      const auto MakeS = []() -> S* { return new S(); };
+      // CHECK-NOTES: [[@LINE-1]]:39: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>'
+      return 5;
+    };
+  }
+}
diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index 44a34ca196274..302d99dccd77b 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -1091,6 +1091,29 @@ def __repr__(self):
 # Represents an @available(...) check.
 CursorKind.OBJC_AVAILABILITY_CHECK_EXPR = CursorKind(148)
 
+# Fixed point literal.
+CursorKind.FIXED_POINT_LITERAL = CursorKind(149)
+
+# OpenMP 5.0 [2.1.4, Array Shaping].
+CursorKind.OMP_ARRAY_SHAPING_EXPR = CursorKind(150)
+
+# OpenMP 5.0 [2.1.6 Iterators].
+CursorKind.OMP_ITERATOR_EXPR = CursorKind(151)
+
+# OpenCL's addrspace_cast<> expression.
+CursorKind.CXX_ADDRSPACE_CAST_EXPR = CursorKind(152)
+
+# Expression that references a C++20 concept.
+CursorKind.CONCEPT_SPECIALIZATION_EXPR = CursorKind(153)
+
+# Expression that references a C++20 requires expression.
+CursorKind.REQUIRES_EXPR = CursorKind(154)
+
+# Expression that references a C++20 parenthesized list aggregate initializer.
+CursorKind.CXX_PAREN_LIST_INIT_EXPR = CursorKind(155)
+
+# Represents a C++26 pack indexing expression.
+CursorKind.PACK_INDEXING_EXPR = CursorKind(156)
 
 # A statement whose specific kind is not exposed via this interface.
 #
@@ -1312,6 +1335,114 @@ def __repr__(self):
 # OpenMP teams distribute directive.
 CursorKind.OMP_TEAMS_DISTRIBUTE_DIRECTIVE = CursorKind(271)
 
+# OpenMP teams distribute simd directive.
+CursorKind.OMP_TEAMS_DISTRIBUTE_DIRECTIVE = CursorKind(272)
+
+# OpenMP teams distribute parallel for simd directive.
+CursorKind.OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE = CursorKind(273)
+
+# OpenMP teams distribute parallel for directive.
+CursorKind.OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE = CursorKind(274)
+
+# OpenMP target teams directive.
+CursorKind.OMP_TARGET_TEAMS_DIRECTIVE = CursorKind(275)
+
+# OpenMP target teams distribute directive.
+CursorKind.OMP_TARGET_TEAMS_DISTRIBUTE_DIRECTIVE = CursorKind(276)
+
+# OpenMP target teams distribute parallel for directive.
+CursorKind.OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE = CursorKind(277)
+
+# OpenMP target teams distribute parallel for simd directive.
+CursorKind.OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE = CursorKind(278)
+
+# OpenMP target teams distribute simd directive.
+CursorKind.OMP_TARGET_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE = CursorKind(279)
+
+# C++2a std::bit_cast expression.
+CursorKind.BUILTIN_BIT_CAST_EXPR = CursorKind(280)
+
+# OpenMP master taskloop directive.
+CursorKind.OMP_MASTER_TASK_LOOP_DIRECTIVE = CursorKind(281)
+
+# OpenMP parallel master taskloop directive.
+CursorKind.OMP_PARALLEL_MASTER_TASK_LOOP_DIRECTIVE = CursorKind(282)
+
+# OpenMP master taskloop simd directive.
+CursorKind.OMP_MASTER_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(283)
+
+# OpenMP parallel master taskloop simd directive.
+CursorKind.OMP_PARALLEL_MASTER_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(284)
+
+# OpenMP parallel master directive.
+CursorKind.OMP_PARALLEL_MASTER_DIRECTIVE = CursorKind(285)
+
+# OpenMP depobj directive.
+CursorKind.OMP_DEPOBJ_DIRECTIVE = CursorKind(286)
+
+# OpenMP scan directive.
+CursorKind.OMP_SCAN_DIRECTIVE = CursorKind(287)
+
+# OpenMP tile directive.
+CursorKind.OMP_TILE_DIRECTIVE = CursorKind(288)
+
+# OpenMP canonical loop.
+CursorKind.OMP_CANONICAL_LOOP = CursorKind(289)
+
+# OpenMP interop directive.
+CursorKind.OMP_INTEROP_DIRECTIVE = CursorKind(290)
+
+# OpenMP dispatch directive.
+CursorKind.OMP_DISPATCH_DIRECTIVE = CursorKind(291)
+
+# OpenMP masked directive.
+CursorKind.OMP_MASKED_DIRECTIVE = CursorKind(292)
+
+# OpenMP unroll directive.
+CursorKind.OMP_UNROLL_DIRECTIVE = CursorKind(293)
+
+# OpenMP metadirective directive.
+CursorKind.OMP_META_DIRECTIVE = CursorKind(294)
+
+# OpenMP loop directive.
+CursorKind.OMP_GENERIC_LOOP_DIRECTIVE = CursorKind(295)
+
+# OpenMP teams loop directive.
+CursorKind.OMP_TEAMS_GENERIC_LOOP_DIRECTIVE = CursorKind(296)
+
+# OpenMP target teams loop directive.
+CursorKind.OMP_TARGET_TEAMS_GENERIC_LOOP_DIRECTIVE = CursorKind(297)
+
+# OpenMP parallel loop directive.
+CursorKind.OMP_PARALLEL_GENERIC_LOOP_DIRECTIVE = CursorKind(298)
+
+# OpenMP target parallel loop directive.
+CursorKind.OMP_TARGET_PARALLEL_GENERIC_LOOP_DIRECTIVE = CursorKind(299)
+
+# OpenMP parallel masked directive.
+CursorKind.OMP_PARALLEL_MASKED_DIRECTIVE = CursorKind(300)
+
+# OpenMP masked taskloop directive.
+CursorKind.OMP_MASKED_TASK_LOOP_DIRECTIVE = CursorKind(301)
+
+# OpenMP masked taskloop simd directive.
+CursorKind.OMP_MASKED_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(302)
+
+# OpenMP parallel masked taskloop directive.
+CursorKind.OMP_PARALLEL_MASKED_TASK_LOOP_DIRECTIVE = CursorKind(303)
+
+# OpenMP parallel masked taskloop simd directive.
+CursorKind.OMP_PARALLEL_MASKED_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(304)
+
+# OpenMP error directive.
+CursorKind.OMP_ERROR_DIRECTIVE = CursorKind(305)
+
+# OpenMP scope directive.
+CursorKind.OMP_SCOPE_DIRECTIVE = CursorKind(306)
+
+# OpenACC Compute Construct.
+CursorKind.OPEN_ACC_COMPUTE_DIRECTIVE = CursorKind(320)
+
 ###
 # Other Kinds
 
@@ -1349,6 +1480,24 @@ def __repr__(self):
 
 CursorKind.DLLEXPORT_ATTR = CursorKind(418)
 CursorKind.DLLIMPORT_ATTR = CursorKind(419)
+CursorKind.NS_RETURNS_RETAINED = CursorKind(420)
+CursorKind.NS_RETURNS_NOT_RETAINED = CursorKind(421)
+CursorKind.NS_RETURNS_AUTORELEASED = CursorKind(422)
+CursorKind.NS_CONSUMES_SELF = CursorKind(423)
+CursorKind.NS_CONSUMED = CursorKind(424)
+CursorKind.OBJC_EXCEPTION = CursorKind(425)
+CursorKind.OBJC_NSOBJECT = CursorKind(426)
+CursorKind.OBJC_INDEPENDENT_CLASS = CursorKind(427)
+CursorKind.OBJC_PRECISE_LIFETIME = CursorKind(428)
+CursorKind.OBJC_RETURNS_INNER_POINTER = CursorKind(429)
+CursorKind.OBJC_REQUIRES_SUPER = CursorKind(430)
+CursorKind.OBJC_ROOT_CLASS = CursorKind(431)
+CursorKind.OBJC_SUBCLASSING_RESTRICTED = CursorKind(432)
+CursorKind.OBJC_EXPLICIT_PROTOCOL_IMPL = CursorKind(433)
+CursorKind.OBJC_DESIGNATED_INITIALIZER = CursorKind(434)
+CursorKind.OBJC_RUNTIME_VISIBLE = CursorKind(435)
+CursorKind.OBJC_BOXABLE = CursorKind(436)
+CursorKind.FLAG_ENUM = CursorKind(437)
 CursorKind.CONVERGENT_ATTR = CursorKind(438)
 CursorKind.WARN_UNUSED_ATTR = CursorKind(439)
 CursorKind.WARN_UNUSED_RESULT_ATTR = CursorKind(440)
@@ -1395,6 +1544,11 @@ class TemplateArgumentKind(BaseEnumeration):
 TemplateArgumentKind.DECLARATION = TemplateArgumentKind(2)
 TemplateArgumentKind.NULLPTR = TemplateArgumentKind(3)
 TemplateArgumentKind.INTEGRAL = TemplateArgumentKind(4)
+TemplateArgumentKind.TEMPLATE = TemplateArgumentKind(5)
+TemplateArgumentKind.TEMPLATE_EXPANSION = TemplateArgumentKind(6)
+TemplateArgumentKind.EXPRESSION = TemplateArgumentKind(7)
+TemplateArgumentKind.PACK = TemplateArgumentKind(8)
+TemplateArgumentKind.INVALID = TemplateArgumentKind(9)
 
 ### Exception Specification Kinds ###
 class ExceptionSpecificationKind(BaseEnumeration):
@@ -2240,9 +2394,26 @@ def __repr__(self):
 TypeKind.OCLQUEUE = TypeKind(159)
 TypeKind.OCLRESERVEID = TypeKind(160)
 
+TypeKind.OBJCOBJECT = TypeKind(161)
+TypeKind.OBJCCLASS = TypeKind(162)
+TypeKind.ATTRIBUTED = TypeKind(163)
+
+TypeKind.OCLINTELSUBGROUPAVCMCEPAYLOAD = TypeKind(164)
+TypeKind.OCLINTELSUBGROUPAVCIMEPAYLOAD = TypeKind(165)
+TypeKind.OCLINTELSUBGROUPAVCREFPAYLOAD = TypeKind(166)
+TypeKind.OCLINTELSUBGROUPAVCSICPAYLOAD = TypeKind(167)
+TypeKind.OCLINTELSUBGROUPAVCMCERESULT = TypeKind(168)
+TypeKind.OCLINTELSUBGROUPAVCIMERESULT = TypeKind(169)
+TypeKind.OCLINTELSUBGROUPAVCREFRESULT = TypeKind(170)
+TypeKind.OCLINTELSUBGROUPAVCSICRESULT = TypeKind(171)
+TypeKind.OCLINTELSUBGROUPAVCIMERESULTSINGLEREFERENCESTREAMOUT = TypeKind(172)
+TypeKind.OCLINTELSUBGROUPAVCIMERESULTSDUALREFERENCESTREAMOUT = TypeKind(173)
+TypeKind.OCLINTELSUBGROUPAVCIMERESULTSSINGLEREFERENCESTREAMIN = TypeKind(174)
+TypeKind.OCLINTELSUBGROUPAVCIMEDUALREFERENCESTREAMIN = TypeKind(175)
+
 TypeKind.EXTVECTOR = TypeKind(176)
 TypeKind.ATOMIC = TypeKind(177)
-
+TypeKind.BTFTAGATTRIBUTED = TypeKind(178)
 
 class RefQualifierKind(BaseEnumeration):
     """Describes a specific ref-qualifier of a type."""
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
index db7430b3344c3..d5546e20873b3 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -11,6 +11,7 @@ set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "
 
 set(LLVM_ENABLE_BACKTRACES OFF CACHE BOOL "")
 set(LLVM_ENABLE_DIA_SDK OFF CACHE BOOL "")
+set(LLVM_ENABLE_FATLTO ON CACHE BOOL "")
 set(LLVM_ENABLE_HTTPLIB ON CACHE BOOL "")
 set(LLVM_ENABLE_LIBCXX ON CACHE BOOL "")
 set(LLVM_ENABLE_LIBEDIT OFF CACHE BOOL "")
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 1209fd9359861..df69d7d0dd414 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -66,6 +66,7 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH
   LLDB_PYTHON_HOME
   LLDB_PYTHON_RELATIVE_PATH
   LLDB_TEST_USE_VENDOR_PACKAGES
+  LLDB_TEST_USER_ARGS
   Python3_EXECUTABLE
   Python3_LIBRARIES
   Python3_INCLUDE_DIRS
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 5b00a8f4c00fb..be021dfc5c084 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -6158,6 +6158,70 @@ the configuration (without a prefix: ``Auto``).
 **TabWidth** (``Unsigned``) :versionbadge:`clang-format 3.7` :ref:`¶ <TabWidth>`
   The number of columns used for tab stops.
 
+.. _TableGenBreakInsideDAGArg:
+
+**TableGenBreakInsideDAGArg** (``DAGArgStyle``) :versionbadge:`clang-format 19` :ref:`¶ <TableGenBreakInsideDAGArg>`
+  The styles of the line break inside the DAGArg in TableGen.
+
+  Possible values:
+
+  * ``DAS_DontBreak`` (in configuration: ``DontBreak``)
+    Never break inside DAGArg.
+
+    .. code-block:: c++
+
+      let DAGArgIns = (ins i32:$src1, i32:$src2);
+
+  * ``DAS_BreakElements`` (in configuration: ``BreakElements``)
+    Break inside DAGArg after each list element but for the last.
+    This aligns to the first element.
+
+    .. code-block:: c++
+
+      let DAGArgIns = (ins i32:$src1,
+                           i32:$src2);
+
+  * ``DAS_BreakAll`` (in configuration: ``BreakAll``)
+    Break inside DAGArg after the operator and the all elements.
+
+    .. code-block:: c++
+
+      let DAGArgIns = (ins
+          i32:$src1,
+          i32:$src2
+      );
+
+
+
+.. _TableGenBreakingDAGArgOperators:
+
+**TableGenBreakingDAGArgOperators** (``List of Strings``) :versionbadge:`clang-format 19` :ref:`¶ <TableGenBreakingDAGArgOperators>`
+  Works only when TableGenBreakInsideDAGArg is not DontBreak.
+  The string list needs to consist of identifiers in TableGen.
+  If any identifier is specified, this limits the line breaks by
+  TableGenBreakInsideDAGArg option only on DAGArg values beginning with
+  the specified identifiers.
+
+  For example the configuration,
+
+  .. code-block:: yaml
+
+    TableGenBreakInsideDAGArg: BreakAll
+    TableGenBreakingDAGArgOperators: ['ins', 'outs']
+
+  makes the line break only occurs inside DAGArgs beginning with the
+  specified identifiers 'ins' and 'outs'.
+
+
+  .. code-block:: c++
+
+    let DAGArgIns = (ins
+        i32:$src1,
+        i32:$src2
+    );
+    let DAGArgOtherID = (other i32:$other1, i32:$other2);
+    let DAGArgBang = (!cast<SomeType>("Some") i32:$src1, i32:$src2)
+
 .. _TypeNames:
 
 **TypeNames** (``List of Strings``) :versionbadge:`clang-format 17` :ref:`¶ <TypeNames>`
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index e6bf48a88b525..2c03afe665f49 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -13,6 +13,7 @@ Clang Language Extensions
    BlockLanguageSpec
    Block-ABI-Apple
    AutomaticReferenceCounting
+   PointerAuthentication
    MatrixTypes
 
 Introduction
@@ -3443,6 +3444,21 @@ Query for this feature with ``__has_builtin(__builtin_debugtrap)``.
 
 Query for this feature with ``__has_builtin(__builtin_trap)``.
 
+``__builtin_arm_trap``
+----------------------
+
+``__builtin_arm_trap`` is an AArch64 extension to ``__builtin_trap`` which also accepts a compile-time constant value, encoded directly into the trap instruction for later inspection.
+
+**Syntax**:
+
+.. code-block:: c++
+
+    __builtin_arm_trap(const unsigned short payload)
+
+**Description**
+
+``__builtin_arm_trap`` is lowered to the ``llvm.aarch64.break`` builtin, and then to ``brk #payload``.
+
 ``__builtin_nondeterministic_value``
 ------------------------------------
 
@@ -4324,6 +4340,10 @@ reordering of memory accesses and side effect instructions. Other instructions
 like simple arithmetic may be reordered around the intrinsic. If you expect to
 have no reordering at all, use inline assembly instead.
 
+Pointer Authentication
+^^^^^^^^^^^^^^^^^^^^^^
+See :doc:`PointerAuthentication`.
+
 X86/X86-64 Language Extensions
 ------------------------------
 
diff --git a/clang/docs/PointerAuthentication.rst b/clang/docs/PointerAuthentication.rst
new file mode 100644
index 0000000000000..19b3384293aed
--- /dev/null
+++ b/clang/docs/PointerAuthentication.rst
@@ -0,0 +1,485 @@
+Pointer Authentication
+======================
+
+.. contents::
+   :local:
+
+Introduction
+------------
+
+Pointer authentication is a technology which offers strong probabilistic
+protection against exploiting a broad class of memory bugs to take control of
+program execution.  When adopted consistently in a language ABI, it provides
+a form of relatively fine-grained control flow integrity (CFI) check that
+resists both return-oriented programming (ROP) and jump-oriented programming
+(JOP) attacks.
+
+While pointer authentication can be implemented purely in software, direct
+hardware support (e.g. as provided by Armv8.3 PAuth) can dramatically improve
+performance and code size.  Similarly, while pointer authentication
+can be implemented on any architecture, taking advantage of the (typically)
+excess addressing range of a target with 64-bit pointers minimizes the impact
+on memory performance and can allow interoperation with existing code (by
+disabling pointer authentication dynamically).  This document will generally
+attempt to present the pointer authentication feature independent of any
+hardware implementation or ABI.  Considerations that are
+implementation-specific are clearly identified throughout.
+
+Note that there are several different terms in use:
+
+- **Pointer authentication** is a target-independent language technology.
+
+- **PAuth** (sometimes referred to as **PAC**, for Pointer Authentication
+  Codes) is an AArch64 architecture extension that provides hardware support
+  for pointer authentication.  Additional extensions either modify some of the
+  PAuth instruction behavior (notably FPAC), or provide new instruction
+  variants (PAuth_LR).
+
+- **Armv8.3** is an AArch64 architecture revision that makes PAuth mandatory.
+
+- **arm64e** is a specific ABI (not yet fully stable) for implementing pointer
+  authentication using PAuth on certain Apple operating systems.
+
+This document serves four purposes:
+
+- It describes the basic ideas of pointer authentication.
+
+- It documents several language extensions that are useful on targets using
+  pointer authentication.
+
+- It will eventually present a theory of operation for the security mitigation,
+  describing the basic requirements for correctness, various weaknesses in the
+  mechanism, and ways in which programmers can strengthen its protections
+  (including recommendations for language implementors).
+
+- It will eventually document the language ABIs currently used for C, C++,
+  Objective-C, and Swift on arm64e, although these are not yet stable on any
+  target.
+
+Basic Concepts
+--------------
+
+The simple address of an object or function is a **raw pointer**.  A raw
+pointer can be **signed** to produce a **signed pointer**.  A signed pointer
+can be then **authenticated** in order to verify that it was **validly signed**
+and extract the original raw pointer.  These terms reflect the most likely
+implementation technique: computing and storing a cryptographic signature along
+with the pointer.
+
+An **abstract signing key** is a name which refers to a secret key which is
+used to sign and authenticate pointers.  The concrete key value for a
+particular name is consistent throughout a process.
+
+A **discriminator** is an arbitrary value used to **diversify** signed pointers
+so that one validly-signed pointer cannot simply be copied over another.
+A discriminator is simply opaque data of some implementation-defined size that
+is included in the signature as a salt (see `Discriminators`_ for details.)
+
+Nearly all aspects of pointer authentication use just these two primary
+operations:
+
+- ``sign(raw_pointer, key, discriminator)`` produces a signed pointer given
+  a raw pointer, an abstract signing key, and a discriminator.
+
+- ``auth(signed_pointer, key, discriminator)`` produces a raw pointer given
+  a signed pointer, an abstract signing key, and a discriminator.
+
+``auth(sign(raw_pointer, key, discriminator), key, discriminator)`` must
+succeed and produce ``raw_pointer``.  ``auth`` applied to a value that was
+ultimately produced in any other way is expected to fail, which halts the
+program either:
+
+- immediately, on implementations that enforce ``auth`` success (e.g., when
+  using compiler-generated ``auth`` failure checks, or Armv8.3 with the FPAC
+  extension), or
+
+- when the resulting pointer value is used, on implementations that don't.
+
+However, regardless of the implementation's handling of ``auth`` failures, it
+is permitted for ``auth`` to fail to detect that a signed pointer was not
+produced in this way, in which case it may return anything; this is what makes
+pointer authentication a probabilistic mitigation rather than a perfect one.
+
+There are two secondary operations which are required only to implement certain
+intrinsics in ``<ptrauth.h>``:
+
+- ``strip(signed_pointer, key)`` produces a raw pointer given a signed pointer
+  and a key without verifying its validity, unlike ``auth``.  This is useful
+  for certain kinds of tooling, such as crash backtraces; it should generally
+  not be used in the basic language ABI except in very careful ways.
+
+- ``sign_generic(value)`` produces a cryptographic signature for arbitrary
+  data, not necessarily a pointer.  This is useful for efficiently verifying
+  that non-pointer data has not been tampered with.
+
+Whenever any of these operations is called for, the key value must be known
+statically.  This is because the layout of a signed pointer may vary according
+to the signing key.  (For example, in Armv8.3, the layout of a signed pointer
+depends on whether Top Byte Ignore (TBI) is enabled, which can be set
+independently for I and D keys.)
+
+.. admonition:: Note for API designers and language implementors
+
+  These are the *primitive* operations of pointer authentication, provided for
+  clarity of description.  They are not suitable either as high-level
+  interfaces or as primitives in a compiler IR because they expose raw
+  pointers.  Raw pointers require special attention in the language
+  implementation to avoid the accidental creation of exploitable code
+  sequences.
+
+The following details are all implementation-defined:
+
+- the nature of a signed pointer
+- the size of a discriminator
+- the number and nature of the signing keys
+- the implementation of the ``sign``, ``auth``, ``strip``, and ``sign_generic``
+  operations
+
+While the use of the terms "sign" and "signed pointer" suggest the use of
+a cryptographic signature, other implementations may be possible.  See
+`Alternative implementations`_ for an exploration of implementation options.
+
+.. admonition:: Implementation example: Armv8.3
+
+  Readers may find it helpful to know how these terms map to Armv8.3 PAuth:
+
+  - A signed pointer is a pointer with a signature stored in the
+    otherwise-unused high bits.  The kernel configures the address width based
+    on the system's addressing needs, and enables TBI for I or D keys as
+    needed.  The bits above the address bits and below the TBI bits (if
+    enabled) are unused.  The signature width then depends on this addressing
+    configuration.
+
+  - A discriminator is a 64-bit integer.  Constant discriminators are 16-bit
+    integers.  Blending a constant discriminator into an address consists of
+    replacing the top 16 bits of the pointer containing the address with the
+    constant.  Pointers used for blending purposes should only have address
+    bits, since higher bits will be at least partially overwritten with the
+    constant discriminator.
+
+  - There are five 128-bit signing-key registers, each of which can only be
+    directly read or set by privileged code.  Of these, four are used for
+    signing pointers, and the fifth is used only for ``sign_generic``.  The key
+    data is simply a pepper added to the hash, not an encryption key, and so
+    can be initialized using random data.
+
+  - ``sign`` computes a cryptographic hash of the pointer, discriminator, and
+    signing key, and stores it in the high bits as the signature. ``auth``
+    removes the signature, computes the same hash, and compares the result with
+    the stored signature.  ``strip`` removes the signature without
+    authenticating it.  While ``aut*`` instructions do not themselves trap on
+    failure in Armv8.3 PAuth, they do with the later optional FPAC extension.
+    An implementation can also choose to emulate this trapping behavior by
+    emitting additional instructions around ``aut*``.
+
+  - ``sign_generic`` corresponds to the ``pacga`` instruction, which takes two
+    64-bit values and produces a 64-bit cryptographic hash. Implementations of
+    this instruction are not required to produce meaningful data in all bits of
+    the result.
+
+Discriminators
+~~~~~~~~~~~~~~
+
+A discriminator is arbitrary extra data which alters the signature calculated
+for a pointer.  When two pointers are signed differently --- either with
+different keys or with different discriminators --- an attacker cannot simply
+replace one pointer with the other.
+
+To use standard cryptographic terminology, a discriminator acts as a
+`salt <https://en.wikipedia.org/wiki/Salt_(cryptography)>`_ in the signing of a
+pointer, and the key data acts as a
+`pepper <https://en.wikipedia.org/wiki/Pepper_(cryptography)>`_.  That is,
+both the discriminator and key data are ultimately just added as inputs to the
+signing algorithm along with the pointer, but they serve significantly
+different roles.  The key data is a common secret added to every signature,
+whereas the discriminator is a value that can be derived from
+the context in which a specific pointer is signed.  However, unlike a password
+salt, it's important that discriminators be *independently* derived from the
+circumstances of the signing; they should never simply be stored alongside
+a pointer.  Discriminators are then re-derived in authentication operations.
+
+The intrinsic interface in ``<ptrauth.h>`` allows an arbitrary discriminator
+value to be provided, but can only be used when running normal code.  The
+discriminators used by language ABIs must be restricted to make it feasible for
+the loader to sign pointers stored in global memory without needing excessive
+amounts of metadata.  Under these restrictions, a discriminator may consist of
+either or both of the following:
+
+- The address at which the pointer is stored in memory.  A pointer signed with
+  a discriminator which incorporates its storage address is said to have
+  **address diversity**.  In general, using address diversity means that
+  a pointer cannot be reliably copied by an attacker to or from a different
+  memory location.  However, an attacker may still be able to attack a larger
+  call sequence if they can alter the address through which the pointer is
+  accessed.  Furthermore, some situations cannot use address diversity because
+  of language or other restrictions.
+
+- A constant integer, called a **constant discriminator**. A pointer signed
+  with a non-zero constant discriminator is said to have **constant
+  diversity**.  If the discriminator is specific to a single declaration, it is
+  said to have **declaration diversity**; if the discriminator is specific to
+  a type of value, it is said to have **type diversity**.  For example, C++
+  v-tables on arm64e sign their component functions using a hash of their
+  method names and signatures, which provides declaration diversity; similarly,
+  C++ member function pointers sign their invocation functions using a hash of
+  the member pointer type, which provides type diversity.
+
+The implementation may need to restrict constant discriminators to be
+significantly smaller than the full size of a discriminator.  For example, on
+arm64e, constant discriminators are only 16-bit values.  This is believed to
+not significantly weaken the mitigation, since collisions remain uncommon.
+
+The algorithm for blending a constant discriminator with a storage address is
+implementation-defined.
+
+.. _Signing schemas:
+
+Signing Schemas
+~~~~~~~~~~~~~~~
+
+Correct use of pointer authentication requires the signing code and the
+authenticating code to agree about the **signing schema** for the pointer:
+
+- the abstract signing key with which the pointer should be signed and
+- an algorithm for computing the discriminator.
+
+As described in the section above on `Discriminators`_, in most situations, the
+discriminator is produced by taking a constant discriminator and optionally
+blending it with the storage address of the pointer.  In these situations, the
+signing schema breaks down even more simply:
+
+- the abstract signing key,
+- a constant discriminator, and
+- whether to use address diversity.
+
+It is important that the signing schema be independently derived at all signing
+and authentication sites.  Preferably, the schema should be hard-coded
+everywhere it is needed, but at the very least, it must not be derived by
+inspecting information stored along with the pointer.
+
+Language Features
+-----------------
+
+There is currently one main pointer authentication language feature:
+
+- The language provides the ``<ptrauth.h>`` intrinsic interface for manually
+  signing and authenticating pointers in code.  These can be used in
+  circumstances where very specific behavior is required.
+
+
+Language Extensions
+~~~~~~~~~~~~~~~~~~~
+
+Feature Testing
+^^^^^^^^^^^^^^^
+
+Whether the current target uses pointer authentication can be tested for with
+a number of different tests.
+
+- ``__has_feature(ptrauth_intrinsics)`` is true if ``<ptrauth.h>`` provides its
+  normal interface.  This may be true even on targets where pointer
+  authentication is not enabled by default.
+
+``<ptrauth.h>``
+~~~~~~~~~~~~~~~
+
+This header defines the following types and operations:
+
+``ptrauth_key``
+^^^^^^^^^^^^^^^
+
+This ``enum`` is the type of abstract signing keys.  In addition to defining
+the set of implementation-specific signing keys (for example, Armv8.3 defines
+``ptrauth_key_asia``), it also defines some portable aliases for those keys.
+For example, ``ptrauth_key_function_pointer`` is the key generally used for
+C function pointers, which will generally be suitable for other
+function-signing schemas.
+
+In all the operation descriptions below, key values must be constant values
+corresponding to one of the implementation-specific abstract signing keys from
+this ``enum``.
+
+``ptrauth_extra_data_t``
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+This is a ``typedef`` of a standard integer type of the correct size to hold
+a discriminator value.
+
+In the signing and authentication operation descriptions below, discriminator
+values must have either pointer type or integer type. If the discriminator is
+an integer, it will be coerced to ``ptrauth_extra_data_t``.
+
+``ptrauth_blend_discriminator``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_blend_discriminator(pointer, integer)
+
+Produce a discriminator value which blends information from the given pointer
+and the given integer.
+
+Implementations may ignore some bits from each value, which is to say, the
+blending algorithm may be chosen for speed and convenience over theoretical
+strength as a hash-combining algorithm.  For example, arm64e simply overwrites
+the high 16 bits of the pointer with the low 16 bits of the integer, which can
+be done in a single instruction with an immediate integer.
+
+``pointer`` must have pointer type, and ``integer`` must have integer type. The
+result has type ``ptrauth_extra_data_t``.
+
+``ptrauth_strip``
+^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_strip(signedPointer, key)
+
+Given that ``signedPointer`` matches the layout for signed pointers signed with
+the given key, extract the raw pointer from it.  This operation does not trap
+and cannot fail, even if the pointer is not validly signed.
+
+``ptrauth_sign_unauthenticated``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_sign_unauthenticated(pointer, key, discriminator)
+
+Produce a signed pointer for the given raw pointer without applying any
+authentication or extra treatment.  This operation is not required to have the
+same behavior on a null pointer that the language implementation would.
+
+This is a treacherous operation that can easily result in signing oracles.
+Programs should use it seldom and carefully.
+
+``ptrauth_auth_and_resign``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_auth_and_resign(pointer, oldKey, oldDiscriminator, newKey, newDiscriminator)
+
+Authenticate that ``pointer`` is signed with ``oldKey`` and
+``oldDiscriminator`` and then resign the raw-pointer result of that
+authentication with ``newKey`` and ``newDiscriminator``.
+
+``pointer`` must have pointer type.  The result will have the same type as
+``pointer``.  This operation is not required to have the same behavior on
+a null pointer that the language implementation would.
+
+The code sequence produced for this operation must not be directly attackable.
+However, if the discriminator values are not constant integers, their
+computations may still be attackable.  In the future, Clang should be enhanced
+to guaranteed non-attackability if these expressions are safely-derived.
+
+``ptrauth_auth_data``
+^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_auth_data(pointer, key, discriminator)
+
+Authenticate that ``pointer`` is signed with ``key`` and ``discriminator`` and
+remove the signature.
+
+``pointer`` must have object pointer type.  The result will have the same type
+as ``pointer``.  This operation is not required to have the same behavior on
+a null pointer that the language implementation would.
+
+In the future when Clang makes safe derivation guarantees, the result of
+this operation should be considered safely-derived.
+
+``ptrauth_sign_generic_data``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_sign_generic_data(value1, value2)
+
+Computes a signature for the given pair of values, incorporating a secret
+signing key.
+
+This operation can be used to verify that arbitrary data has not been tampered
+with by computing a signature for the data, storing that signature, and then
+repeating this process and verifying that it yields the same result.  This can
+be reasonably done in any number of ways; for example, a library could compute
+an ordinary checksum of the data and just sign the result in order to get the
+tamper-resistance advantages of the secret signing key (since otherwise an
+attacker could reliably overwrite both the data and the checksum).
+
+``value1`` and ``value2`` must be either pointers or integers.  If the integers
+are larger than ``uintptr_t`` then data not representable in ``uintptr_t`` may
+be discarded.
+
+The result will have type ``ptrauth_generic_signature_t``, which is an integer
+type.  Implementations are not required to make all bits of the result equally
+significant; in particular, some implementations are known to not leave
+meaningful data in the low bits.
+
+
+
+Alternative Implementations
+---------------------------
+
+Signature Storage
+~~~~~~~~~~~~~~~~~
+
+It is not critical for the security of pointer authentication that the
+signature be stored "together" with the pointer, as it is in Armv8.3. An
+implementation could just as well store the signature in a separate word, so
+that the ``sizeof`` a signed pointer would be larger than the ``sizeof`` a raw
+pointer.
+
+Storing the signature in the high bits, as Armv8.3 does, has several trade-offs:
+
+- Disadvantage: there are substantially fewer bits available for the signature,
+  weakening the mitigation by making it much easier for an attacker to simply
+  guess the correct signature.
+
+- Disadvantage: future growth of the address space will necessarily further
+  weaken the mitigation.
+
+- Advantage: memory layouts don't change, so it's possible for
+  pointer-authentication-enabled code (for example, in a system library) to
+  efficiently interoperate with existing code, as long as pointer
+  authentication can be disabled dynamically.
+
+- Advantage: the size of a signed pointer doesn't grow, which might
+  significantly increase memory requirements, code size, and register pressure.
+
+- Advantage: the size of a signed pointer is the same as a raw pointer, so
+  generic APIs which work in types like `void *` (such as `dlsym`) can still
+  return signed pointers.  This means that clients of these APIs will not
+  require insecure code in order to correctly receive a function pointer.
+
+Hashing vs. Encrypting Pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Armv8.3 implements ``sign`` by computing a cryptographic hash and storing that
+in the spare bits of the pointer.  This means that there are relatively few
+possible values for the valid signed pointer, since the bits corresponding to
+the raw pointer are known.  Together with an ``auth`` oracle, this can make it
+computationally feasible to discover the correct signature with brute force.
+(The implementation should of course endeavor not to introduce ``auth``
+oracles, but this can be difficult, and attackers can be devious.)
+
+If the implementation can instead *encrypt* the pointer during ``sign`` and
+*decrypt* it during ``auth``, this brute-force attack becomes far less
+feasible, even with an ``auth`` oracle.  However, there are several problems
+with this idea:
+
+- It's unclear whether this kind of encryption is even possible without
+  increasing the storage size of a signed pointer.  If the storage size can be
+  increased, brute-force atacks can be equally well mitigated by simply storing
+  a larger signature.
+
+- It would likely be impossible to implement a ``strip`` operation, which might
+  make debuggers and other out-of-process tools far more difficult to write, as
+  well as generally making primitive debugging more challenging.
+
+- Implementations can benefit from being able to extract the raw pointer
+  immediately from a signed pointer.  An Armv8.3 processor executing an
+  ``auth``-and-load instruction can perform the load and ``auth`` in parallel;
+  a processor which instead encrypted the pointer would be forced to perform
+  these operations serially.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7173c1400f53f..0ce3cbe3266ef 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -47,6 +47,12 @@ C++ Specific Potentially Breaking Changes
 
 ABI Changes in This Version
 ---------------------------
+- Fixed Microsoft name mangling of implicitly defined variables used for thread
+  safe static initialization of static local variables. This change resolves
+  incompatibilities with code compiled by MSVC but might introduce
+  incompatibilities with code compiled by earlier versions of Clang when an
+  inline member function that contains a static local variable with a dynamic
+  initializer is declared with ``__declspec(dllimport)``. (#GH83616).
 
 AST Dumping Potentially Breaking Changes
 ----------------------------------------
@@ -191,6 +197,9 @@ Removed Compiler Flags
 -------------------------
 
 - The ``-freroll-loops`` flag has been removed. It had no effect since Clang 13.
+- ``-m[no-]unaligned-access`` is removed for RISC-V and LoongArch.
+  ``-m[no-]strict-align``, also supported by GCC, should be used instead.
+  (`#85350 <https://github.com/llvm/llvm-project/pull/85350>`_.)
 
 Attribute Changes in Clang
 --------------------------
@@ -241,6 +250,10 @@ Improvements to Clang's diagnostics
   such as attempting to call ``free`` on an unallocated object. Fixes
   `#79443 <https://github.com/llvm/llvm-project/issues/79443>`_.
 
+- Clang no longer warns when the ``bitand`` operator is used with boolean
+  operands, distinguishing it from potential typographical errors or unintended
+  bitwise operations. Fixes #GH77601.
+
 Improvements to Clang's time-trace
 ----------------------------------
 
@@ -271,6 +284,9 @@ Bug Fixes in This Version
   for variables created through copy initialization having side-effects in C++17 and later.
   Fixes (#GH64356) (#GH79518).
 
+- Fix value of predefined macro ``__FUNCTION__`` in MSVC compatibility mode.
+  Fixes (#GH66114).
+
 - Clang now emits errors for explicit specializations/instatiations of lambda call
   operator.
   Fixes (#GH83267).
@@ -278,6 +294,16 @@ Bug Fixes in This Version
 - Clang now correctly generates overloads for bit-precise integer types for
   builtin operators in C++. Fixes #GH82998.
 
+- When performing mixed arithmetic between ``_Complex`` floating-point types and integers,
+  Clang now correctly promotes the integer to its corresponding real floating-point
+  type only rather than to the complex type (e.g. ``_Complex float / int`` is now evaluated
+  as ``_Complex float / float`` rather than ``_Complex float / _Complex float``), as mandated
+  by the C standard. This significantly improves codegen of `*` and `/` especially.
+  Fixes (`#31205 <https://github.com/llvm/llvm-project/issues/31205>`_).
+
+- Fixes an assertion failure on invalid code when trying to define member
+  functions in lambdas.
+
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -366,9 +392,20 @@ Bug Fixes to C++ Support
   and (`#74494 <https://github.com/llvm/llvm-project/issues/74494>`_)
 - Allow access to a public template alias declaration that refers to friend's
   private nested type. (#GH25708).
+- Fixed a crash in constant evaluation when trying to access a
+  captured ``this`` pointer in a lambda with an explicit object parameter.
+  Fixes (#GH80997)
+- Fix an issue where missing set friend declaration in template class instantiation.
+  Fixes (#GH84368).
+- Fixed a crash while checking constraints of a trailing requires-expression of a lambda, that the
+  expression references to an entity declared outside of the lambda. (#GH64808)
+- Clang's __builtin_bit_cast will now produce a constant value for records with empty bases. See:
+  (#GH82383)
+- Fix a crash when instantiating a lambda that captures ``this`` outside of its context. Fixes (#GH85343).
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
+- Clang now properly preserves ``FoundDecls`` within a ``ConceptReference``. (#GH82628)
 
 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -412,6 +449,8 @@ Arm and AArch64 Support
   like ``target_version`` or ``target_clones``.
 - Support has been added for the following processors (-mcpu identifiers in parenthesis):
     * Arm Cortex-A78AE (cortex-a78ae).
+    * Arm Cortex-A520AE (cortex-a520ae).
+    * Arm Cortex-A720AE (cortex-a720ae).
 
 Android Support
 ^^^^^^^^^^^^^^^
@@ -419,6 +458,26 @@ Android Support
 Windows Support
 ^^^^^^^^^^^^^^^
 
+- Clang-cl now supports function targets with intrinsic headers. This allows
+  for runtime feature detection of intrinsics. Previously under clang-cl
+  ``immintrin.h`` and similar intrinsic headers would only include the intrinsics
+  if building with that feature enabled at compile time, e.g. ``avxintrin.h``
+  would only be included if AVX was enabled at compile time. This was done to work
+  around include times from MSVC STL including ``intrin.h`` under clang-cl.
+  Clang-cl now provides ``intrin0.h`` for MSVC STL and therefore all intrinsic
+  features without requiring enablement at compile time.
+  Fixes: (`#53520 <https://github.com/llvm/llvm-project/issues/53520>`_)
+
+- Improved compile times with MSVC STL. MSVC provides ``intrin0.h`` which is a
+  header that only includes intrinsics that are used by MSVC STL to avoid the
+  use of ``intrin.h``. MSVC STL when compiled under clang uses ``intrin.h``
+  instead. Clang-cl now provides ``intrin0.h`` for the same compiler throughput
+  purposes as MSVC. Clang-cl also provides ``yvals_core.h`` to redefine
+  ``_STL_INTRIN_HEADER`` to expand to ``intrin0.h`` instead of ``intrin.h``.
+  This also means that if all intrinsic features are enabled at compile time
+  including STL headers will no longer slow down compile times since ``intrin.h``
+  is not included from MSVC STL.
+
 LoongArch Support
 ^^^^^^^^^^^^^^^^^
 
@@ -479,6 +538,10 @@ libclang
 Static Analyzer
 ---------------
 
+- Fixed crashing on loops if the loop variable was declared in switch blocks
+  but not under any case blocks if ``unroll-loops=true`` analyzer config is
+  set. (#GH68819)
+
 New features
 ^^^^^^^^^^^^
 
@@ -513,6 +576,8 @@ Python Binding Changes
 ----------------------
 
 - Exposed `CXRewriter` API as `class Rewriter`.
+- Add some missing kinds from Index.h (CursorKind: 149-156, 272-320, 420-437.
+  TemplateArgumentKind: 5-9. TypeKind: 161-175 and 178).
 
 OpenMP Support
 --------------
diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt
index 40ab76fa26a9e..70687c23b15e6 100644
--- a/clang/docs/tools/clang-formatted-files.txt
+++ b/clang/docs/tools/clang-formatted-files.txt
@@ -122,6 +122,7 @@ clang/include/clang/Analysis/MacroExpansionContext.h
 clang/include/clang/Analysis/Analyses/CalledOnceCheck.h
 clang/include/clang/Analysis/Analyses/CFGReachabilityAnalysis.h
 clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h
+clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
 clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h
 clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
 clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
@@ -306,7 +307,7 @@ clang/include/clang-c/Index.h
 clang/lib/Analysis/CalledOnceCheck.cpp
 clang/lib/Analysis/CloneDetection.cpp
 clang/lib/Analysis/CodeInjector.cpp
-clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp
+clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
 clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
 clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
 clang/lib/Analysis/FlowSensitive/DebugSupport.cpp
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index 2e60f746cd61e..62a05d9497a3a 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -1675,7 +1675,7 @@ enum CXCursorKind {
   CXCursor_ConceptSpecializationExpr = 153,
 
   /**
-   * Expression that references a C++20 concept.
+   * Expression that references a C++20 requires expression.
    */
   CXCursor_RequiresExpr = 154,
 
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 834e339304dd2..07e99f8a2cae1 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -250,6 +250,8 @@ class ASTContext : public RefCountedBase<ASTContext> {
       DependentBitIntTypes;
   llvm::FoldingSet<BTFTagAttributedType> BTFTagAttributedTypes;
 
+  mutable llvm::FoldingSet<CountAttributedType> CountAttributedTypes;
+
   mutable llvm::FoldingSet<QualifiedTemplateName> QualifiedTemplateNames;
   mutable llvm::FoldingSet<DependentTemplateName> DependentTemplateNames;
   mutable llvm::FoldingSet<SubstTemplateTemplateParmStorage>
@@ -1346,6 +1348,11 @@ class ASTContext : public RefCountedBase<ASTContext> {
     return CanQualType::CreateUnsafe(getPointerType((QualType) T));
   }
 
+  QualType
+  getCountAttributedType(QualType T, Expr *CountExpr, bool CountInBytes,
+                         bool OrNull,
+                         ArrayRef<TypeCoupledDeclRefInfo> DependentDecls) const;
+
   /// Return the uniqued reference to a type adjusted from the original
   /// type to a new type.
   QualType getAdjustedType(QualType Orig, QualType New) const;
diff --git a/clang/include/clang/AST/Availability.h b/clang/include/clang/AST/Availability.h
index ae3acbeffe7f1..26ae622e5b449 100644
--- a/clang/include/clang/AST/Availability.h
+++ b/clang/include/clang/AST/Availability.h
@@ -67,6 +67,7 @@ struct AvailabilityInfo {
   VersionTuple Introduced;
   VersionTuple Deprecated;
   VersionTuple Obsoleted;
+  bool Unavailable = false;
   bool UnconditionallyDeprecated = false;
   bool UnconditionallyUnavailable = false;
 
@@ -75,6 +76,15 @@ struct AvailabilityInfo {
   /// Determine if this AvailabilityInfo represents the default availability.
   bool isDefault() const { return *this == AvailabilityInfo(); }
 
+  /// Check if the symbol has been obsoleted.
+  bool isObsoleted() const { return !Obsoleted.empty(); }
+
+  /// Check if the symbol is unavailable unconditionally or
+  /// on the active platform and os version.
+  bool isUnavailable() const {
+    return Unavailable || isUnconditionallyUnavailable();
+  }
+
   /// Check if the symbol is unconditionally deprecated.
   ///
   /// i.e. \code __attribute__((deprecated)) \endcode
@@ -88,9 +98,10 @@ struct AvailabilityInfo {
   }
 
   AvailabilityInfo(StringRef Domain, VersionTuple I, VersionTuple D,
-                   VersionTuple O, bool UD, bool UU)
+                   VersionTuple O, bool U, bool UD, bool UU)
       : Domain(Domain), Introduced(I), Deprecated(D), Obsoleted(O),
-        UnconditionallyDeprecated(UD), UnconditionallyUnavailable(UU) {}
+        Unavailable(U), UnconditionallyDeprecated(UD),
+        UnconditionallyUnavailable(UU) {}
 
   friend bool operator==(const AvailabilityInfo &Lhs,
                          const AvailabilityInfo &Rhs);
@@ -102,10 +113,10 @@ struct AvailabilityInfo {
 inline bool operator==(const AvailabilityInfo &Lhs,
                        const AvailabilityInfo &Rhs) {
   return std::tie(Lhs.Introduced, Lhs.Deprecated, Lhs.Obsoleted,
-                  Lhs.UnconditionallyDeprecated,
+                  Lhs.Unavailable, Lhs.UnconditionallyDeprecated,
                   Lhs.UnconditionallyUnavailable) ==
          std::tie(Rhs.Introduced, Rhs.Deprecated, Rhs.Obsoleted,
-                  Rhs.UnconditionallyDeprecated,
+                  Rhs.Unavailable, Rhs.UnconditionallyDeprecated,
                   Rhs.UnconditionallyUnavailable);
 }
 
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index d1033e97024dd..724b2193f904e 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -2045,7 +2045,8 @@ class PredefinedExpr final
   }
 
   static std::string ComputeName(PredefinedIdentKind IK,
-                                 const Decl *CurrentDecl);
+                                 const Decl *CurrentDecl,
+                                 bool ForceElaboratedPrinting = false);
 
   SourceLocation getBeginLoc() const { return getLocation(); }
   SourceLocation getEndLoc() const { return getLocation(); }
diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td
index 0270c086d06b6..6df1d93a7ba2e 100644
--- a/clang/include/clang/AST/PropertiesBase.td
+++ b/clang/include/clang/AST/PropertiesBase.td
@@ -143,6 +143,7 @@ def UInt32 : CountPropertyType<"uint32_t">;
 def UInt64 : CountPropertyType<"uint64_t">;
 def UnaryTypeTransformKind : EnumPropertyType<"UnaryTransformType::UTTKind">;
 def VectorKind : EnumPropertyType<"VectorKind">;
+def TypeCoupledDeclRefInfo : PropertyType;
 
 def ExceptionSpecInfo : PropertyType<"FunctionProtoType::ExceptionSpecInfo"> {
   let BufferElementTypes = [ QualType ];
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 72256e61682c4..079a6496c6941 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -1110,6 +1110,12 @@ DEF_TRAVERSE_TYPE(InjectedClassNameType, {})
 DEF_TRAVERSE_TYPE(AttributedType,
                   { TRY_TO(TraverseType(T->getModifiedType())); })
 
+DEF_TRAVERSE_TYPE(CountAttributedType, {
+  if (T->getCountExpr())
+    TRY_TO(TraverseStmt(T->getCountExpr()));
+  TRY_TO(TraverseType(T->desugar()));
+})
+
 DEF_TRAVERSE_TYPE(BTFTagAttributedType,
                   { TRY_TO(TraverseType(T->getWrappedType())); })
 
@@ -1401,6 +1407,9 @@ DEF_TRAVERSE_TYPELOC(MacroQualifiedType,
 DEF_TRAVERSE_TYPELOC(AttributedType,
                      { TRY_TO(TraverseTypeLoc(TL.getModifiedLoc())); })
 
+DEF_TRAVERSE_TYPELOC(CountAttributedType,
+                     { TRY_TO(TraverseTypeLoc(TL.getInnerLoc())); })
+
 DEF_TRAVERSE_TYPELOC(BTFTagAttributedType,
                      { TRY_TO(TraverseTypeLoc(TL.getWrappedLoc())); })
 
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 46b552aced428..f699ce8fc5cb2 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -61,6 +61,7 @@ class BTFTypeTagAttr;
 class ExtQuals;
 class QualType;
 class ConceptDecl;
+class ValueDecl;
 class TagDecl;
 class TemplateParameterList;
 class Type;
@@ -2000,6 +2001,21 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
     unsigned NumExpansions;
   };
 
+  class CountAttributedTypeBitfields {
+    friend class CountAttributedType;
+
+    LLVM_PREFERRED_TYPE(TypeBitfields)
+    unsigned : NumTypeBits;
+
+    static constexpr unsigned NumCoupledDeclsBits = 4;
+    unsigned NumCoupledDecls : NumCoupledDeclsBits;
+    LLVM_PREFERRED_TYPE(bool)
+    unsigned CountInBytes : 1;
+    LLVM_PREFERRED_TYPE(bool)
+    unsigned OrNull : 1;
+  };
+  static_assert(sizeof(CountAttributedTypeBitfields) <= sizeof(unsigned));
+
   union {
     TypeBitfields TypeBits;
     ArrayTypeBitfields ArrayTypeBits;
@@ -2022,6 +2038,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
     DependentTemplateSpecializationTypeBitfields
       DependentTemplateSpecializationTypeBits;
     PackExpansionTypeBitfields PackExpansionTypeBits;
+    CountAttributedTypeBitfields CountAttributedTypeBits;
   };
 
 private:
@@ -2244,6 +2261,8 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isFloatingType() const;     // C99 6.2.5p11 (real floating + complex)
   bool isHalfType() const;         // OpenCL 6.1.1.1, NEON (IEEE 754-2008 half)
   bool isFloat16Type() const;      // C11 extension ISO/IEC TS 18661
+  bool isFloat32Type() const;
+  bool isDoubleType() const;
   bool isBFloat16Type() const;
   bool isFloat128Type() const;
   bool isIbm128Type() const;
@@ -2262,6 +2281,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isFunctionProtoType() const { return getAs<FunctionProtoType>(); }
   bool isPointerType() const;
   bool isAnyPointerType() const;   // Any C pointer or ObjC object pointer
+  bool isCountAttributedType() const;
   bool isBlockPointerType() const;
   bool isVoidPointerType() const;
   bool isReferenceType() const;
@@ -2728,6 +2748,14 @@ template <> const TemplateSpecializationType *Type::getAs() const;
 /// until it reaches an AttributedType or a non-sugared type.
 template <> const AttributedType *Type::getAs() const;
 
+/// This will check for a BoundsAttributedType by removing any existing
+/// sugar until it reaches an BoundsAttributedType or a non-sugared type.
+template <> const BoundsAttributedType *Type::getAs() const;
+
+/// This will check for a CountAttributedType by removing any existing
+/// sugar until it reaches an CountAttributedType or a non-sugared type.
+template <> const CountAttributedType *Type::getAs() const;
+
 // We can do canonical leaf types faster, because we don't have to
 // worry about preserving child type decoration.
 #define TYPE(Class, Base)
@@ -2930,6 +2958,136 @@ class PointerType : public Type, public llvm::FoldingSetNode {
   static bool classof(const Type *T) { return T->getTypeClass() == Pointer; }
 };
 
+/// [BoundsSafety] Represents information of declarations referenced by the
+/// arguments of the `counted_by` attribute and the likes.
+class TypeCoupledDeclRefInfo {
+public:
+  using BaseTy = llvm::PointerIntPair<ValueDecl *, 1, unsigned>;
+
+private:
+  enum {
+    DerefShift = 0,
+    DerefMask = 1,
+  };
+  BaseTy Data;
+
+public:
+  /// \p D is to a declaration referenced by the argument of attribute. \p Deref
+  /// indicates whether \p D is referenced as a dereferenced form, e.g., \p
+  /// Deref is true for `*n` in `int *__counted_by(*n)`.
+  TypeCoupledDeclRefInfo(ValueDecl *D = nullptr, bool Deref = false);
+
+  bool isDeref() const;
+  ValueDecl *getDecl() const;
+  unsigned getInt() const;
+  void *getOpaqueValue() const;
+  bool operator==(const TypeCoupledDeclRefInfo &Other) const;
+  void setFromOpaqueValue(void *V);
+};
+
+/// [BoundsSafety] Represents a parent type class for CountAttributedType and
+/// similar sugar types that will be introduced to represent a type with a
+/// bounds attribute.
+///
+/// Provides a common interface to navigate declarations referred to by the
+/// bounds expression.
+
+class BoundsAttributedType : public Type, public llvm::FoldingSetNode {
+  QualType WrappedTy;
+
+protected:
+  ArrayRef<TypeCoupledDeclRefInfo> Decls; // stored in trailing objects
+
+  BoundsAttributedType(TypeClass TC, QualType Wrapped, QualType Canon);
+
+public:
+  bool isSugared() const { return true; }
+  QualType desugar() const { return WrappedTy; }
+
+  using decl_iterator = const TypeCoupledDeclRefInfo *;
+  using decl_range = llvm::iterator_range<decl_iterator>;
+
+  decl_iterator dependent_decl_begin() const { return Decls.begin(); }
+  decl_iterator dependent_decl_end() const { return Decls.end(); }
+
+  unsigned getNumCoupledDecls() const { return Decls.size(); }
+
+  decl_range dependent_decls() const {
+    return decl_range(dependent_decl_begin(), dependent_decl_end());
+  }
+
+  ArrayRef<TypeCoupledDeclRefInfo> getCoupledDecls() const {
+    return {dependent_decl_begin(), dependent_decl_end()};
+  }
+
+  bool referencesFieldDecls() const;
+
+  static bool classof(const Type *T) {
+    // Currently, only `class CountAttributedType` inherits
+    // `BoundsAttributedType` but the subclass will grow as we add more bounds
+    // annotations.
+    switch (T->getTypeClass()) {
+    case CountAttributed:
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+
+/// Represents a sugar type with `__counted_by` or `__sized_by` annotations,
+/// including their `_or_null` variants.
+class CountAttributedType final
+    : public BoundsAttributedType,
+      public llvm::TrailingObjects<CountAttributedType,
+                                   TypeCoupledDeclRefInfo> {
+  friend class ASTContext;
+
+  Expr *CountExpr;
+  /// \p CountExpr represents the argument of __counted_by or the likes. \p
+  /// CountInBytes indicates that \p CountExpr is a byte count (i.e.,
+  /// __sized_by(_or_null)) \p OrNull means it's an or_null variant (i.e.,
+  /// __counted_by_or_null or __sized_by_or_null) \p CoupledDecls contains the
+  /// list of declarations referenced by \p CountExpr, which the type depends on
+  /// for the bounds information.
+  CountAttributedType(QualType Wrapped, QualType Canon, Expr *CountExpr,
+                      bool CountInBytes, bool OrNull,
+                      ArrayRef<TypeCoupledDeclRefInfo> CoupledDecls);
+
+  unsigned numTrailingObjects(OverloadToken<TypeCoupledDeclRefInfo>) const {
+    return CountAttributedTypeBits.NumCoupledDecls;
+  }
+
+public:
+  enum DynamicCountPointerKind {
+    CountedBy = 0,
+    SizedBy,
+    CountedByOrNull,
+    SizedByOrNull,
+  };
+
+  Expr *getCountExpr() const { return CountExpr; }
+  bool isCountInBytes() const { return CountAttributedTypeBits.CountInBytes; }
+  bool isOrNull() const { return CountAttributedTypeBits.OrNull; }
+
+  DynamicCountPointerKind getKind() const {
+    if (isOrNull())
+      return isCountInBytes() ? SizedByOrNull : CountedByOrNull;
+    return isCountInBytes() ? SizedBy : CountedBy;
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) {
+    Profile(ID, desugar(), CountExpr, isCountInBytes(), isOrNull());
+  }
+
+  static void Profile(llvm::FoldingSetNodeID &ID, QualType WrappedTy,
+                      Expr *CountExpr, bool CountInBytes, bool Nullable);
+
+  static bool classof(const Type *T) {
+    return T->getTypeClass() == CountAttributed;
+  }
+};
+
 /// Represents a type which was implicitly adjusted by the semantic
 /// engine for arbitrary reasons.  For example, array and function types can
 /// decay, and function types can have their calling conventions adjusted.
@@ -7480,6 +7638,14 @@ inline bool Type::isFloat16Type() const {
   return isSpecificBuiltinType(BuiltinType::Float16);
 }
 
+inline bool Type::isFloat32Type() const {
+  return isSpecificBuiltinType(BuiltinType::Float);
+}
+
+inline bool Type::isDoubleType() const {
+  return isSpecificBuiltinType(BuiltinType::Double);
+}
+
 inline bool Type::isBFloat16Type() const {
   return isSpecificBuiltinType(BuiltinType::BFloat16);
 }
diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h
index 32028e877fc8d..b09eb3539a4ba 100644
--- a/clang/include/clang/AST/TypeLoc.h
+++ b/clang/include/clang/AST/TypeLoc.h
@@ -1120,6 +1120,32 @@ class ObjCInterfaceTypeLoc : public ConcreteTypeLoc<ObjCObjectTypeLoc,
   }
 };
 
+struct BoundsAttributedLocInfo {};
+class BoundsAttributedTypeLoc
+    : public ConcreteTypeLoc<UnqualTypeLoc, BoundsAttributedTypeLoc,
+                             BoundsAttributedType, BoundsAttributedLocInfo> {
+public:
+  TypeLoc getInnerLoc() const { return getInnerTypeLoc(); }
+  QualType getInnerType() const { return getTypePtr()->desugar(); }
+  void initializeLocal(ASTContext &Context, SourceLocation Loc) {
+    // nothing to do
+  }
+  // LocalData is empty and TypeLocBuilder doesn't handle DataSize 1.
+  unsigned getLocalDataSize() const { return 0; }
+};
+
+class CountAttributedTypeLoc final
+    : public InheritingConcreteTypeLoc<BoundsAttributedTypeLoc,
+                                       CountAttributedTypeLoc,
+                                       CountAttributedType> {
+public:
+  Expr *getCountExpr() const { return getTypePtr()->getCountExpr(); }
+  bool isCountInBytes() const { return getTypePtr()->isCountInBytes(); }
+  bool isOrNull() const { return getTypePtr()->isOrNull(); }
+
+  SourceRange getLocalSourceRange() const;
+};
+
 struct MacroQualifiedLocInfo {
   SourceLocation ExpansionLoc;
 };
diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td
index ccee29e57dae1..b8cc8982b45b5 100644
--- a/clang/include/clang/AST/TypeProperties.td
+++ b/clang/include/clang/AST/TypeProperties.td
@@ -25,6 +25,25 @@ let Class = PointerType in {
   def : Creator<[{ return ctx.getPointerType(pointeeType); }]>;
 }
 
+let Class = CountAttributedType in {
+  def : Property<"WrappedTy", QualType> {
+    let Read = [{ node->desugar() }];
+  }
+  def : Property<"CountExpr", ExprRef> {
+    let Read = [{ node->getCountExpr() }];
+  }
+  def : Property<"CountInBytes", Bool> {
+    let Read = [{ node->isCountInBytes() }];
+  }
+  def : Property<"OrNull", Bool> {
+    let Read = [{ node->isOrNull() }];
+  }
+  def : Property<"CoupledDecls", Array<TypeCoupledDeclRefInfo>> {
+    let Read = [{ node->getCoupledDecls() }];
+  }
+  def : Creator<[{ return ctx.getCountAttributedType(WrappedTy, CountExpr, CountInBytes, OrNull, CoupledDecls); }]>;
+}
+
 let Class = AdjustedType in {
   def : Property<"originalType", QualType> {
     let Read = [{ node->getOriginalType() }];
diff --git a/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
new file mode 100644
index 0000000000000..420f13ce11bfd
--- /dev/null
+++ b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
@@ -0,0 +1,96 @@
+//===-- AdornedCFG.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines an AdornedCFG class that is used by dataflow analyses that
+//  run over Control-Flow Graphs (CFGs).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ADORNEDCFG_H
+#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ADORNEDCFG_H
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/Stmt.h"
+#include "clang/Analysis/CFG.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Error.h"
+#include <memory>
+#include <utility>
+
+namespace clang {
+namespace dataflow {
+
+/// Holds CFG with additional information derived from it that is needed to
+/// perform dataflow analysis.
+class AdornedCFG {
+public:
+  /// Builds an `AdornedCFG` from a `FunctionDecl`.
+  /// `Func.doesThisDeclarationHaveABody()` must be true, and
+  /// `Func.isTemplated()` must be false.
+  static llvm::Expected<AdornedCFG> build(const FunctionDecl &Func);
+
+  /// Builds an `AdornedCFG` from an AST node. `D` is the function in which
+  /// `S` resides. `D.isTemplated()` must be false.
+  static llvm::Expected<AdornedCFG> build(const Decl &D, Stmt &S,
+                                          ASTContext &C);
+
+  /// Returns the `Decl` containing the statement used to construct the CFG, if
+  /// available.
+  const Decl &getDecl() const { return ContainingDecl; }
+
+  /// Returns the CFG that is stored in this context.
+  const CFG &getCFG() const { return *Cfg; }
+
+  /// Returns a mapping from statements to basic blocks that contain them.
+  const llvm::DenseMap<const Stmt *, const CFGBlock *> &getStmtToBlock() const {
+    return StmtToBlock;
+  }
+
+  /// Returns whether `B` is reachable from the entry block.
+  bool isBlockReachable(const CFGBlock &B) const {
+    return BlockReachable[B.getBlockID()];
+  }
+
+  /// Returns whether `B` contains an expression that is consumed in a
+  /// different block than `B` (i.e. the parent of the expression is in a
+  /// different block).
+  /// This happens if there is control flow within a full-expression (triggered
+  /// by `&&`, `||`, or the conditional operator). Note that the operands of
+  /// these operators are not the only expressions that can be consumed in a
+  /// different block. For example, in the function call
+  /// `f(&i, cond() ? 1 : 0)`, `&i` is in a different block than the `CallExpr`.
+  bool containsExprConsumedInDifferentBlock(const CFGBlock &B) const {
+    return ContainsExprConsumedInDifferentBlock.contains(&B);
+  }
+
+private:
+  AdornedCFG(
+      const Decl &D, std::unique_ptr<CFG> Cfg,
+      llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock,
+      llvm::BitVector BlockReachable,
+      llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock)
+      : ContainingDecl(D), Cfg(std::move(Cfg)),
+        StmtToBlock(std::move(StmtToBlock)),
+        BlockReachable(std::move(BlockReachable)),
+        ContainsExprConsumedInDifferentBlock(
+            std::move(ContainsExprConsumedInDifferentBlock)) {}
+
+  /// The `Decl` containing the statement used to construct the CFG.
+  const Decl &ContainingDecl;
+  std::unique_ptr<CFG> Cfg;
+  llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock;
+  llvm::BitVector BlockReachable;
+  llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock;
+};
+
+} // namespace dataflow
+} // namespace clang
+
+#endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ADORNEDCFG_H
diff --git a/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h
index 9a0a00f3c0134..3972962d0b2da 100644
--- a/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h
+++ b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h
@@ -6,89 +6,20 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file defines a ControlFlowContext class that is used by dataflow
-//  analyses that run over Control-Flow Graphs (CFGs).
+//  This file defines a deprecated alias for AdornedCFG.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CONTROLFLOWCONTEXT_H
 #define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CONTROLFLOWCONTEXT_H
 
-#include "clang/AST/ASTContext.h"
-#include "clang/AST/Decl.h"
-#include "clang/AST/Stmt.h"
-#include "clang/Analysis/CFG.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Support/Error.h"
-#include <memory>
-#include <utility>
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 
 namespace clang {
 namespace dataflow {
 
-/// Holds CFG and other derived context that is needed to perform dataflow
-/// analysis.
-class ControlFlowContext {
-public:
-  /// Builds a ControlFlowContext from a `FunctionDecl`.
-  /// `Func.doesThisDeclarationHaveABody()` must be true, and
-  /// `Func.isTemplated()` must be false.
-  static llvm::Expected<ControlFlowContext> build(const FunctionDecl &Func);
-
-  /// Builds a ControlFlowContext from an AST node. `D` is the function in which
-  /// `S` resides. `D.isTemplated()` must be false.
-  static llvm::Expected<ControlFlowContext> build(const Decl &D, Stmt &S,
-                                                  ASTContext &C);
-
-  /// Returns the `Decl` containing the statement used to construct the CFG, if
-  /// available.
-  const Decl &getDecl() const { return ContainingDecl; }
-
-  /// Returns the CFG that is stored in this context.
-  const CFG &getCFG() const { return *Cfg; }
-
-  /// Returns a mapping from statements to basic blocks that contain them.
-  const llvm::DenseMap<const Stmt *, const CFGBlock *> &getStmtToBlock() const {
-    return StmtToBlock;
-  }
-
-  /// Returns whether `B` is reachable from the entry block.
-  bool isBlockReachable(const CFGBlock &B) const {
-    return BlockReachable[B.getBlockID()];
-  }
-
-  /// Returns whether `B` contains an expression that is consumed in a
-  /// different block than `B` (i.e. the parent of the expression is in a
-  /// different block).
-  /// This happens if there is control flow within a full-expression (triggered
-  /// by `&&`, `||`, or the conditional operator). Note that the operands of
-  /// these operators are not the only expressions that can be consumed in a
-  /// different block. For example, in the function call
-  /// `f(&i, cond() ? 1 : 0)`, `&i` is in a different block than the `CallExpr`.
-  bool containsExprConsumedInDifferentBlock(const CFGBlock &B) const {
-    return ContainsExprConsumedInDifferentBlock.contains(&B);
-  }
-
-private:
-  ControlFlowContext(
-      const Decl &D, std::unique_ptr<CFG> Cfg,
-      llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock,
-      llvm::BitVector BlockReachable,
-      llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock)
-      : ContainingDecl(D), Cfg(std::move(Cfg)),
-        StmtToBlock(std::move(StmtToBlock)),
-        BlockReachable(std::move(BlockReachable)),
-        ContainsExprConsumedInDifferentBlock(
-            std::move(ContainsExprConsumedInDifferentBlock)) {}
-
-  /// The `Decl` containing the statement used to construct the CFG.
-  const Decl &ContainingDecl;
-  std::unique_ptr<CFG> Cfg;
-  llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock;
-  llvm::BitVector BlockReachable;
-  llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock;
-};
+// This is a deprecated alias. Use `AdornedCFG` instead.
+using ControlFlowContext = AdornedCFG;
 
 } // namespace dataflow
 } // namespace clang
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
index 3c84704d0d6ce..67eccdd030dcd 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
@@ -22,7 +22,7 @@
 
 #include "clang/AST/ASTContext.h"
 #include "clang/Analysis/CFG.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/DataflowLattice.h"
 #include "clang/Analysis/FlowSensitive/MatchSwitch.h"
@@ -195,8 +195,7 @@ template <typename AnalysisT>
 llvm::Expected<std::vector<
     std::optional<DataflowAnalysisState<typename AnalysisT::Lattice>>>>
 runDataflowAnalysis(
-    const ControlFlowContext &CFCtx, AnalysisT &Analysis,
-    const Environment &InitEnv,
+    const AdornedCFG &ACFG, AnalysisT &Analysis, const Environment &InitEnv,
     std::function<void(const CFGElement &, const DataflowAnalysisState<
                                                typename AnalysisT::Lattice> &)>
         PostVisitCFG = nullptr,
@@ -218,7 +217,7 @@ runDataflowAnalysis(
   }
 
   auto TypeErasedBlockStates = runTypeErasedDataflowAnalysis(
-      CFCtx, Analysis, InitEnv, PostVisitCFGClosure, MaxBlockVisits);
+      ACFG, Analysis, InitEnv, PostVisitCFGClosure, MaxBlockVisits);
   if (!TypeErasedBlockStates)
     return TypeErasedBlockStates.takeError();
 
@@ -280,8 +279,7 @@ llvm::Expected<llvm::SmallVector<Diagnostic>> diagnoseFunction(
         Diagnoser,
     std::int64_t MaxSATIterations = 1'000'000'000,
     std::int32_t MaxBlockVisits = 20'000) {
-  llvm::Expected<ControlFlowContext> Context =
-      ControlFlowContext::build(FuncDecl);
+  llvm::Expected<AdornedCFG> Context = AdornedCFG::build(FuncDecl);
   if (!Context)
     return Context.takeError();
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
index 98bdf037880ab..909a91059438c 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
@@ -18,8 +18,8 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/TypeOrdering.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/Arena.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
 #include "clang/Analysis/FlowSensitive/Solver.h"
 #include "clang/Analysis/FlowSensitive/StorageLocation.h"
 #include "clang/Analysis/FlowSensitive/Value.h"
@@ -183,9 +183,9 @@ class DataflowAnalysisContext {
   LLVM_DUMP_METHOD void dumpFlowCondition(Atom Token,
                                           llvm::raw_ostream &OS = llvm::dbgs());
 
-  /// Returns the `ControlFlowContext` registered for `F`, if any. Otherwise,
+  /// Returns the `AdornedCFG` registered for `F`, if any. Otherwise,
   /// returns null.
-  const ControlFlowContext *getControlFlowContext(const FunctionDecl *F);
+  const AdornedCFG *getAdornedCFG(const FunctionDecl *F);
 
   const Options &getOptions() { return Opts; }
 
@@ -296,7 +296,7 @@ class DataflowAnalysisContext {
   llvm::DenseMap<Atom, const Formula *> FlowConditionConstraints;
   const Formula *Invariant = nullptr;
 
-  llvm::DenseMap<const FunctionDecl *, ControlFlowContext> FunctionContexts;
+  llvm::DenseMap<const FunctionDecl *, AdornedCFG> FunctionContexts;
 
   // Fields modeled by environments covered by this context.
   FieldSet ModeledFields;
diff --git a/clang/include/clang/Analysis/FlowSensitive/Logger.h b/clang/include/clang/Analysis/FlowSensitive/Logger.h
index f4bd39f6ed49e..f2e64810d0050 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Logger.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Logger.h
@@ -15,7 +15,7 @@
 
 namespace clang::dataflow {
 // Forward declarations so we can use Logger anywhere in the framework.
-class ControlFlowContext;
+class AdornedCFG;
 class TypeErasedDataflowAnalysis;
 struct TypeErasedDataflowAnalysisState;
 
@@ -40,8 +40,8 @@ class Logger {
 
   /// Called by the framework as we start analyzing a new function or statement.
   /// Forms a pair with endAnalysis().
-  virtual void beginAnalysis(const ControlFlowContext &,
-                             TypeErasedDataflowAnalysis &) {}
+  virtual void beginAnalysis(const AdornedCFG &, TypeErasedDataflowAnalysis &) {
+  }
   virtual void endAnalysis() {}
 
   // At any time during the analysis, we're computing the state for some target
diff --git a/clang/include/clang/Analysis/FlowSensitive/RecordOps.h b/clang/include/clang/Analysis/FlowSensitive/RecordOps.h
index 783e53e980aa2..8fad45fc11d81 100644
--- a/clang/include/clang/Analysis/FlowSensitive/RecordOps.h
+++ b/clang/include/clang/Analysis/FlowSensitive/RecordOps.h
@@ -31,7 +31,11 @@ namespace dataflow {
 ///
 /// Requirements:
 ///
-///  `Src` and `Dst` must have the same canonical unqualified type.
+///  Either:
+///    - `Src` and `Dest` must have the same canonical unqualified type, or
+///    - The type of `Src` must be derived from `Dest`, or
+///    - The type of `Dest` must be derived from `Src` (in this case, any fields
+///      that are only present in `Dest` are not overwritten).
 void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst,
                 Environment &Env);
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/Transfer.h b/clang/include/clang/Analysis/FlowSensitive/Transfer.h
index 7713df747cb76..ed148250d8eb2 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Transfer.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Transfer.h
@@ -29,12 +29,12 @@ class StmtToEnvMap {
   // `CurState` is the pending state currently associated with this block. These
   // are supplied separately as the pending state for the current block may not
   // yet be represented in `BlockToState`.
-  StmtToEnvMap(const ControlFlowContext &CFCtx,
+  StmtToEnvMap(const AdornedCFG &ACFG,
                llvm::ArrayRef<std::optional<TypeErasedDataflowAnalysisState>>
                    BlockToState,
                unsigned CurBlockID,
                const TypeErasedDataflowAnalysisState &CurState)
-      : CFCtx(CFCtx), BlockToState(BlockToState), CurBlockID(CurBlockID),
+      : ACFG(ACFG), BlockToState(BlockToState), CurBlockID(CurBlockID),
         CurState(CurState) {}
 
   /// Returns the environment of the basic block that contains `S`.
@@ -42,7 +42,7 @@ class StmtToEnvMap {
   const Environment *getEnvironment(const Stmt &S) const;
 
 private:
-  const ControlFlowContext &CFCtx;
+  const AdornedCFG &ACFG;
   llvm::ArrayRef<std::optional<TypeErasedDataflowAnalysisState>> BlockToState;
   unsigned CurBlockID;
   const TypeErasedDataflowAnalysisState &CurState;
diff --git a/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
index a0ca7440230b0..b3722bf3ec80a 100644
--- a/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
@@ -21,7 +21,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Stmt.h"
 #include "clang/Analysis/CFG.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/DataflowLattice.h"
@@ -146,7 +146,7 @@ struct TypeErasedDataflowAnalysisState {
 /// from converging.
 llvm::Expected<std::vector<std::optional<TypeErasedDataflowAnalysisState>>>
 runTypeErasedDataflowAnalysis(
-    const ControlFlowContext &CFCtx, TypeErasedDataflowAnalysis &Analysis,
+    const AdornedCFG &ACFG, TypeErasedDataflowAnalysis &Analysis,
     const Environment &InitEnv,
     std::function<void(const CFGElement &,
                        const TypeErasedDataflowAnalysisState &)>
diff --git a/clang/include/clang/Basic/AllDiagnostics.h b/clang/include/clang/Basic/AllDiagnostics.h
index cc6aa631534a5..e64634cc138f7 100644
--- a/clang/include/clang/Basic/AllDiagnostics.h
+++ b/clang/include/clang/Basic/AllDiagnostics.h
@@ -20,6 +20,7 @@
 #include "clang/Basic/DiagnosticCrossTU.h"
 #include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Basic/DiagnosticFrontend.h"
+#include "clang/Basic/DiagnosticInstallAPI.h"
 #include "clang/Basic/DiagnosticLex.h"
 #include "clang/Basic/DiagnosticParse.h"
 #include "clang/Basic/DiagnosticSema.h"
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 4473dde0cb50f..7d309d4ddd472 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -3115,6 +3115,15 @@ def TypeNullUnspecified : TypeAttr {
   let Documentation = [TypeNullUnspecifiedDocs];
 }
 
+def CountedBy : DeclOrTypeAttr {
+  let Spellings = [Clang<"counted_by">];
+  let Subjects = SubjectList<[Field], ErrorDiag>;
+  let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel">];
+  let ParseArgumentsAsUnevaluated = 1;
+  let Documentation = [CountedByDocs];
+  let LangOpts = [COnly];
+}
+
 // This is a marker used to indicate that an __unsafe_unretained qualifier was
 // ignored because ARC is not enabled. The usual representation for this
 // qualifier is as an ObjCOwnership attribute with Kind == "none".
@@ -5492,21 +5501,3 @@ def CodeAlign: StmtAttr {
     static constexpr int MaximumAlignment = 4096;
   }];
 }
-
-def CountedBy : InheritableAttr {
-  let Spellings = [Clang<"counted_by">];
-  let Subjects = SubjectList<[Field]>;
-  let Args = [IdentifierArgument<"CountedByField">];
-  let Documentation = [CountedByDocs];
-  let LangOpts = [COnly];
-  // FIXME: This is ugly. Let using a DeclArgument would be nice, but a Decl
-  // isn't yet available due to the fact that we're still parsing the
-  // structure. Maybe that code could be changed sometime in the future.
-  code AdditionalMembers = [{
-    private:
-      SourceRange CountedByFieldLoc;
-    public:
-      SourceRange getCountedByFieldLoc() const { return CountedByFieldLoc; }
-      void setCountedByFieldLoc(SourceRange Loc) { CountedByFieldLoc = Loc; }
-  }];
-}
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
new file mode 100644
index 0000000000000..f356f881d5ef9
--- /dev/null
+++ b/clang/include/clang/Basic/Builtins.def
@@ -0,0 +1,102 @@
+//===--- Builtins.def - Builtin function info database ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This is only documentation for the database layout. This will be removed once
+// all builtin databases are converted to tablegen files
+
+// The second value provided to the macro specifies the type of the function
+// (result value, then each argument) as follows:
+//  v -> void
+//  b -> boolean
+//  c -> char
+//  s -> short
+//  i -> int
+//  h -> half (__fp16, OpenCL)
+//  x -> half (_Float16)
+//  y -> half (__bf16)
+//  f -> float
+//  d -> double
+//  z -> size_t
+//  w -> wchar_t
+//  F -> constant CFString
+//  G -> id
+//  H -> SEL
+//  M -> struct objc_super
+//  a -> __builtin_va_list
+//  A -> "reference" to __builtin_va_list
+//  V -> Vector, followed by the number of elements and the base type.
+//  q -> Scalable vector, followed by the number of elements and the base type.
+//  Q -> target builtin type, followed by a character to distinguish the builtin type
+//    Qa -> AArch64 svcount_t builtin type.
+//  E -> ext_vector, followed by the number of elements and the base type.
+//  X -> _Complex, followed by the base type.
+//  Y -> ptrdiff_t
+//  P -> FILE
+//  J -> jmp_buf
+//  SJ -> sigjmp_buf
+//  K -> ucontext_t
+//  p -> pid_t
+//  . -> "...".  This may only occur at the end of the function list.
+//
+// Types may be prefixed with the following modifiers:
+//  L   -> long (e.g. Li for 'long int', Ld for 'long double')
+//  LL  -> long long (e.g. LLi for 'long long int', LLd for __float128)
+//  LLL -> __int128_t (e.g. LLLi)
+//  Z   -> int32_t (require a native 32-bit integer type on the target)
+//  W   -> int64_t (require a native 64-bit integer type on the target)
+//  N   -> 'int' size if target is LP64, 'L' otherwise.
+//  O   -> long for OpenCL targets, long long otherwise.
+//  S   -> signed
+//  U   -> unsigned
+//  I   -> Required to constant fold to an integer constant expression.
+//
+// Types may be postfixed with the following modifiers:
+// * -> pointer (optionally followed by an address space number, if no address
+//               space is specified than any address space will be accepted)
+// & -> reference (optionally followed by an address space number)
+// C -> const
+// D -> volatile
+// R -> restrict
+
+// The third value provided to the macro specifies information about attributes
+// of the function.  These must be kept in sync with the predicates in the
+// Builtin::Context class.  Currently we have:
+//  n -> nothrow
+//  r -> noreturn
+//  U -> pure
+//  c -> const
+//  t -> signature is meaningless, use custom typechecking
+//  T -> type is not important to semantic analysis and codegen; recognize as
+//       builtin even if type doesn't match signature, and don't warn if we
+//       can't be sure the type is right
+//  F -> this is a libc/libm function with a '__builtin_' prefix added.
+//  f -> this is a libc/libm function without a '__builtin_' prefix, or with
+//       'z', a C++ standard library function in namespace std::. This builtin
+//       is disableable by '-fno-builtin-foo' / '-fno-builtin-std-foo'.
+//  h -> this function requires a specific header or an explicit declaration.
+//  i -> this is a runtime library implemented function without the
+//       '__builtin_' prefix. It will be implemented in compiler-rt or libgcc.
+//  p:N: -> this is a printf-like function whose Nth argument is the format
+//          string.
+//  P:N: -> similar to the p:N: attribute, but the function is like vprintf
+//          in that it accepts its arguments as a va_list rather than
+//          through an ellipsis
+//  s:N: -> this is a scanf-like function whose Nth argument is the format
+//          string.
+//  S:N: -> similar to the s:N: attribute, but the function is like vscanf
+//          in that it accepts its arguments as a va_list rather than
+//          through an ellipsis
+//  e -> const, but only when -fno-math-errno and FP exceptions are ignored
+//  g -> const when FP exceptions are ignored
+//  j -> returns_twice (like setjmp)
+//  u -> arguments are not evaluated for their side-effects
+//  V:N: -> requires vectors of at least N bits to be legal
+//  C<N,M_0,...,M_k> -> callback behavior: argument N is called with argument
+//                      M_0, ..., M_k as payload
+//  z -> this is a function in (possibly-versioned) namespace std
+//  E -> this function can be constant evaluated by Clang frontend
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index fc0b953dd5b67..94238b5ebc27e 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4354,6 +4354,43 @@ def CoroSuspend : CoroLangBuiltin {
   let Prototype = "char(_Constant bool)";
 }
 
+// Pointer authentication builtins.
+def PtrauthStrip : Builtin {
+  let Spellings = ["__builtin_ptrauth_strip"];
+  let Attributes = [CustomTypeChecking, NoThrow, Const];
+  let Prototype = "void*(void*,int)";
+}
+
+def PtrauthBlendDiscriminator : Builtin {
+  let Spellings = ["__builtin_ptrauth_blend_discriminator"];
+  let Attributes = [CustomTypeChecking, NoThrow, Const];
+  let Prototype = "size_t(void*,int)";
+}
+
+def PtrauthSignUnauthenticated : Builtin {
+  let Spellings = ["__builtin_ptrauth_sign_unauthenticated"];
+  let Attributes = [CustomTypeChecking, NoThrow, Const];
+  let Prototype = "void*(void*,int,void*)";
+}
+
+def PtrauthSignGenericData : Builtin {
+  let Spellings = ["__builtin_ptrauth_sign_generic_data"];
+  let Attributes = [CustomTypeChecking, NoThrow, Const];
+  let Prototype = "size_t(void*,void*)";
+}
+
+def PtrauthAuthAndResign : Builtin {
+  let Spellings = ["__builtin_ptrauth_auth_and_resign"];
+  let Attributes = [CustomTypeChecking, NoThrow];
+  let Prototype = "void*(void*,int,void*,int,void*)";
+}
+
+def PtrauthAuth : Builtin {
+  let Spellings = ["__builtin_ptrauth_auth"];
+  let Attributes = [CustomTypeChecking, NoThrow];
+  let Prototype = "void*(void*,int,void*)";
+}
+
 // OpenCL v2.0 s6.13.16, s9.17.3.5 - Pipe functions.
 // We need the generic prototype, since the packet type could be anything.
 def ReadPipe : OCLPipeLangBuiltin {
@@ -4554,6 +4591,12 @@ def HLSLWaveActiveCountBits : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "unsigned int(bool)";
 }
 
+def HLSLClamp : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_clamp"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 def HLSLCreateHandle : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_create_handle"];
   let Attributes = [NoThrow, Const];
@@ -4572,6 +4615,12 @@ def HLSLFrac : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLIsinf : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_isinf"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 def HLSLLerp : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_lerp"];
   let Attributes = [NoThrow, Const];
@@ -4590,6 +4639,12 @@ def HLSLRcp : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLRSqrt : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_rsqrt"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 // Builtins for XRay.
 def XRayCustomEvent : Builtin {
   let Spellings = ["__xray_customevent"];
diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
index b5cbe90c8fd6a..cf8711c6eaee3 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -50,6 +50,9 @@ BUILTIN(__builtin_arm_wfi, "v", "")
 BUILTIN(__builtin_arm_sev, "v", "")
 BUILTIN(__builtin_arm_sevl, "v", "")
 
+// Like __builtin_trap but provide an 16-bit immediate reason code (which goes into `brk #N`).
+BUILTIN(__builtin_arm_trap, "vUIs", "nr")
+
 // CRC32
 TARGET_BUILTIN(__builtin_arm_crc32b, "UiUiUc", "nc", "crc")
 TARGET_BUILTIN(__builtin_arm_crc32cb, "UiUiUc", "nc", "crc")
diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
index 7785fb430c069..7d53c751c13ac 100644
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -12,6 +12,7 @@ clang_diag_gen(Common)
 clang_diag_gen(CrossTU)
 clang_diag_gen(Driver)
 clang_diag_gen(Frontend)
+clang_diag_gen(InstallAPI)
 clang_diag_gen(Lex)
 clang_diag_gen(Parse)
 clang_diag_gen(Refactoring)
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 425d2db6d827c..ddaec93547c08 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -59,6 +59,7 @@ CODEGENOPT(UniqueBasicBlockSectionNames, 1, 1) ///< Set for -funique-basic-block
                                                ///< basic block sections.
 CODEGENOPT(EnableAIXExtendedAltivecABI, 1, 0) ///< Set for -mabi=vec-extabi. Enables the extended Altivec ABI on AIX.
 CODEGENOPT(XCOFFReadOnlyPointers, 1, 0) ///< Set for -mxcoff-roptr.
+CODEGENOPT(AllTocData, 1, 0) ///< AIX -mtocdata
 ENUM_CODEGENOPT(FramePointer, FramePointerKind, 2, FramePointerKind::None) /// frame-pointer: all,non-leaf,none
 
 CODEGENOPT(ClearASTBeforeBackend , 1, 0) ///< Free the AST before running backend code generation. Only works with -disable-free.
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index dae587bc5c647..41fc8053760e3 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -419,9 +419,6 @@ class CodeGenOptions : public CodeGenOptionsBase {
   /// List of global variables that over-ride the toc-data default.
   std::vector<std::string> NoTocDataVars;
 
-  /// Flag for all global variables to be treated as toc-data.
-  bool AllTocData;
-
   /// Path to allowlist file specifying which objects
   /// (files, functions) should exclusively be instrumented
   /// by sanitizer coverage pass.
diff --git a/clang/include/clang/Basic/Diagnostic.td b/clang/include/clang/Basic/Diagnostic.td
index 8d66e265fbaef..0b8b3af939ba0 100644
--- a/clang/include/clang/Basic/Diagnostic.td
+++ b/clang/include/clang/Basic/Diagnostic.td
@@ -162,6 +162,7 @@ include "DiagnosticCommonKinds.td"
 include "DiagnosticCrossTUKinds.td"
 include "DiagnosticDriverKinds.td"
 include "DiagnosticFrontendKinds.td"
+include "DiagnosticInstallAPIKinds.td"
 include "DiagnosticLexKinds.td"
 include "DiagnosticParseKinds.td"
 include "DiagnosticRefactoringKinds.td"
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index 289028967b02c..936e894287a1d 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -359,6 +359,8 @@ def warn_target_unrecognized_env : Warning<
 def warn_knl_knm_isa_support_removed : Warning<
   "KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.">,
   InGroup<DiagGroup<"knl-knm-isa-support-removed">>;
+def err_target_unsupported_abi_with_fpu : Error<
+  "'%0' ABI is not supported with FPU">;
 
 // Source manager
 def err_cannot_open_file : Error<"cannot open file '%0': %1">, DefaultFatal;
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index a45102ddfddc0..62c685bbdaeae 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -281,6 +281,8 @@ def err_builtin_needs_feature : Error<"%0 needs target feature %1">;
 def err_function_needs_feature : Error<
   "always_inline function %1 requires target feature '%2', but would "
   "be inlined into function %0 that is compiled without support for '%2'">;
+
+let CategoryName = "Codegen ABI Check" in {
 def err_function_always_inline_attribute_mismatch : Error<
   "always_inline function %1 and its caller %0 have mismatching %2 attributes">;
 def err_function_always_inline_new_za : Error<
@@ -292,6 +294,7 @@ def warn_avx_calling_convention
       InGroup<DiagGroup<"psabi">>;
 def err_avx_calling_convention : Error<warn_avx_calling_convention.Summary>;
 
+
 def warn_sycl_device_has_aspect_mismatch
     : Warning<"function '%0' uses aspect '%1' not listed in its "
               "%select{'device_has' property|'sycl::device_has' attribute}2">,
@@ -299,6 +302,10 @@ def warn_sycl_device_has_aspect_mismatch
 def note_sycl_aspect_propagated_from_call
     : Note<"propagated from call to function '%0'">, BackendInfo;
 
+def err_target_unsupported_type_for_abi
+    : Error<"%0 requires %1 type support, but ABI '%2' does not support it">;
+}
+
 def err_alias_to_undefined : Error<
   "%select{alias|ifunc}0 must point to a defined "
   "%select{variable or |}1function">;
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index f8007969d748c..4880868a7f2bf 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -886,6 +886,7 @@ def ZeroLengthArray : DiagGroup<"zero-length-array">;
 def GNUZeroLineDirective : DiagGroup<"gnu-zero-line-directive">;
 def GNUZeroVariadicMacroArguments : DiagGroup<"gnu-zero-variadic-macro-arguments">;
 def MisleadingIndentation : DiagGroup<"misleading-indentation">;
+def PtrAuthNullPointers : DiagGroup<"ptrauth-null-pointers">;
 
 // This covers both the deprecated case (in C++98)
 // and the extension case (in C++11 onwards).
@@ -1533,3 +1534,7 @@ def ReadOnlyPlacementChecks : DiagGroup<"read-only-types">;
 // Warnings and fixes to support the "safe buffers" programming model.
 def UnsafeBufferUsageInContainer : DiagGroup<"unsafe-buffer-usage-in-container">;
 def UnsafeBufferUsage : DiagGroup<"unsafe-buffer-usage", [UnsafeBufferUsageInContainer]>;
+
+// Warnings and notes InstallAPI verification.
+def InstallAPIViolation : DiagGroup<"installapi-violation">;
+
diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h
index c2fc7a62280f4..605ef648b7cfe 100644
--- a/clang/include/clang/Basic/DiagnosticIDs.h
+++ b/clang/include/clang/Basic/DiagnosticIDs.h
@@ -42,6 +42,7 @@ namespace clang {
       DIAG_SIZE_SEMA          = 5000,
       DIAG_SIZE_ANALYSIS      =  100,
       DIAG_SIZE_REFACTORING   = 1000,
+      DIAG_SIZE_INSTALLAPI    =  100,
     };
     // Start position for diagnostics.
     enum {
@@ -57,7 +58,8 @@ namespace clang {
       DIAG_START_SEMA          = DIAG_START_CROSSTU       + static_cast<int>(DIAG_SIZE_CROSSTU),
       DIAG_START_ANALYSIS      = DIAG_START_SEMA          + static_cast<int>(DIAG_SIZE_SEMA),
       DIAG_START_REFACTORING   = DIAG_START_ANALYSIS      + static_cast<int>(DIAG_SIZE_ANALYSIS),
-      DIAG_UPPER_LIMIT         = DIAG_START_REFACTORING   + static_cast<int>(DIAG_SIZE_REFACTORING)
+      DIAG_START_INSTALLAPI    = DIAG_START_REFACTORING   + static_cast<int>(DIAG_SIZE_REFACTORING),
+      DIAG_UPPER_LIMIT         = DIAG_START_INSTALLAPI    + static_cast<int>(DIAG_SIZE_INSTALLAPI)
     };
 
     class CustomDiagInfo;
@@ -276,6 +278,9 @@ class DiagnosticIDs : public RefCountedBase<DiagnosticIDs> {
   /// category.
   static bool isARCDiagnostic(unsigned DiagID);
 
+  /// Return true if a given diagnostic is a codegen-time ABI check.
+  static bool isCodegenABICheckDiagnostic(unsigned DiagID);
+
   /// Enumeration describing how the emission of a diagnostic should
   /// be treated when it occurs during C++ template argument deduction.
   enum SFINAEResponse {
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPI.h b/clang/include/clang/Basic/DiagnosticInstallAPI.h
new file mode 100644
index 0000000000000..a76f6e087a2b0
--- /dev/null
+++ b/clang/include/clang/Basic/DiagnosticInstallAPI.h
@@ -0,0 +1,26 @@
+//===--- DiagnosticInstallAPI.h - Diagnostics for InstallAPI-----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H
+#define LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H
+
+#include "clang/Basic/Diagnostic.h"
+namespace clang {
+namespace diag {
+enum {
+#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR,      \
+             SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY)            \
+  ENUM,
+#define INSTALLAPISTART
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
+#undef DIAG
+  NUM_BUILTIN_INSTALLAPI_DIAGNOSTICS
+};
+} // namespace diag
+} // namespace clang
+#endif // LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
new file mode 100644
index 0000000000000..f99a5fca64cb4
--- /dev/null
+++ b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
@@ -0,0 +1,43 @@
+//==--- DiagnosticInstallAPIKinds.td - installapi diagnostics -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// InstallAPI Diagnostics
+//===----------------------------------------------------------------------===//
+
+let Component = "InstallAPI" in {
+let CategoryName = "Command line" in {
+def err_cannot_write_file : Error<"cannot write file '%0': %1">;
+def err_no_install_name : Error<"no install name specified: add -install_name <path>">;
+def err_no_output_file: Error<"no output file specified">;
+} // end of command line category.
+
+let CategoryName = "Verification" in {
+def warn_target: Warning<"violations found for %0">, InGroup<InstallAPIViolation>;
+def err_library_missing_symbol : Error<"declaration has external linkage, but dynamic library doesn't have symbol '%0'">;
+def warn_library_missing_symbol : Warning<"declaration has external linkage, but dynamic library doesn't have symbol '%0'">, InGroup<InstallAPIViolation>;
+def err_library_hidden_symbol : Error<"declaration has external linkage, but symbol has internal linkage in dynamic library '%0'">;
+def warn_library_hidden_symbol : Warning<"declaration has external linkage, but symbol has internal linkage in dynamic library '%0'">, InGroup<InstallAPIViolation>;
+def warn_header_hidden_symbol : Warning<"symbol exported in dynamic library, but marked hidden in declaration '%0'">, InGroup<InstallAPIViolation>;
+def err_header_hidden_symbol : Error<"symbol exported in dynamic library, but marked hidden in declaration '%0'">;
+def err_header_symbol_missing : Error<"no declaration found for exported symbol '%0' in dynamic library">;
+def warn_header_availability_mismatch : Warning<"declaration '%0' is marked %select{available|unavailable}1,"
+  " but symbol is %select{not |}2exported in dynamic library">, InGroup<InstallAPIViolation>;
+def err_header_availability_mismatch : Error<"declaration '%0' is marked %select{available|unavailable}1,"
+  " but symbol is %select{not |}2exported in dynamic library">;
+def warn_dylib_symbol_flags_mismatch : Warning<"dynamic library symbol '%0' is "
+  "%select{weak defined|thread local}1, but its declaration is not">, InGroup<InstallAPIViolation>;
+def warn_header_symbol_flags_mismatch : Warning<"declaration '%0' is "
+  "%select{weak defined|thread local}1, but symbol is not in dynamic library">, InGroup<InstallAPIViolation>;
+def err_dylib_symbol_flags_mismatch : Error<"dynamic library symbol '%0' is "
+  "%select{weak defined|thread local}1, but its declaration is not">;
+def err_header_symbol_flags_mismatch : Error<"declaration '%0' is "
+  "%select{weak defined|thread local}1, but symbol is not in dynamic library">;
+} // end of Verification category.
+
+} // end of InstallAPI component
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3094138b0717d..2595d2cfb6baf 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -976,6 +976,22 @@ def warn_fortify_scanf_overflow : Warning<
 def err_function_start_invalid_type: Error<
   "argument must be a function">;
 
+def err_ptrauth_disabled :
+  Error<"this target does not support pointer authentication">;
+def err_ptrauth_invalid_key :
+  Error<"%0 does not identify a valid pointer authentication key for "
+        "the current target">;
+def err_ptrauth_value_bad_type :
+  Error<"%select{signed value|extra discriminator|blended pointer|blended "
+        "integer}0 must have %select{pointer|integer|pointer or integer}1 "
+        "type; type here is %2">;
+def warn_ptrauth_sign_null_pointer :
+  Warning<"signing a null pointer will yield a non-null pointer">,
+  InGroup<PtrAuthNullPointers>;
+def warn_ptrauth_auth_null_pointer :
+  Warning<"authenticating a null pointer will almost certainly trap">,
+  InGroup<PtrAuthNullPointers>;
+
 /// main()
 // static main() is not an error in C, just in C++.
 def warn_static_main : Warning<"'main' should not be declared static">,
@@ -6585,12 +6601,18 @@ def err_flexible_array_count_not_in_same_struct : Error<
   "'counted_by' field %0 isn't within the same struct as the flexible array">;
 def err_counted_by_attr_not_on_flexible_array_member : Error<
   "'counted_by' only applies to C99 flexible array members">;
-def err_counted_by_attr_refers_to_flexible_array : Error<
-  "'counted_by' cannot refer to the flexible array %0">;
+def err_counted_by_attr_refer_to_itself : Error<
+  "'counted_by' cannot refer to the flexible array member %0">;
 def err_counted_by_must_be_in_structure : Error<
   "field %0 in 'counted_by' not inside structure">;
-def err_flexible_array_counted_by_attr_field_not_integer : Error<
-  "field %0 in 'counted_by' must be a non-boolean integer type">;
+def err_counted_by_attr_argument_not_integer : Error<
+  "'counted_by' requires a non-boolean integer type argument">;
+def err_counted_by_attr_only_support_simple_decl_reference : Error<
+  "'counted_by' argument must be a simple declaration reference">;
+def err_counted_by_attr_in_union : Error<
+  "'counted_by' cannot be applied to a union member">;
+def err_counted_by_attr_refer_to_union : Error<
+  "'counted_by' argument cannot refer to a union member">;
 def note_flexible_array_counted_by_attr_field : Note<
   "field %0 declared here">;
 
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index 5fad5fc3623cb..eeed5f4751f2f 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -101,6 +101,7 @@ FEATURE(memory_sanitizer,
 FEATURE(thread_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Thread))
 FEATURE(dataflow_sanitizer, LangOpts.Sanitize.has(SanitizerKind::DataFlow))
 FEATURE(scudo, LangOpts.Sanitize.hasOneOf(SanitizerKind::Scudo))
+FEATURE(ptrauth_intrinsics, LangOpts.PointerAuthIntrinsics)
 FEATURE(swiftasynccc,
   PP.getTargetInfo().checkCallingConvention(CC_SwiftAsync) ==
   clang::TargetInfo::CCCR_OK)
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 7fe80879f7664..a068796b2811d 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -161,6 +161,8 @@ LANGOPT(DllExportInlines  , 1, 1, "dllexported classes dllexport inline methods"
 LANGOPT(RelaxedTemplateTemplateArgs, 1, 0, "C++17 relaxed matching of template template arguments")
 LANGOPT(ExperimentalLibrary, 1, 0, "enable unstable and experimental library features")
 
+LANGOPT(PointerAuthIntrinsics, 1, 0, "pointer authentication intrinsics")
+
 LANGOPT(DoubleSquareBracketAttributes, 1, 0, "'[[]]' attributes extension for all language standard modes")
 
 COMPATIBLE_LANGOPT(RecoveryAST, 1, 1, "Preserve expressions in AST when encountering errors")
diff --git a/clang/include/clang/Basic/LangStandard.h b/clang/include/clang/Basic/LangStandard.h
index bc49669a82ad2..199e24c673160 100644
--- a/clang/include/clang/Basic/LangStandard.h
+++ b/clang/include/clang/Basic/LangStandard.h
@@ -139,6 +139,7 @@ struct LangStandard {
   bool isOpenCL() const { return Flags & OpenCL; }
 
   static Kind getLangKind(StringRef Name);
+  static Kind getHLSLLangKind(StringRef Name);
   static const LangStandard &getLangStandardForKind(Kind K);
   static const LangStandard *getLangStandardForName(StringRef Name);
 };
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 7682f84e491c7..374595edd2ce4 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -24,6 +24,7 @@
 #include "clang/Basic/TargetOptions.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/SmallSet.h"
@@ -1571,6 +1572,11 @@ class TargetInfo : public TransferrableTargetInfo,
     return getAddressSpaceMap()[(unsigned)AS];
   }
 
+  /// Determine whether the given pointer-authentication key is valid.
+  ///
+  /// The value has been coerced to type 'int'.
+  virtual bool validatePointerAuthKey(const llvm::APSInt &value) const;
+
   /// Map from the address space field in builtin description strings to the
   /// language address space.
   virtual LangAS getOpenCLBuiltinAddressSpace(unsigned AS) const {
diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td
index 6419762417371..3625f06375891 100644
--- a/clang/include/clang/Basic/TypeNodes.td
+++ b/clang/include/clang/Basic/TypeNodes.td
@@ -108,6 +108,8 @@ def ObjCTypeParamType : TypeNode<Type>, NeverCanonical;
 def ObjCObjectType : TypeNode<Type>;
 def ObjCInterfaceType : TypeNode<ObjCObjectType>, LeafType;
 def ObjCObjectPointerType : TypeNode<Type>;
+def BoundsAttributedType : TypeNode<Type, 1>;
+def CountAttributedType : TypeNode<BoundsAttributedType>, NeverCanonical;
 def PipeType : TypeNode<Type>;
 def AtomicType : TypeNode<Type>;
 def BitIntType : TypeNode<Type>;
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 413681e55c74c..31a7ea8778040 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -28,8 +28,8 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/StringSaver.h"
 
-#include <list>
 #include <map>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -625,6 +625,16 @@ class Driver {
   // FIXME: This should be in CompilationInfo.
   std::string GetProgramPath(StringRef Name, const ToolChain &TC) const;
 
+  /// Lookup the path to the Standard library module manifest.
+  ///
+  /// \param C - The compilation.
+  /// \param TC - The tool chain for additional information on
+  /// directories to search.
+  //
+  // FIXME: This should be in CompilationInfo.
+  std::string GetStdModuleManifestPath(const Compilation &C,
+                                       const ToolChain &TC) const;
+
   /// HandleAutocompletions - Handle --autocomplete by searching and printing
   /// possible flags, descriptions, and its arguments.
   void HandleAutocompletions(StringRef PassedFlags) const;
@@ -1007,6 +1017,13 @@ llvm::Error expandResponseFiles(SmallVectorImpl<const char *> &Args,
                                 bool ClangCLMode, llvm::BumpPtrAllocator &Alloc,
                                 llvm::vfs::FileSystem *FS = nullptr);
 
+/// Apply a space separated list of edits to the input argument lists.
+/// See applyOneOverrideOption.
+void applyOverrideOptions(SmallVectorImpl<const char *> &Args,
+                          const char *OverrideOpts,
+                          llvm::StringSet<> &SavedStrings,
+                          raw_ostream *OS = nullptr);
+
 } // end namespace driver
 } // end namespace clang
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1176cfd4bf10e..b47bea9f00d3d 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4392,6 +4392,14 @@ defm strict_return : BoolFOption<"strict-return",
             " of a non-void function as unreachable">,
   PosFlag<SetTrue>>;
 
+let Group = f_Group in {
+  let Visibility = [ClangOption,CC1Option] in {
+    def fptrauth_intrinsics : Flag<["-"], "fptrauth-intrinsics">,
+      HelpText<"Enable pointer authentication intrinsics">;
+  }
+  def fno_ptrauth_intrinsics : Flag<["-"], "fno-ptrauth-intrinsics">;
+}
+
 def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
     Visibility<[ClangOption, CC1Option]>,
     HelpText<"Enable matrix data type and related builtin functions">,
@@ -4982,21 +4990,17 @@ def mrvv_vector_bits_EQ : Joined<["-"], "mrvv-vector-bits=">, Group<m_Group>,
     " (RISC-V only)")>;
 
 def munaligned_access : Flag<["-"], "munaligned-access">, Group<m_Group>,
-  HelpText<"Allow memory accesses to be unaligned (AArch32/AArch64/LoongArch/RISC-V only)">;
+  HelpText<"Allow memory accesses to be unaligned (AArch32/MIPSr6 only)">;
 def mno_unaligned_access : Flag<["-"], "mno-unaligned-access">, Group<m_Group>,
-  HelpText<"Force all memory accesses to be aligned (AArch32/AArch64/LoongArch/RISC-V only)">;
+  HelpText<"Force all memory accesses to be aligned (AArch32/MIPSr6 only)">;
 def munaligned_symbols : Flag<["-"], "munaligned-symbols">, Group<m_Group>,
   HelpText<"Expect external char-aligned symbols to be without ABI alignment (SystemZ only)">;
 def mno_unaligned_symbols : Flag<["-"], "mno-unaligned-symbols">, Group<m_Group>,
   HelpText<"Expect external char-aligned symbols to be without ABI alignment (SystemZ only)">;
-} // let Flags = [TargetSpecific]
-def mstrict_align : Flag<["-"], "mstrict-align">, Alias<mno_unaligned_access>,
-  Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
-  HelpText<"Force all memory accesses to be aligned (same as mno-unaligned-access)">;
-def mno_strict_align : Flag<["-"], "mno-strict-align">, Alias<munaligned_access>,
-  Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
-  HelpText<"Allow memory accesses to be unaligned (same as munaligned-access)">;
-let Flags = [TargetSpecific] in {
+def mstrict_align : Flag<["-"], "mstrict-align">,
+  HelpText<"Force all memory accesses to be aligned (AArch64/LoongArch/RISC-V only)">;
+def mno_strict_align : Flag<["-"], "mno-strict-align">,
+  HelpText<"Allow memory accesses to be unaligned (AArch64/LoongArch/RISC-V only)">;
 def mno_thumb : Flag<["-"], "mno-thumb">, Group<m_arm_Features_Group>;
 def mrestrict_it: Flag<["-"], "mrestrict-it">, Group<m_arm_Features_Group>,
   HelpText<"Disallow generation of complex IT blocks. It is off by default.">;
@@ -5683,6 +5687,9 @@ def print_resource_dir : Flag<["-", "--"], "print-resource-dir">,
 def print_search_dirs : Flag<["-", "--"], "print-search-dirs">,
   HelpText<"Print the paths used for finding libraries and programs">,
   Visibility<[ClangOption, CLOption]>;
+def print_std_module_manifest_path : Flag<["-", "--"], "print-library-module-manifest-path">,
+  HelpText<"Print the path for the C++ Standard library module manifest">,
+  Visibility<[ClangOption, CLOption]>;
 def print_targets : Flag<["-", "--"], "print-targets">,
   HelpText<"Print the registered targets">,
   Visibility<[ClangOption, CLOption]>;
@@ -6764,11 +6771,6 @@ def flang_deprecated_no_hlfir : Flag<["-"], "flang-deprecated-no-hlfir">,
   Flags<[HelpHidden]>, Visibility<[FlangOption, FC1Option]>,
   HelpText<"Do not use HLFIR lowering (deprecated)">;
 
-def flang_experimental_polymorphism : Flag<["-"], "flang-experimental-polymorphism">,
-  Flags<[HelpHidden]>, Visibility<[FlangOption, FC1Option]>,
-  HelpText<"Enable Fortran 2003 polymorphism (experimental)">;
-
-
 //===----------------------------------------------------------------------===//
 // FLangOption + Visibility<[FlangOption, CLOption, DXCOption] + NoXarchOption
 //===----------------------------------------------------------------------===//
@@ -8984,6 +8986,11 @@ def dxc_entrypoint : Option<["--", "/", "-"], "E", KIND_JOINED_OR_SEPARATE>,
                      Group<dxc_Group>,
                      Visibility<[DXCOption]>,
                      HelpText<"Entry point name">;
+def dxc_hlsl_version : Option<["/", "-"], "HV", KIND_JOINED_OR_SEPARATE>,
+                     Group<dxc_Group>,
+                     Visibility<[DXCOption]>,
+                     HelpText<"HLSL Version">,
+                     Values<"2016, 2017, 2018, 2021, 202x">;
 def dxc_validator_path_EQ : Joined<["--"], "dxv-path=">, Group<dxc_Group>,
   HelpText<"DXIL validator installation path">;
 def dxc_disable_validation : DXCFlag<"Vd">,
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index d095f5ecd5a3b..d984c69140a47 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -599,7 +599,7 @@ class ToolChain {
 
   // Return the DWARF version to emit, in the absence of arguments
   // to the contrary.
-  virtual unsigned GetDefaultDwarfVersion() const;
+  virtual unsigned GetDefaultDwarfVersion() const { return 5; }
 
   // Some toolchains may have different restrictions on the DWARF version and
   // may need to adjust it. E.g. NVPTX may need to enforce DWARF2 even when host
diff --git a/clang/include/clang/Format/.clang-format b/clang/include/clang/Format/.clang-format
index f95602cab0f7f..d7331b3c8cf02 100644
--- a/clang/include/clang/Format/.clang-format
+++ b/clang/include/clang/Format/.clang-format
@@ -1,6 +1 @@
-BasedOnStyle: LLVM
-InsertBraces: true
-InsertNewlineAtEOF: true
-LineEnding: LF
-RemoveBracesLLVM: true
-RemoveParentheses: ReturnStatement
+BasedOnStyle: clang-format
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 590297fd89a39..7ad2579bf7773 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -4728,6 +4728,60 @@ struct FormatStyle {
   /// \version 8
   std::vector<std::string> StatementMacros;
 
+  /// Works only when TableGenBreakInsideDAGArg is not DontBreak.
+  /// The string list needs to consist of identifiers in TableGen.
+  /// If any identifier is specified, this limits the line breaks by
+  /// TableGenBreakInsideDAGArg option only on DAGArg values beginning with
+  /// the specified identifiers.
+  ///
+  /// For example the configuration,
+  /// \code{.yaml}
+  ///   TableGenBreakInsideDAGArg: BreakAll
+  ///   TableGenBreakingDAGArgOperators: ['ins', 'outs']
+  /// \endcode
+  ///
+  /// makes the line break only occurs inside DAGArgs beginning with the
+  /// specified identifiers 'ins' and 'outs'.
+  ///
+  /// \code
+  ///   let DAGArgIns = (ins
+  ///       i32:$src1,
+  ///       i32:$src2
+  ///   );
+  ///   let DAGArgOtherID = (other i32:$other1, i32:$other2);
+  ///   let DAGArgBang = (!cast<SomeType>("Some") i32:$src1, i32:$src2)
+  /// \endcode
+  /// \version 19
+  std::vector<std::string> TableGenBreakingDAGArgOperators;
+
+  /// Different ways to control the format inside TableGen DAGArg.
+  enum DAGArgStyle : int8_t {
+    /// Never break inside DAGArg.
+    /// \code
+    ///   let DAGArgIns = (ins i32:$src1, i32:$src2);
+    /// \endcode
+    DAS_DontBreak,
+    /// Break inside DAGArg after each list element but for the last.
+    /// This aligns to the first element.
+    /// \code
+    ///   let DAGArgIns = (ins i32:$src1,
+    ///                        i32:$src2);
+    /// \endcode
+    DAS_BreakElements,
+    /// Break inside DAGArg after the operator and the all elements.
+    /// \code
+    ///   let DAGArgIns = (ins
+    ///       i32:$src1,
+    ///       i32:$src2
+    ///   );
+    /// \endcode
+    DAS_BreakAll,
+  };
+
+  /// The styles of the line break inside the DAGArg in TableGen.
+  /// \version 19
+  DAGArgStyle TableGenBreakInsideDAGArg;
+
   /// The number of columns used for tab stops.
   /// \version 3.7
   unsigned TabWidth;
@@ -4980,9 +5034,12 @@ struct FormatStyle {
            SpacesInSquareBrackets == R.SpacesInSquareBrackets &&
            Standard == R.Standard &&
            StatementAttributeLikeMacros == R.StatementAttributeLikeMacros &&
-           StatementMacros == R.StatementMacros && TabWidth == R.TabWidth &&
-           TypeNames == R.TypeNames && TypenameMacros == R.TypenameMacros &&
-           UseTab == R.UseTab &&
+           StatementMacros == R.StatementMacros &&
+           TableGenBreakingDAGArgOperators ==
+               R.TableGenBreakingDAGArgOperators &&
+           TableGenBreakInsideDAGArg == R.TableGenBreakInsideDAGArg &&
+           TabWidth == R.TabWidth && TypeNames == R.TypeNames &&
+           TypenameMacros == R.TypenameMacros && UseTab == R.UseTab &&
            VerilogBreakBetweenInstancePorts ==
                R.VerilogBreakBetweenInstancePorts &&
            WhitespaceSensitiveMacros == R.WhitespaceSensitiveMacros;
diff --git a/clang/include/clang/InstallAPI/Context.h b/clang/include/clang/InstallAPI/Context.h
index bdb576d7d85fb..54e517544b8ed 100644
--- a/clang/include/clang/InstallAPI/Context.h
+++ b/clang/include/clang/InstallAPI/Context.h
@@ -11,6 +11,7 @@
 
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/FileManager.h"
+#include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/HeaderFile.h"
 #include "clang/InstallAPI/MachO.h"
 #include "llvm/ADT/DenseMap.h"
@@ -45,6 +46,9 @@ struct InstallAPIContext {
   /// DiagnosticsEngine for all error reporting.
   DiagnosticsEngine *Diags = nullptr;
 
+  /// Verifier when binary dylib is passed as input.
+  std::unique_ptr<DylibVerifier> Verifier = nullptr;
+
   /// File Path of output location.
   llvm::StringRef OutputLoc{};
 
diff --git a/clang/include/clang/InstallAPI/DylibVerifier.h b/clang/include/clang/InstallAPI/DylibVerifier.h
new file mode 100644
index 0000000000000..8269715c7f234
--- /dev/null
+++ b/clang/include/clang/InstallAPI/DylibVerifier.h
@@ -0,0 +1,149 @@
+//===- InstallAPI/DylibVerifier.h -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_INSTALLAPI_DYLIBVERIFIER_H
+#define LLVM_CLANG_INSTALLAPI_DYLIBVERIFIER_H
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/InstallAPI/MachO.h"
+
+namespace clang {
+namespace installapi {
+struct FrontendAttrs;
+
+/// A list of InstallAPI verification modes.
+enum class VerificationMode {
+  Invalid,
+  ErrorsOnly,
+  ErrorsAndWarnings,
+  Pedantic,
+};
+
+/// Service responsible to tracking state of verification across the
+/// lifetime of InstallAPI.
+/// As declarations are collected during AST traversal, they are
+/// compared as symbols against what is available in the binary dylib.
+class DylibVerifier {
+private:
+  struct SymbolContext;
+
+public:
+  enum class Result { NoVerify, Ignore, Valid, Invalid };
+  struct VerifierContext {
+    // Current target being verified against the AST.
+    llvm::MachO::Target Target;
+
+    // Target specific API from binary.
+    RecordsSlice *DylibSlice = nullptr;
+
+    // Query state of verification after AST has been traversed.
+    Result FrontendState = Result::Ignore;
+
+    // First error for AST traversal, which is tied to the target triple.
+    bool DiscoveredFirstError = false;
+
+    // Determines what kind of banner to print a violation for.
+    bool PrintArch = false;
+
+    // Engine for reporting violations.
+    DiagnosticsEngine *Diag = nullptr;
+
+    // Handle diagnostics reporting for target level violations.
+    void emitDiag(llvm::function_ref<void()> Report);
+
+    VerifierContext() = default;
+    VerifierContext(DiagnosticsEngine *Diag) : Diag(Diag) {}
+  };
+
+  DylibVerifier() = default;
+
+  DylibVerifier(llvm::MachO::Records &&Dylib, DiagnosticsEngine *Diag,
+                VerificationMode Mode, bool Demangle)
+      : Dylib(std::move(Dylib)), Mode(Mode), Demangle(Demangle),
+        Exports(std::make_unique<SymbolSet>()), Ctx(VerifierContext{Diag}) {}
+
+  Result verify(GlobalRecord *R, const FrontendAttrs *FA);
+  Result verify(ObjCInterfaceRecord *R, const FrontendAttrs *FA);
+  Result verify(ObjCIVarRecord *R, const FrontendAttrs *FA,
+                const StringRef SuperClass);
+
+  /// Initialize target for verification.
+  void setTarget(const Target &T);
+
+  /// Release ownership over exports.
+  std::unique_ptr<SymbolSet> getExports() { return std::move(Exports); }
+
+  /// Get result of verification.
+  Result getState() const { return Ctx.FrontendState; }
+
+  /// Set different source managers to the same diagnostics engine.
+  void setSourceManager(SourceManager &SourceMgr) const {
+    if (!Ctx.Diag)
+      return;
+    Ctx.Diag->setSourceManager(&SourceMgr);
+  }
+
+private:
+  /// Determine whether to compare declaration to symbol in binary.
+  bool canVerify();
+
+  /// Shared implementation for verifying exported symbols.
+  Result verifyImpl(Record *R, SymbolContext &SymCtx);
+
+  /// Check if declaration is marked as obsolete, they are
+  // expected to result in a symbol mismatch.
+  bool shouldIgnoreObsolete(const Record *R, SymbolContext &SymCtx,
+                            const Record *DR);
+
+  /// Compare the visibility declarations to the linkage of symbol found in
+  /// dylib.
+  Result compareVisibility(const Record *R, SymbolContext &SymCtx,
+                           const Record *DR);
+
+  /// An ObjCInterfaceRecord can represent up to three symbols. When verifying,
+  // account for this granularity.
+  bool compareObjCInterfaceSymbols(const Record *R, SymbolContext &SymCtx,
+                                   const ObjCInterfaceRecord *DR);
+
+  /// Validate availability annotations against dylib.
+  Result compareAvailability(const Record *R, SymbolContext &SymCtx,
+                             const Record *DR);
+
+  /// Compare and validate matching symbol flags.
+  bool compareSymbolFlags(const Record *R, SymbolContext &SymCtx,
+                          const Record *DR);
+
+  /// Update result state on each call to `verify`.
+  void updateState(Result State);
+
+  /// Add verified exported symbol.
+  void addSymbol(const Record *R, SymbolContext &SymCtx,
+                 TargetList &&Targets = {});
+
+  /// Find matching dylib slice for target triple that is being parsed.
+  void assignSlice(const Target &T);
+
+  // Symbols in dylib.
+  llvm::MachO::Records Dylib;
+
+  // Controls what class of violations to report.
+  VerificationMode Mode = VerificationMode::Invalid;
+
+  // Attempt to demangle when reporting violations.
+  bool Demangle = false;
+
+  // Valid symbols in final text file.
+  std::unique_ptr<SymbolSet> Exports = std::make_unique<SymbolSet>();
+
+  // Track current state of verification while traversing AST.
+  VerifierContext Ctx;
+};
+
+} // namespace installapi
+} // namespace clang
+#endif // LLVM_CLANG_INSTALLAPI_DYLIBVERIFIER_H
diff --git a/clang/include/clang/InstallAPI/Frontend.h b/clang/include/clang/InstallAPI/Frontend.h
index 873cb50d60a54..5cccd891c5809 100644
--- a/clang/include/clang/InstallAPI/Frontend.h
+++ b/clang/include/clang/InstallAPI/Frontend.h
@@ -14,10 +14,10 @@
 #define LLVM_CLANG_INSTALLAPI_FRONTEND_H
 
 #include "clang/AST/ASTConsumer.h"
-#include "clang/AST/Availability.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/InstallAPI/Context.h"
+#include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/Visitor.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -35,6 +35,8 @@ class InstallAPIAction : public ASTFrontendAction {
 
   std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
                                                  StringRef InFile) override {
+    Ctx.Diags->getClient()->BeginSourceFile(CI.getLangOpts());
+    Ctx.Verifier->setSourceManager(CI.getSourceManager());
     return std::make_unique<InstallAPIVisitor>(
         CI.getASTContext(), Ctx, CI.getSourceManager(), CI.getPreprocessor());
   }
diff --git a/clang/include/clang/InstallAPI/FrontendRecords.h b/clang/include/clang/InstallAPI/FrontendRecords.h
index 1f5bc37798bef..59271e81e230c 100644
--- a/clang/include/clang/InstallAPI/FrontendRecords.h
+++ b/clang/include/clang/InstallAPI/FrontendRecords.h
@@ -43,13 +43,13 @@ class FrontendRecordsSlice : public llvm::MachO::RecordsSlice {
   /// \param Flags The flags that describe attributes of the symbol.
   /// \param Inlined Whether declaration is inlined, only applicable to
   /// functions.
-  /// \return The non-owning pointer to added record in slice.
-  GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage,
-                          GlobalRecord::Kind GV,
-                          const clang::AvailabilityInfo Avail, const Decl *D,
-                          const HeaderType Access,
-                          SymbolFlags Flags = SymbolFlags::None,
-                          bool Inlined = false);
+  /// \return The non-owning pointer to added record in slice with it's frontend
+  /// attributes.
+  std::pair<GlobalRecord *, FrontendAttrs *>
+  addGlobal(StringRef Name, RecordLinkage Linkage, GlobalRecord::Kind GV,
+            const clang::AvailabilityInfo Avail, const Decl *D,
+            const HeaderType Access, SymbolFlags Flags = SymbolFlags::None,
+            bool Inlined = false);
 
   /// Add ObjC Class record with attributes from AST.
   ///
@@ -60,11 +60,12 @@ class FrontendRecordsSlice : public llvm::MachO::RecordsSlice {
   /// \param D The pointer to the declaration from traversing AST.
   /// \param Access The intended access level of symbol.
   /// \param IsEHType Whether declaration has an exception attribute.
-  /// \return The non-owning pointer to added record in slice.
-  ObjCInterfaceRecord *addObjCInterface(StringRef Name, RecordLinkage Linkage,
-                                        const clang::AvailabilityInfo Avail,
-                                        const Decl *D, HeaderType Access,
-                                        bool IsEHType);
+  /// \return The non-owning pointer to added record in slice with it's frontend
+  /// attributes.
+  std::pair<ObjCInterfaceRecord *, FrontendAttrs *>
+  addObjCInterface(StringRef Name, RecordLinkage Linkage,
+                   const clang::AvailabilityInfo Avail, const Decl *D,
+                   HeaderType Access, bool IsEHType);
 
   /// Add ObjC Category record with attributes from AST.
   ///
@@ -75,11 +76,12 @@ class FrontendRecordsSlice : public llvm::MachO::RecordsSlice {
   /// to the active target triple.
   /// \param D The pointer to the declaration from traversing AST.
   /// \param Access The intended access level of symbol.
-  /// \return The non-owning pointer to added record in slice.
-  ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend,
-                                      StringRef CategoryName,
-                                      const clang::AvailabilityInfo Avail,
-                                      const Decl *D, HeaderType Access);
+  /// \return The non-owning pointer to added record in slice with it's frontend
+  /// attributes.
+  std::pair<ObjCCategoryRecord *, FrontendAttrs *>
+  addObjCCategory(StringRef ClassToExtend, StringRef CategoryName,
+                  const clang::AvailabilityInfo Avail, const Decl *D,
+                  HeaderType Access);
 
   /// Add ObjC IVar record with attributes from AST.
   ///
@@ -91,12 +93,13 @@ class FrontendRecordsSlice : public llvm::MachO::RecordsSlice {
   /// \param D The pointer to the declaration from traversing AST.
   /// \param Access The intended access level of symbol.
   /// \param AC The access control tied to the ivar declaration.
-  /// \return The non-owning pointer to added record in slice.
-  ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container,
-                              StringRef IvarName, RecordLinkage Linkage,
-                              const clang::AvailabilityInfo Avail,
-                              const Decl *D, HeaderType Access,
-                              const clang::ObjCIvarDecl::AccessControl AC);
+  /// \return The non-owning pointer to added record in slice with it's frontend
+  /// attributes.
+  std::pair<ObjCIVarRecord *, FrontendAttrs *>
+  addObjCIVar(ObjCContainerRecord *Container, StringRef IvarName,
+              RecordLinkage Linkage, const clang::AvailabilityInfo Avail,
+              const Decl *D, HeaderType Access,
+              const clang::ObjCIvarDecl::AccessControl AC);
 
 private:
   /// Mapping of records stored in slice to their frontend attributes.
diff --git a/clang/include/clang/InstallAPI/InstallAPIDiagnostic.h b/clang/include/clang/InstallAPI/InstallAPIDiagnostic.h
new file mode 100644
index 0000000000000..547fb0bcf9a89
--- /dev/null
+++ b/clang/include/clang/InstallAPI/InstallAPIDiagnostic.h
@@ -0,0 +1,14 @@
+//===--- InstallAPIDiagnostic.h - Diagnostics for InstallAPI ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_INSTALLAPI_INSTALLAPIDIAGNOSTIC_H
+#define LLVM_CLANG_INSTALLAPI_INSTALLAPIDIAGNOSTIC_H
+
+#include "clang/Basic/DiagnosticInstallAPI.h"
+
+#endif
diff --git a/clang/include/clang/InstallAPI/MachO.h b/clang/include/clang/InstallAPI/MachO.h
index 55e5591389ce1..a77766511fa3e 100644
--- a/clang/include/clang/InstallAPI/MachO.h
+++ b/clang/include/clang/InstallAPI/MachO.h
@@ -18,6 +18,7 @@
 #include "llvm/TextAPI/PackedVersion.h"
 #include "llvm/TextAPI/Platform.h"
 #include "llvm/TextAPI/RecordVisitor.h"
+#include "llvm/TextAPI/Symbol.h"
 #include "llvm/TextAPI/Target.h"
 #include "llvm/TextAPI/TextAPIWriter.h"
 #include "llvm/TextAPI/Utils.h"
@@ -31,10 +32,13 @@ using ObjCInterfaceRecord = llvm::MachO::ObjCInterfaceRecord;
 using ObjCCategoryRecord = llvm::MachO::ObjCCategoryRecord;
 using ObjCIVarRecord = llvm::MachO::ObjCIVarRecord;
 using Records = llvm::MachO::Records;
+using RecordsSlice = llvm::MachO::RecordsSlice;
 using BinaryAttrs = llvm::MachO::RecordsSlice::BinaryAttrs;
 using SymbolSet = llvm::MachO::SymbolSet;
+using SimpleSymbol = llvm::MachO::SimpleSymbol;
 using FileType = llvm::MachO::FileType;
 using PackedVersion = llvm::MachO::PackedVersion;
 using Target = llvm::MachO::Target;
+using TargetList = llvm::MachO::TargetList;
 
 #endif // LLVM_CLANG_INSTALLAPI_MACHO_H
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 8db78be816a08..9561d911426dd 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -3074,6 +3074,11 @@ class Parser : public CodeCompletionHandler {
                                  SourceLocation ScopeLoc,
                                  ParsedAttr::Form Form);
 
+  void ParseBoundsAttribute(IdentifierInfo &AttrName,
+                            SourceLocation AttrNameLoc, ParsedAttributes &Attrs,
+                            IdentifierInfo *ScopeName, SourceLocation ScopeLoc,
+                            ParsedAttr::Form Form);
+
   void ParseTypeofSpecifier(DeclSpec &DS);
   SourceLocation ParseDecltypeSpecifier(DeclSpec &DS);
   void AnnotateExistingDecltypeSpecifier(const DeclSpec &DS,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index c600bcaa0089b..b07a7e3db77c0 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -3312,6 +3312,8 @@ class Sema final {
                                     TemplateIdAnnotation *TemplateId,
                                     bool IsMemberSpecialization);
 
+  bool checkConstantPointerAuthKey(Expr *keyExpr, unsigned &key);
+
   void DiagnoseFunctionSpecifiers(const DeclSpec &DS);
   NamedDecl *getShadowedDeclaration(const TypedefNameDecl *D,
                                     const LookupResult &R);
@@ -5654,6 +5656,7 @@ class Sema final {
     enum ExpressionKind {
       EK_Decltype,
       EK_TemplateArgument,
+      EK_BoundsAttrArgument,
       EK_Other
     } ExprContext;
 
@@ -5771,6 +5774,12 @@ class Sema final {
     return ExprEvalContexts.back();
   };
 
+  bool isBoundsAttrContext() const {
+    return ExprEvalContexts.back().ExprContext ==
+           ExpressionEvaluationContextRecord::ExpressionKind::
+               EK_BoundsAttrArgument;
+  }
+
   /// Increment when we find a reference; decrement when we find an ignored
   /// assignment.  Ultimately the value is 0 if every reference is an ignored
   /// assignment.
@@ -9658,7 +9667,7 @@ class Sema final {
 
   bool AttachTypeConstraint(NestedNameSpecifierLoc NS,
                             DeclarationNameInfo NameInfo,
-                            ConceptDecl *NamedConcept,
+                            ConceptDecl *NamedConcept, NamedDecl *FoundDecl,
                             const TemplateArgumentListInfo *TemplateArgs,
                             TemplateTypeParmDecl *ConstrainedParameter,
                             SourceLocation EllipsisLoc);
@@ -12188,6 +12197,8 @@ class Sema final {
   QualType BuildMatrixType(QualType T, Expr *NumRows, Expr *NumColumns,
                            SourceLocation AttrLoc);
 
+  QualType BuildCountAttributedArrayType(QualType WrappedTy, Expr *CountExpr);
+
   QualType BuildAddressSpaceAttr(QualType &T, LangAS ASIdx, Expr *AddrSpace,
                                  SourceLocation AttrLoc);
 
diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h
index 80a1359fd16aa..5d3e95cb5d630 100644
--- a/clang/include/clang/Serialization/ASTRecordReader.h
+++ b/clang/include/clang/Serialization/ASTRecordReader.h
@@ -221,6 +221,8 @@ class ASTRecordReader
     return Reader->ReadSelector(*F, Record, Idx);
   }
 
+  TypeCoupledDeclRefInfo readTypeCoupledDeclRefInfo();
+
   /// Read a declaration name, advancing Idx.
   // DeclarationName readDeclarationName(); (inherited)
   DeclarationNameLoc readDeclarationNameLoc(DeclarationName Name);
diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h
index 9a64735c9fa55..e007d4a70843a 100644
--- a/clang/include/clang/Serialization/ASTRecordWriter.h
+++ b/clang/include/clang/Serialization/ASTRecordWriter.h
@@ -141,6 +141,11 @@ class ASTRecordWriter
     AddSourceLocation(Loc);
   }
 
+  void writeTypeCoupledDeclRefInfo(TypeCoupledDeclRefInfo Info) {
+    writeDeclRef(Info.getDecl());
+    writeBool(Info.isDeref());
+  }
+
   /// Emit a source range.
   void AddSourceRange(SourceRange Range, LocSeq *Seq = nullptr) {
     return Writer->AddSourceRange(Range, *Record, Seq);
diff --git a/clang/include/clang/Serialization/TypeBitCodes.def b/clang/include/clang/Serialization/TypeBitCodes.def
index adbd5ec5d4b54..3c82dfed9497d 100644
--- a/clang/include/clang/Serialization/TypeBitCodes.def
+++ b/clang/include/clang/Serialization/TypeBitCodes.def
@@ -65,6 +65,6 @@ TYPE_BIT_CODE(DependentSizedMatrix, DEPENDENT_SIZE_MATRIX, 53)
 TYPE_BIT_CODE(Using, USING, 54)
 TYPE_BIT_CODE(BTFTagAttributed, BTFTAG_ATTRIBUTED, 55)
 TYPE_BIT_CODE(PackIndexing, PACK_INDEXING, 56)
-
+TYPE_BIT_CODE(CountAttributed, COUNT_ATTRIBUTED, 57)
 
 #undef TYPE_BIT_CODE
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index a1a56f6322721..737814b3ddcbe 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2360,6 +2360,9 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
     return getTypeInfo(
                   cast<AttributedType>(T)->getEquivalentType().getTypePtr());
 
+  case Type::CountAttributed:
+    return getTypeInfo(cast<CountAttributedType>(T)->desugar().getTypePtr());
+
   case Type::BTFTagAttributed:
     return getTypeInfo(
         cast<BTFTagAttributedType>(T)->getWrappedType().getTypePtr());
@@ -3134,6 +3137,32 @@ QualType ASTContext::removePtrSizeAddrSpace(QualType T) const {
   return T;
 }
 
+QualType ASTContext::getCountAttributedType(
+    QualType WrappedTy, Expr *CountExpr, bool CountInBytes, bool OrNull,
+    ArrayRef<TypeCoupledDeclRefInfo> DependentDecls) const {
+  assert(WrappedTy->isPointerType() || WrappedTy->isArrayType());
+
+  llvm::FoldingSetNodeID ID;
+  CountAttributedType::Profile(ID, WrappedTy, CountExpr, CountInBytes, OrNull);
+
+  void *InsertPos = nullptr;
+  CountAttributedType *CATy =
+      CountAttributedTypes.FindNodeOrInsertPos(ID, InsertPos);
+  if (CATy)
+    return QualType(CATy, 0);
+
+  QualType CanonTy = getCanonicalType(WrappedTy);
+  size_t Size = CountAttributedType::totalSizeToAlloc<TypeCoupledDeclRefInfo>(
+      DependentDecls.size());
+  CATy = (CountAttributedType *)Allocate(Size, TypeAlignment);
+  new (CATy) CountAttributedType(WrappedTy, CanonTy, CountExpr, CountInBytes,
+                                 OrNull, DependentDecls);
+  Types.push_back(CATy);
+  CountAttributedTypes.InsertNode(CATy, InsertPos);
+
+  return QualType(CATy, 0);
+}
+
 const FunctionType *ASTContext::adjustFunctionType(const FunctionType *T,
                                                    FunctionType::ExtInfo Info) {
   if (T->getExtInfo() == Info)
@@ -13275,6 +13304,32 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X,
       return QualType();
     return Ctx.getUsingType(CD, Ctx.getQualifiedType(Underlying));
   }
+  case Type::CountAttributed: {
+    const auto *DX = cast<CountAttributedType>(X),
+               *DY = cast<CountAttributedType>(Y);
+    if (DX->isCountInBytes() != DY->isCountInBytes())
+      return QualType();
+    if (DX->isOrNull() != DY->isOrNull())
+      return QualType();
+    Expr *CEX = DX->getCountExpr();
+    Expr *CEY = DY->getCountExpr();
+    llvm::ArrayRef<clang::TypeCoupledDeclRefInfo> CDX = DX->getCoupledDecls();
+    if (Ctx.hasSameExpr(CEX, CEY))
+      return Ctx.getCountAttributedType(Ctx.getQualifiedType(Underlying), CEX,
+                                        DX->isCountInBytes(), DX->isOrNull(),
+                                        CDX);
+    if (!CEX->isIntegerConstantExpr(Ctx) || !CEY->isIntegerConstantExpr(Ctx))
+      return QualType();
+    // Two declarations with the same integer constant may still differ in their
+    // expression pointers, so we need to evaluate them.
+    llvm::APSInt VX = *CEX->getIntegerConstantExpr(Ctx);
+    llvm::APSInt VY = *CEY->getIntegerConstantExpr(Ctx);
+    if (VX != VY)
+      return QualType();
+    return Ctx.getCountAttributedType(Ctx.getQualifiedType(Underlying), CEX,
+                                      DX->isCountInBytes(), DX->isOrNull(),
+                                      CDX);
+  }
   }
   llvm_unreachable("Unhandled Type Class");
 }
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 5c26291e9d346..87437dfdff208 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -1533,6 +1533,28 @@ ExpectedType ASTNodeImporter::VisitAttributedType(const AttributedType *T) {
       *ToModifiedTypeOrErr, *ToEquivalentTypeOrErr);
 }
 
+ExpectedType
+ASTNodeImporter::VisitCountAttributedType(const CountAttributedType *T) {
+  ExpectedType ToWrappedTypeOrErr = import(T->desugar());
+  if (!ToWrappedTypeOrErr)
+    return ToWrappedTypeOrErr.takeError();
+
+  Error Err = Error::success();
+  Expr *CountExpr = importChecked(Err, T->getCountExpr());
+
+  SmallVector<TypeCoupledDeclRefInfo, 1> CoupledDecls;
+  for (auto TI : T->dependent_decls()) {
+    Expected<ValueDecl *> ToDeclOrErr = import(TI.getDecl());
+    if (!ToDeclOrErr)
+      return ToDeclOrErr.takeError();
+    CoupledDecls.emplace_back(*ToDeclOrErr, TI.isDeref());
+  }
+
+  return Importer.getToContext().getCountAttributedType(
+      *ToWrappedTypeOrErr, CountExpr, T->isCountInBytes(), T->isOrNull(),
+      ArrayRef(CoupledDecls.data(), CoupledDecls.size()));
+}
+
 ExpectedType ASTNodeImporter::VisitTemplateTypeParmType(
     const TemplateTypeParmType *T) {
   Expected<TemplateTypeParmDecl *> ToDeclOrErr = import(T->getDecl());
@@ -9311,16 +9333,6 @@ Expected<Attr *> ASTImporter::Import(const Attr *FromAttr) {
                   From->args_size());
     break;
   }
-  case attr::CountedBy: {
-    AI.cloneAttr(FromAttr);
-    const auto *CBA = cast<CountedByAttr>(FromAttr);
-    Expected<SourceRange> SR = Import(CBA->getCountedByFieldLoc()).get();
-    if (!SR)
-      return SR.takeError();
-    AI.castAttrAs<CountedByAttr>()->setCountedByFieldLoc(SR.get());
-    break;
-  }
-
   default: {
     // The default branch works for attributes that have no arguments to import.
     // FIXME: Handle every attribute type that has arguments of type to import
diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp
index fe6e03ce174e5..226e0aa38ece7 100644
--- a/clang/lib/AST/ASTStructuralEquivalence.cpp
+++ b/clang/lib/AST/ASTStructuralEquivalence.cpp
@@ -1069,6 +1069,13 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
       return false;
     break;
 
+  case Type::CountAttributed:
+    if (!IsStructurallyEquivalent(Context,
+                                  cast<CountAttributedType>(T1)->desugar(),
+                                  cast<CountAttributedType>(T2)->desugar()))
+      return false;
+    break;
+
   case Type::BTFTagAttributed:
     if (!IsStructurallyEquivalent(
             Context, cast<BTFTagAttributedType>(T1)->getWrappedType(),
diff --git a/clang/lib/AST/Availability.cpp b/clang/lib/AST/Availability.cpp
index d0054e37e4dcd..238359a2dedfc 100644
--- a/clang/lib/AST/Availability.cpp
+++ b/clang/lib/AST/Availability.cpp
@@ -28,9 +28,9 @@ AvailabilityInfo AvailabilityInfo::createFromDecl(const Decl *Decl) {
     for (const auto *A : RD->specific_attrs<AvailabilityAttr>()) {
       if (A->getPlatform()->getName() != PlatformName)
         continue;
-      Availability =
-          AvailabilityInfo(A->getPlatform()->getName(), A->getIntroduced(),
-                           A->getDeprecated(), A->getObsoleted(), false, false);
+      Availability = AvailabilityInfo(
+          A->getPlatform()->getName(), A->getIntroduced(), A->getDeprecated(),
+          A->getObsoleted(), A->getUnavailable(), false, false);
       break;
     }
 
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 1c3dcf63465c6..645ec2f7563bc 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -1567,10 +1567,9 @@ bool CXXRecordDecl::isGenericLambda() const {
 
 #ifndef NDEBUG
 static bool allLookupResultsAreTheSame(const DeclContext::lookup_result &R) {
-  for (auto *D : R)
-    if (!declaresSameEntity(D, R.front()))
-      return false;
-  return true;
+  return llvm::all_of(R, [&](NamedDecl *D) {
+    return D->isInvalidDecl() || declaresSameEntity(D, R.front());
+  });
 }
 #endif
 
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index 2ce575ebfc180..c51fc6ef60466 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -685,6 +685,16 @@ static void printExplicitSpecifier(ExplicitSpecifier ES, llvm::raw_ostream &Out,
   Out << Proto;
 }
 
+static void MaybePrintTagKeywordIfSupressingScopes(PrintingPolicy &Policy,
+                                                   QualType T,
+                                                   llvm::raw_ostream &Out) {
+  StringRef prefix = T->isClassType()       ? "class "
+                     : T->isStructureType() ? "struct "
+                     : T->isUnionType()     ? "union "
+                                            : "";
+  Out << prefix;
+}
+
 void DeclPrinter::VisitFunctionDecl(FunctionDecl *D) {
   if (!D->getDescribedFunctionTemplate() &&
       !D->isFunctionTemplateSpecialization())
@@ -861,6 +871,10 @@ void DeclPrinter::VisitFunctionDecl(FunctionDecl *D) {
         Out << Proto << " -> ";
         Proto.clear();
       }
+      if (!Policy.SuppressTagKeyword && Policy.SuppressScope &&
+          !Policy.SuppressUnwrittenScope)
+        MaybePrintTagKeywordIfSupressingScopes(Policy, AFT->getReturnType(),
+                                               Out);
       AFT->getReturnType().print(Out, Policy, Proto);
       Proto.clear();
     }
@@ -1028,6 +1042,9 @@ void DeclPrinter::VisitVarDecl(VarDecl *D) {
              ? D->getIdentifier()->deuglifiedName()
              : D->getName();
 
+  if (!Policy.SuppressTagKeyword && Policy.SuppressScope &&
+      !Policy.SuppressUnwrittenScope)
+    MaybePrintTagKeywordIfSupressingScopes(Policy, T, Out);
   printDeclType(T, Name);
 
   // Print the attributes that should be placed right before the end of the
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 46cf59ec9e46f..c18d0269ad3c9 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -731,7 +731,8 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedIdentKind IK) {
 // FIXME: Maybe this should use DeclPrinter with a special "print predefined
 // expr" policy instead.
 std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
-                                        const Decl *CurrentDecl) {
+                                        const Decl *CurrentDecl,
+                                        bool ForceElaboratedPrinting) {
   ASTContext &Context = CurrentDecl->getASTContext();
 
   if (IK == PredefinedIdentKind::FuncDName) {
@@ -779,10 +780,21 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
     return std::string(Out.str());
   }
   if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(CurrentDecl)) {
-    if (IK != PredefinedIdentKind::PrettyFunction &&
+    const auto &LO = Context.getLangOpts();
+    bool IsFuncOrFunctionInNonMSVCCompatEnv =
+        ((IK == PredefinedIdentKind::Func ||
+          IK == PredefinedIdentKind ::Function) &&
+         !LO.MSVCCompat);
+    bool IsLFunctionInMSVCCommpatEnv =
+        IK == PredefinedIdentKind::LFunction && LO.MSVCCompat;
+    bool IsFuncOrFunctionOrLFunctionOrFuncDName =
+        IK != PredefinedIdentKind::PrettyFunction &&
         IK != PredefinedIdentKind::PrettyFunctionNoVirtual &&
         IK != PredefinedIdentKind::FuncSig &&
-        IK != PredefinedIdentKind::LFuncSig)
+        IK != PredefinedIdentKind::LFuncSig;
+    if ((ForceElaboratedPrinting &&
+         (IsFuncOrFunctionInNonMSVCCompatEnv || IsLFunctionInMSVCCommpatEnv)) ||
+        (!ForceElaboratedPrinting && IsFuncOrFunctionOrLFunctionOrFuncDName))
       return FD->getNameAsString();
 
     SmallString<256> Name;
@@ -810,6 +822,8 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
     PrintingPolicy Policy(Context.getLangOpts());
     PrettyCallbacks PrettyCB(Context.getLangOpts());
     Policy.Callbacks = &PrettyCB;
+    if (IK == PredefinedIdentKind::Function && ForceElaboratedPrinting)
+      Policy.SuppressTagKeyword = !LO.MSVCCompat;
     std::string Proto;
     llvm::raw_string_ostream POut(Proto);
 
@@ -837,6 +851,12 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
 
     FD->printQualifiedName(POut, Policy);
 
+    if (IK == PredefinedIdentKind::Function) {
+      POut.flush();
+      Out << Proto;
+      return std::string(Name);
+    }
+
     POut << "(";
     if (FT) {
       for (unsigned i = 0, e = Decl->getNumParams(); i != e; ++i) {
@@ -4661,8 +4681,17 @@ SourceRange DesignatedInitExpr::getDesignatorsSourceRange() const {
 SourceLocation DesignatedInitExpr::getBeginLoc() const {
   auto *DIE = const_cast<DesignatedInitExpr *>(this);
   Designator &First = *DIE->getDesignator(0);
-  if (First.isFieldDesignator())
-    return GNUSyntax ? First.getFieldLoc() : First.getDotLoc();
+  if (First.isFieldDesignator()) {
+    // Skip past implicit designators for anonymous structs/unions, since
+    // these do not have valid source locations.
+    for (unsigned int i = 0; i < DIE->size(); i++) {
+      Designator &Des = *DIE->getDesignator(i);
+      SourceLocation retval = GNUSyntax ? Des.getFieldLoc() : Des.getDotLoc();
+      if (!retval.isValid())
+        continue;
+      return retval;
+    }
+  }
   return First.getLBracketLoc();
 }
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 24c59260a3451..d1deefbb44c15 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -5594,6 +5594,9 @@ static EvalStmtResult EvaluateStmt(StmtResult &Result, EvalInfo &Info,
         if (Assumption->isValueDependent())
           return ESR_Failed;
 
+        if (Assumption->HasSideEffects(Info.getCtx()))
+          continue;
+
         bool Value;
         if (!EvaluateAsBooleanCondition(Assumption, Value, Info))
           return ESR_Failed;
@@ -7344,9 +7347,6 @@ class BufferToAPValueConverter {
       for (size_t I = 0, E = CXXRD->getNumBases(); I != E; ++I) {
         const CXXBaseSpecifier &BS = CXXRD->bases_begin()[I];
         CXXRecordDecl *BaseDecl = BS.getType()->getAsCXXRecordDecl();
-        if (BaseDecl->isEmpty() ||
-            Info.Ctx.getASTRecordLayout(BaseDecl).getNonVirtualSize().isZero())
-          continue;
 
         std::optional<APValue> SubObj = visitType(
             BS.getType(), Layout.getBaseClassOffset(BaseDecl) + Offset);
@@ -8517,6 +8517,53 @@ class LValueExprEvaluator
 };
 } // end anonymous namespace
 
+/// Get an lvalue to a field of a lambda's closure type.
+static bool HandleLambdaCapture(EvalInfo &Info, const Expr *E, LValue &Result,
+                                const CXXMethodDecl *MD, const FieldDecl *FD,
+                                bool LValueToRValueConversion) {
+  // Static lambda function call operators can't have captures. We already
+  // diagnosed this, so bail out here.
+  if (MD->isStatic()) {
+    assert(Info.CurrentCall->This == nullptr &&
+           "This should not be set for a static call operator");
+    return false;
+  }
+
+  // Start with 'Result' referring to the complete closure object...
+  if (MD->isExplicitObjectMemberFunction()) {
+    // Self may be passed by reference or by value.
+    const ParmVarDecl *Self = MD->getParamDecl(0);
+    if (Self->getType()->isReferenceType()) {
+      APValue *RefValue = Info.getParamSlot(Info.CurrentCall->Arguments, Self);
+      Result.setFrom(Info.Ctx, *RefValue);
+    } else {
+      const ParmVarDecl *VD = Info.CurrentCall->Arguments.getOrigParam(Self);
+      CallStackFrame *Frame =
+          Info.getCallFrameAndDepth(Info.CurrentCall->Arguments.CallIndex)
+              .first;
+      unsigned Version = Info.CurrentCall->Arguments.Version;
+      Result.set({VD, Frame->Index, Version});
+    }
+  } else
+    Result = *Info.CurrentCall->This;
+
+  // ... then update it to refer to the field of the closure object
+  // that represents the capture.
+  if (!HandleLValueMember(Info, E, Result, FD))
+    return false;
+
+  // And if the field is of reference type (or if we captured '*this' by
+  // reference), update 'Result' to refer to what
+  // the field refers to.
+  if (LValueToRValueConversion) {
+    APValue RVal;
+    if (!handleLValueToRValueConversion(Info, E, FD->getType(), Result, RVal))
+      return false;
+    Result.setFrom(Info.Ctx, RVal);
+  }
+  return true;
+}
+
 /// Evaluate an expression as an lvalue. This can be legitimately called on
 /// expressions which are not glvalues, in three cases:
 ///  * function designators in C, and
@@ -8561,37 +8608,8 @@ bool LValueExprEvaluator::VisitVarDecl(const Expr *E, const VarDecl *VD) {
 
     if (auto *FD = Info.CurrentCall->LambdaCaptureFields.lookup(VD)) {
       const auto *MD = cast<CXXMethodDecl>(Info.CurrentCall->Callee);
-
-      // Static lambda function call operators can't have captures. We already
-      // diagnosed this, so bail out here.
-      if (MD->isStatic()) {
-        assert(Info.CurrentCall->This == nullptr &&
-               "This should not be set for a static call operator");
-        return false;
-      }
-
-      // Start with 'Result' referring to the complete closure object...
-      if (MD->isExplicitObjectMemberFunction()) {
-        APValue *RefValue =
-            Info.getParamSlot(Info.CurrentCall->Arguments, MD->getParamDecl(0));
-        Result.setFrom(Info.Ctx, *RefValue);
-      } else
-        Result = *Info.CurrentCall->This;
-
-      // ... then update it to refer to the field of the closure object
-      // that represents the capture.
-      if (!HandleLValueMember(Info, E, Result, FD))
-        return false;
-      // And if the field is of reference type, update 'Result' to refer to what
-      // the field refers to.
-      if (FD->getType()->isReferenceType()) {
-        APValue RVal;
-        if (!handleLValueToRValueConversion(Info, E, FD->getType(), Result,
-                                            RVal))
-          return false;
-        Result.setFrom(Info.Ctx, RVal);
-      }
-      return true;
+      return HandleLambdaCapture(Info, E, Result, MD, FD,
+                                 FD->getType()->isReferenceType());
     }
   }
 
@@ -9069,45 +9087,46 @@ class PointerExprEvaluator
     return Error(E);
   }
   bool VisitCXXThisExpr(const CXXThisExpr *E) {
-    // Can't look at 'this' when checking a potential constant expression.
-    if (Info.checkingPotentialConstantExpression())
-      return false;
-    if (!Info.CurrentCall->This) {
+    auto DiagnoseInvalidUseOfThis = [&] {
       if (Info.getLangOpts().CPlusPlus11)
         Info.FFDiag(E, diag::note_constexpr_this) << E->isImplicit();
       else
         Info.FFDiag(E);
+    };
+
+    // Can't look at 'this' when checking a potential constant expression.
+    if (Info.checkingPotentialConstantExpression())
       return false;
+
+    bool IsExplicitLambda =
+        isLambdaCallWithExplicitObjectParameter(Info.CurrentCall->Callee);
+    if (!IsExplicitLambda) {
+      if (!Info.CurrentCall->This) {
+        DiagnoseInvalidUseOfThis();
+        return false;
+      }
+
+      Result = *Info.CurrentCall->This;
     }
-    Result = *Info.CurrentCall->This;
 
     if (isLambdaCallOperator(Info.CurrentCall->Callee)) {
       // Ensure we actually have captured 'this'. If something was wrong with
       // 'this' capture, the error would have been previously reported.
       // Otherwise we can be inside of a default initialization of an object
       // declared by lambda's body, so no need to return false.
-      if (!Info.CurrentCall->LambdaThisCaptureField)
-        return true;
-
-      // If we have captured 'this',  the 'this' expression refers
-      // to the enclosing '*this' object (either by value or reference) which is
-      // either copied into the closure object's field that represents the
-      // '*this' or refers to '*this'.
-      // Update 'Result' to refer to the data member/field of the closure object
-      // that represents the '*this' capture.
-      if (!HandleLValueMember(Info, E, Result,
-                             Info.CurrentCall->LambdaThisCaptureField))
-        return false;
-      // If we captured '*this' by reference, replace the field with its referent.
-      if (Info.CurrentCall->LambdaThisCaptureField->getType()
-              ->isPointerType()) {
-        APValue RVal;
-        if (!handleLValueToRValueConversion(Info, E, E->getType(), Result,
-                                            RVal))
+      if (!Info.CurrentCall->LambdaThisCaptureField) {
+        if (IsExplicitLambda && !Info.CurrentCall->This) {
+          DiagnoseInvalidUseOfThis();
           return false;
+        }
 
-        Result.setFrom(Info.Ctx, RVal);
+        return true;
       }
+
+      const auto *MD = cast<CXXMethodDecl>(Info.CurrentCall->Callee);
+      return HandleLambdaCapture(
+          Info, E, Result, MD, Info.CurrentCall->LambdaThisCaptureField,
+          Info.CurrentCall->LambdaThisCaptureField->getType()->isPointerType());
     }
     return true;
   }
@@ -9252,7 +9271,8 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) {
            Info.getLangOpts().CPlusPlus26)) {
         // Permitted.
       } else {
-        if (SubExpr->getType()->isVoidPointerType()) {
+        if (SubExpr->getType()->isVoidPointerType() &&
+            Info.getLangOpts().CPlusPlus) {
           if (HasValidResult)
             CCEDiag(E, diag::note_constexpr_invalid_void_star_cast)
                 << SubExpr->getType() << Info.getLangOpts().CPlusPlus26
@@ -11869,8 +11889,8 @@ static QualType getObjectType(APValue::LValueBase B) {
 static const Expr *ignorePointerCastsAndParens(const Expr *E) {
   assert(E->isPRValue() && E->getType()->hasPointerRepresentation());
 
-  auto *NoParens = E->IgnoreParens();
-  auto *Cast = dyn_cast<CastExpr>(NoParens);
+  const Expr *NoParens = E->IgnoreParens();
+  const auto *Cast = dyn_cast<CastExpr>(NoParens);
   if (Cast == nullptr)
     return NoParens;
 
@@ -11881,7 +11901,7 @@ static const Expr *ignorePointerCastsAndParens(const Expr *E) {
       CastKind != CK_AddressSpaceConversion)
     return NoParens;
 
-  auto *SubExpr = Cast->getSubExpr();
+  const auto *SubExpr = Cast->getSubExpr();
   if (!SubExpr->getType()->hasPointerRepresentation() || !SubExpr->isPRValue())
     return NoParens;
   return ignorePointerCastsAndParens(SubExpr);
@@ -12953,6 +12973,10 @@ static bool isOnePastTheEndOfCompleteObject(const ASTContext &Ctx,
   if (Ty->isIncompleteType())
     return true;
 
+  // Can't be past the end of an invalid object.
+  if (LV.getLValueDesignator().Invalid)
+    return false;
+
   // We're a past-the-end pointer if we point to the byte after the object,
   // no matter what our type or path is.
   auto Size = Ctx.getTypeSizeInChars(Ty);
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 86304a54473ce..73831eefba456 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -82,15 +82,27 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     if (DiscardResult)
       return this->discard(SubExpr);
 
-    if (SubExpr->getType()->isAnyComplexType())
-      return this->delegate(SubExpr);
+    std::optional<PrimType> SubExprT = classify(SubExpr->getType());
+    // Prepare storage for the result.
+    if (!Initializing && !SubExprT) {
+      std::optional<unsigned> LocalIndex =
+          allocateLocal(SubExpr, /*IsExtended=*/false);
+      if (!LocalIndex)
+        return false;
+      if (!this->emitGetPtrLocal(*LocalIndex, CE))
+        return false;
+    }
 
     if (!this->visit(SubExpr))
       return false;
 
-    if (std::optional<PrimType> SubExprT = classify(SubExpr->getType()))
+    if (SubExprT)
       return this->emitLoadPop(*SubExprT, CE);
-    return false;
+
+    // If the subexpr type is not primitive, we need to perform a copy here.
+    // This happens for example in C when dereferencing a pointer of struct
+    // type.
+    return this->emitMemcpy(CE);
   }
 
   case CK_UncheckedDerivedToBase:
@@ -296,14 +308,11 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     // Location for the SubExpr.
     // Since SubExpr is of complex type, visiting it results in a pointer
     // anyway, so we just create a temporary pointer variable.
-    std::optional<unsigned> SubExprOffset = allocateLocalPrimitive(
+    unsigned SubExprOffset = allocateLocalPrimitive(
         SubExpr, PT_Ptr, /*IsConst=*/true, /*IsExtended=*/false);
-    if (!SubExprOffset)
-      return false;
-
     if (!this->visit(SubExpr))
       return false;
-    if (!this->emitSetLocal(PT_Ptr, *SubExprOffset, CE))
+    if (!this->emitSetLocal(PT_Ptr, SubExprOffset, CE))
       return false;
 
     PrimType SourceElemT = classifyComplexElementType(SubExpr->getType());
@@ -312,7 +321,7 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     PrimType DestElemT = classifyPrim(DestElemType);
     // Cast both elements individually.
     for (unsigned I = 0; I != 2; ++I) {
-      if (!this->emitGetLocal(PT_Ptr, *SubExprOffset, CE))
+      if (!this->emitGetLocal(PT_Ptr, SubExprOffset, CE))
         return false;
       if (!this->emitArrayElemPop(SourceElemT, I, CE))
         return false;
@@ -392,6 +401,17 @@ bool ByteCodeExprGen<Emitter>::VisitBinaryOperator(const BinaryOperator *BO) {
   const Expr *LHS = BO->getLHS();
   const Expr *RHS = BO->getRHS();
 
+  // Handle comma operators. Just discard the LHS
+  // and delegate to RHS.
+  if (BO->isCommaOp()) {
+    if (!this->discard(LHS))
+      return false;
+    if (RHS->getType()->isVoidType())
+      return this->discard(RHS);
+
+    return this->delegate(RHS);
+  }
+
   if (BO->getType()->isAnyComplexType())
     return this->VisitComplexBinOp(BO);
   if ((LHS->getType()->isAnyComplexType() ||
@@ -407,16 +427,6 @@ bool ByteCodeExprGen<Emitter>::VisitBinaryOperator(const BinaryOperator *BO) {
   std::optional<PrimType> RT = classify(RHS->getType());
   std::optional<PrimType> T = classify(BO->getType());
 
-  // Deal with operations which have composite or void types.
-  if (BO->isCommaOp()) {
-    if (!this->discard(LHS))
-      return false;
-    if (RHS->getType()->isVoidType())
-      return this->discard(RHS);
-
-    return this->delegate(RHS);
-  }
-
   // Special case for C++'s three-way/spaceship operator <=>, which
   // returns a std::{strong,weak,partial}_ordering (which is a class, so doesn't
   // have a PrimType).
@@ -676,11 +686,17 @@ bool ByteCodeExprGen<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
     if (!this->emitSetLocal(PT_Ptr, ResultOffset, E))
       return false;
   }
+  QualType LHSType = LHS->getType();
+  if (const auto *AT = LHSType->getAs<AtomicType>())
+    LHSType = AT->getValueType();
+  QualType RHSType = RHS->getType();
+  if (const auto *AT = RHSType->getAs<AtomicType>())
+    RHSType = AT->getValueType();
 
   // Evaluate LHS and save value to LHSOffset.
   bool LHSIsComplex;
   unsigned LHSOffset;
-  if (LHS->getType()->isAnyComplexType()) {
+  if (LHSType->isAnyComplexType()) {
     LHSIsComplex = true;
     LHSOffset = this->allocateLocalPrimitive(LHS, PT_Ptr, true, false);
     if (!this->visit(LHS))
@@ -689,7 +705,7 @@ bool ByteCodeExprGen<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
       return false;
   } else {
     LHSIsComplex = false;
-    PrimType LHST = classifyPrim(LHS->getType());
+    PrimType LHST = classifyPrim(LHSType);
     LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false);
     if (!this->visit(LHS))
       return false;
@@ -700,7 +716,7 @@ bool ByteCodeExprGen<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
   // Same with RHS.
   bool RHSIsComplex;
   unsigned RHSOffset;
-  if (RHS->getType()->isAnyComplexType()) {
+  if (RHSType->isAnyComplexType()) {
     RHSIsComplex = true;
     RHSOffset = this->allocateLocalPrimitive(RHS, PT_Ptr, true, false);
     if (!this->visit(RHS))
@@ -709,7 +725,7 @@ bool ByteCodeExprGen<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
       return false;
   } else {
     RHSIsComplex = false;
-    PrimType RHST = classifyPrim(RHS->getType());
+    PrimType RHST = classifyPrim(RHSType);
     RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false);
     if (!this->visit(RHS))
       return false;
@@ -1081,9 +1097,9 @@ template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitUnaryExprOrTypeTraitExpr(
     const UnaryExprOrTypeTraitExpr *E) {
   UnaryExprOrTypeTrait Kind = E->getKind();
-  ASTContext &ASTCtx = Ctx.getASTContext();
+  const ASTContext &ASTCtx = Ctx.getASTContext();
 
-  if (Kind == UETT_SizeOf) {
+  if (Kind == UETT_SizeOf || Kind == UETT_DataSizeOf) {
     QualType ArgType = E->getTypeOfArgument();
 
     // C++ [expr.sizeof]p2: "When applied to a reference or a reference type,
@@ -1098,7 +1114,10 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryExprOrTypeTraitExpr(
       if (ArgType->isDependentType() || !ArgType->isConstantSizeType())
         return false;
 
-      Size = ASTCtx.getTypeSizeInChars(ArgType);
+      if (Kind == UETT_SizeOf)
+        Size = ASTCtx.getTypeSizeInChars(ArgType);
+      else
+        Size = ASTCtx.getTypeInfoDataSizeInChars(ArgType).Width;
     }
 
     if (DiscardResult)
@@ -1233,22 +1252,19 @@ bool ByteCodeExprGen<Emitter>::VisitOpaqueValueExpr(const OpaqueValueExpr *E) {
   // At this point we either have the evaluated source expression or a pointer
   // to an object on the stack. We want to create a local variable that stores
   // this value.
-  std::optional<unsigned> LocalIndex =
-      allocateLocalPrimitive(E, SubExprT, /*IsConst=*/true);
-  if (!LocalIndex)
-    return false;
-  if (!this->emitSetLocal(SubExprT, *LocalIndex, E))
+  unsigned LocalIndex = allocateLocalPrimitive(E, SubExprT, /*IsConst=*/true);
+  if (!this->emitSetLocal(SubExprT, LocalIndex, E))
     return false;
 
   // Here the local variable is created but the value is removed from the stack,
   // so we put it back if the caller needs it.
   if (!DiscardResult) {
-    if (!this->emitGetLocal(SubExprT, *LocalIndex, E))
+    if (!this->emitGetLocal(SubExprT, LocalIndex, E))
       return false;
   }
 
   // This is cleaned up when the local variable is destroyed.
-  OpaqueExprs.insert({E, *LocalIndex});
+  OpaqueExprs.insert({E, LocalIndex});
 
   return true;
 }
@@ -1642,14 +1658,13 @@ bool ByteCodeExprGen<Emitter>::VisitMaterializeTemporaryExpr(
 
   // For everyhing else, use local variables.
   if (SubExprT) {
-    if (std::optional<unsigned> LocalIndex = allocateLocalPrimitive(
-            SubExpr, *SubExprT, /*IsConst=*/true, /*IsExtended=*/true)) {
-      if (!this->visit(SubExpr))
-        return false;
-      if (!this->emitSetLocal(*SubExprT, *LocalIndex, E))
-        return false;
-      return this->emitGetPtrLocal(*LocalIndex, E);
-    }
+    unsigned LocalIndex = allocateLocalPrimitive(
+        SubExpr, *SubExprT, /*IsConst=*/true, /*IsExtended=*/true);
+    if (!this->visit(SubExpr))
+      return false;
+    if (!this->emitSetLocal(*SubExprT, LocalIndex, E))
+      return false;
+    return this->emitGetPtrLocal(LocalIndex, E);
   } else {
     const Expr *Inner = E->getSubExpr()->skipRValueSubobjectAdjustments();
 
@@ -1744,6 +1759,14 @@ bool ByteCodeExprGen<Emitter>::VisitTypeTraitExpr(const TypeTraitExpr *E) {
   return this->emitConst(E->getValue(), E);
 }
 
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitArrayTypeTraitExpr(
+    const ArrayTypeTraitExpr *E) {
+  if (DiscardResult)
+    return true;
+  return this->emitConst(E->getValue(), E);
+}
+
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitLambdaExpr(const LambdaExpr *E) {
   if (DiscardResult)
@@ -2215,6 +2238,12 @@ bool ByteCodeExprGen<Emitter>::VisitPseudoObjectExpr(
   return true;
 }
 
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitPackIndexingExpr(
+    const PackIndexingExpr *E) {
+  return this->delegate(E->getSelectedExpr());
+}
+
 template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
   if (E->containsErrors())
     return false;
@@ -2227,7 +2256,7 @@ template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::delegate(const Expr *E) {
   if (E->containsErrors())
-    return false;
+    return this->emitError(E);
 
   // We're basically doing:
   // OptionScope<Emitter> Scope(this, DicardResult, Initializing);
@@ -2237,7 +2266,7 @@ bool ByteCodeExprGen<Emitter>::delegate(const Expr *E) {
 
 template <class Emitter> bool ByteCodeExprGen<Emitter>::visit(const Expr *E) {
   if (E->containsErrors())
-    return false;
+    return this->emitError(E);
 
   if (E->getType()->isVoidType())
     return this->discard(E);
@@ -2266,7 +2295,7 @@ bool ByteCodeExprGen<Emitter>::visitInitializer(const Expr *E) {
   assert(!classify(E->getType()));
 
   if (E->containsErrors())
-    return false;
+    return this->emitError(E);
 
   OptionScope<Emitter> Scope(this, /*NewDiscardResult=*/false,
                              /*NewInitializing=*/true);
@@ -2660,18 +2689,12 @@ bool ByteCodeExprGen<Emitter>::visitVarDecl(const VarDecl *VD) {
     if (P.getGlobal(VD))
       return true;
 
-    // Ignore external declarations. We will instead emit a dummy
-    // pointer when we see a DeclRefExpr for them.
-    if (VD->hasExternalStorage())
-      return true;
-
     std::optional<unsigned> GlobalIndex = P.createGlobal(VD, Init);
 
     if (!GlobalIndex)
       return false;
 
-    assert(Init);
-    {
+    if (Init) {
       DeclScope<Emitter> LocalScope(this, VD);
 
       if (VarT) {
@@ -2681,6 +2704,7 @@ bool ByteCodeExprGen<Emitter>::visitVarDecl(const VarDecl *VD) {
       }
       return this->visitGlobalInitializer(Init, *GlobalIndex);
     }
+    return true;
   } else {
     VariableScope<Emitter> LocalScope(this);
     if (VarT) {
@@ -2893,11 +2917,7 @@ template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitCXXDefaultInitExpr(
     const CXXDefaultInitExpr *E) {
   SourceLocScope<Emitter> SLS(this, E);
-  if (Initializing)
-    return this->visitInitializer(E->getExpr());
-
-  assert(classify(E->getType()));
-  return this->visit(E->getExpr());
+  return this->delegate(E->getExpr());
 }
 
 template <class Emitter>
@@ -3247,53 +3267,20 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
   // pointer to the actual value) instead of a pointer to the pointer to the
   // value.
   bool IsReference = D->getType()->isReferenceType();
-  // Complex values are copied in the AST via a simply assignment or
-  // ltor cast. But we represent them as two-element arrays, which means
-  // we pass them around as pointers. So, to assignm from them, we will
-  // have to copy both (primitive) elements instead.
-  bool IsComplex = D->getType()->isAnyComplexType();
 
   // Check for local/global variables and parameters.
   if (auto It = Locals.find(D); It != Locals.end()) {
     const unsigned Offset = It->second.Offset;
-    // FIXME: Fix the code duplication here with the code in the global case.
-    if (Initializing && IsComplex) {
-      PrimType ElemT = classifyComplexElementType(D->getType());
-      for (unsigned I = 0; I != 2; ++I) {
-        if (!this->emitGetPtrLocal(Offset, E))
-          return false;
-        if (!this->emitArrayElemPop(ElemT, I, E))
-          return false;
-        if (!this->emitInitElem(ElemT, I, E))
-          return false;
-      }
-      return true;
-    }
-
     if (IsReference)
       return this->emitGetLocal(PT_Ptr, Offset, E);
     return this->emitGetPtrLocal(Offset, E);
   } else if (auto GlobalIndex = P.getGlobal(D)) {
-    if (Initializing && IsComplex) {
-      PrimType ElemT = classifyComplexElementType(D->getType());
-      for (unsigned I = 0; I != 2; ++I) {
-        if (!this->emitGetPtrGlobal(*GlobalIndex, E))
-          return false;
-        if (!this->emitArrayElemPop(ElemT, I, E))
-          return false;
-        if (!this->emitInitElem(ElemT, I, E))
-          return false;
-      }
-      return true;
-    }
-
     if (IsReference)
       return this->emitGetGlobalPtr(*GlobalIndex, E);
 
     return this->emitGetPtrGlobal(*GlobalIndex, E);
   } else if (const auto *PVD = dyn_cast<ParmVarDecl>(D)) {
     if (auto It = this->Params.find(PVD); It != this->Params.end()) {
-      // FIXME: _Complex initializing case?
       if (IsReference || !It->second.IsPtr)
         return this->emitGetParamPtr(It->second.Offset, E);
 
@@ -3316,15 +3303,13 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
   if (Ctx.getLangOpts().CPlusPlus) {
     if (const auto *VD = dyn_cast<VarDecl>(D)) {
       // Visit local const variables like normal.
-      if (VD->isLocalVarDecl() && VD->getType().isConstQualified()) {
+      if ((VD->isLocalVarDecl() || VD->isStaticDataMember()) &&
+          VD->getType().isConstQualified()) {
         if (!this->visitVarDecl(VD))
           return false;
         // Retry.
         return this->VisitDeclRefExpr(E);
       }
-
-      if (VD->hasExternalStorage())
-        return this->emitInvalidDeclRef(E, E);
     }
   } else {
     if (const auto *VD = dyn_cast<VarDecl>(D);
@@ -3352,6 +3337,8 @@ template <class Emitter>
 unsigned
 ByteCodeExprGen<Emitter>::collectBaseOffset(const RecordType *BaseType,
                                             const RecordType *DerivedType) {
+  assert(BaseType);
+  assert(DerivedType);
   const auto *FinalDecl = cast<CXXRecordDecl>(BaseType->getDecl());
   const RecordDecl *CurDecl = DerivedType->getDecl();
   const Record *CurRecord = getRecord(CurDecl);
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 5ad2e74d7c269..db0d73ce23f7c 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -99,6 +99,7 @@ class ByteCodeExprGen : public ConstStmtVisitor<ByteCodeExprGen<Emitter>, bool>,
   bool VisitCXXBindTemporaryExpr(const CXXBindTemporaryExpr *E);
   bool VisitCompoundLiteralExpr(const CompoundLiteralExpr *E);
   bool VisitTypeTraitExpr(const TypeTraitExpr *E);
+  bool VisitArrayTypeTraitExpr(const ArrayTypeTraitExpr *E);
   bool VisitLambdaExpr(const LambdaExpr *E);
   bool VisitPredefinedExpr(const PredefinedExpr *E);
   bool VisitCXXThrowExpr(const CXXThrowExpr *E);
@@ -119,6 +120,7 @@ class ByteCodeExprGen : public ConstStmtVisitor<ByteCodeExprGen<Emitter>, bool>,
   bool VisitConceptSpecializationExpr(const ConceptSpecializationExpr *E);
   bool VisitCXXRewrittenBinaryOperator(const CXXRewrittenBinaryOperator *E);
   bool VisitPseudoObjectExpr(const PseudoObjectExpr *E);
+  bool VisitPackIndexingExpr(const PackIndexingExpr *E);
 
 protected:
   bool visitExpr(const Expr *E) override;
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
index 6da3860f98d8c..675063e748988 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
@@ -273,15 +273,18 @@ bool ByteCodeStmtGen<Emitter>::visitStmt(const Stmt *S) {
     return visitCaseStmt(cast<CaseStmt>(S));
   case Stmt::DefaultStmtClass:
     return visitDefaultStmt(cast<DefaultStmt>(S));
-  case Stmt::GCCAsmStmtClass:
-  case Stmt::MSAsmStmtClass:
-    return visitAsmStmt(cast<AsmStmt>(S));
   case Stmt::AttributedStmtClass:
     return visitAttributedStmt(cast<AttributedStmt>(S));
   case Stmt::CXXTryStmtClass:
     return visitCXXTryStmt(cast<CXXTryStmt>(S));
   case Stmt::NullStmtClass:
     return true;
+  // Always invalid statements.
+  case Stmt::GCCAsmStmtClass:
+  case Stmt::MSAsmStmtClass:
+  case Stmt::GotoStmtClass:
+  case Stmt::LabelStmtClass:
+    return this->emitInvalid(S);
   default: {
     if (auto *Exp = dyn_cast<Expr>(S))
       return this->discard(Exp);
@@ -420,6 +423,11 @@ bool ByteCodeStmtGen<Emitter>::visitWhileStmt(const WhileStmt *S) {
   LoopScope<Emitter> LS(this, EndLabel, CondLabel);
 
   this->emitLabel(CondLabel);
+
+  if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt())
+    if (!visitDeclStmt(CondDecl))
+      return false;
+
   if (!this->visitBool(Cond))
     return false;
   if (!this->jumpFalse(EndLabel))
@@ -484,6 +492,10 @@ bool ByteCodeStmtGen<Emitter>::visitForStmt(const ForStmt *S) {
   if (Init && !this->visitStmt(Init))
     return false;
   this->emitLabel(CondLabel);
+
+  if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt())
+    if (!visitDeclStmt(CondDecl))
+      return false;
   if (Cond) {
     if (!this->visitBool(Cond))
       return false;
@@ -582,17 +594,21 @@ bool ByteCodeStmtGen<Emitter>::visitContinueStmt(const ContinueStmt *S) {
 template <class Emitter>
 bool ByteCodeStmtGen<Emitter>::visitSwitchStmt(const SwitchStmt *S) {
   const Expr *Cond = S->getCond();
-  PrimType CondT = this->classifyPrim(Cond->getType());
 
   LabelTy EndLabel = this->getLabel();
   OptLabelTy DefaultLabel = std::nullopt;
-  unsigned CondVar = this->allocateLocalPrimitive(Cond, CondT, true, false);
 
   if (const auto *CondInit = S->getInit())
     if (!visitStmt(CondInit))
       return false;
 
+  if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt())
+    if (!visitDeclStmt(CondDecl))
+      return false;
+
   // Initialize condition variable.
+  PrimType CondT = this->classifyPrim(Cond->getType());
+  unsigned CondVar = this->allocateLocalPrimitive(Cond, CondT, true, false);
   if (!this->visit(Cond))
     return false;
   if (!this->emitSetLocal(CondT, CondVar, S))
@@ -657,11 +673,6 @@ bool ByteCodeStmtGen<Emitter>::visitDefaultStmt(const DefaultStmt *S) {
   return this->visitStmt(S->getSubStmt());
 }
 
-template <class Emitter>
-bool ByteCodeStmtGen<Emitter>::visitAsmStmt(const AsmStmt *S) {
-  return this->emitInvalid(S);
-}
-
 template <class Emitter>
 bool ByteCodeStmtGen<Emitter>::visitAttributedStmt(const AttributedStmt *S) {
   // Ignore all attributes.
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.h b/clang/lib/AST/Interp/ByteCodeStmtGen.h
index 64e03587ab211..ab7a591fb798e 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.h
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.h
@@ -63,7 +63,6 @@ class ByteCodeStmtGen final : public ByteCodeExprGen<Emitter> {
   bool visitSwitchStmt(const SwitchStmt *S);
   bool visitCaseStmt(const CaseStmt *S);
   bool visitDefaultStmt(const DefaultStmt *S);
-  bool visitAsmStmt(const AsmStmt *S);
   bool visitAttributedStmt(const AttributedStmt *S);
   bool visitCXXTryStmt(const CXXTryStmt *S);
 
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index ce7ed9cec3db3..a4ccc0236d292 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -233,9 +233,10 @@ static BlockMoveFn getMoveArrayPrim(PrimType Type) {
 Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsConst, bool IsTemporary, bool IsMutable)
     : Source(D), ElemSize(primSize(Type)), Size(ElemSize),
-      MDSize(MD.value_or(0)), AllocSize(align(Size + MDSize)), IsConst(IsConst),
-      IsMutable(IsMutable), IsTemporary(IsTemporary), CtorFn(getCtorPrim(Type)),
-      DtorFn(getDtorPrim(Type)), MoveFn(getMovePrim(Type)) {
+      MDSize(MD.value_or(0)), AllocSize(align(Size + MDSize)), PrimT(Type),
+      IsConst(IsConst), IsMutable(IsMutable), IsTemporary(IsTemporary),
+      CtorFn(getCtorPrim(Type)), DtorFn(getDtorPrim(Type)),
+      MoveFn(getMovePrim(Type)) {
   assert(AllocSize >= Size);
   assert(Source && "Missing source");
 }
@@ -246,7 +247,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsMutable)
     : Source(D), ElemSize(primSize(Type)), Size(ElemSize * NumElems),
       MDSize(MD.value_or(0)),
-      AllocSize(align(MDSize) + align(Size) + sizeof(InitMapPtr)),
+      AllocSize(align(MDSize) + align(Size) + sizeof(InitMapPtr)), PrimT(Type),
       IsConst(IsConst), IsMutable(IsMutable), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
       DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
@@ -300,10 +301,19 @@ Descriptor::Descriptor(const DeclTy &D, const Record *R, MetadataSize MD,
   assert(Source && "Missing source");
 }
 
-Descriptor::Descriptor(const DeclTy &D, MetadataSize MD)
-    : Source(D), ElemSize(1), Size(ElemSize), MDSize(MD.value_or(0)),
-      AllocSize(Size + MDSize), ElemRecord(nullptr), IsConst(true),
-      IsMutable(false), IsTemporary(false), IsDummy(true) {
+/// Dummy.
+Descriptor::Descriptor(const DeclTy &D)
+    : Source(D), ElemSize(1), Size(1), MDSize(0), AllocSize(MDSize),
+      ElemRecord(nullptr), IsConst(true), IsMutable(false), IsTemporary(false),
+      IsDummy(true) {
+  assert(Source && "Missing source");
+}
+
+/// Dummy array.
+Descriptor::Descriptor(const DeclTy &D, UnknownSize)
+    : Source(D), ElemSize(1), Size(UnknownSizeMark), MDSize(0),
+      AllocSize(MDSize), ElemRecord(nullptr), IsConst(true), IsMutable(false),
+      IsTemporary(false), IsArray(true), IsDummy(true) {
   assert(Source && "Missing source");
 }
 
diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h
index 0f64d678f3ef6..4e257361ad146 100644
--- a/clang/lib/AST/Interp/Descriptor.h
+++ b/clang/lib/AST/Interp/Descriptor.h
@@ -112,6 +112,10 @@ struct Descriptor final {
   const Record *const ElemRecord = nullptr;
   /// Descriptor of the array element.
   const Descriptor *const ElemDesc = nullptr;
+  /// The primitive type this descriptor was created for,
+  /// or the primitive element type in case this is
+  /// a primitive array.
+  const std::optional<PrimType> PrimT = std::nullopt;
   /// Flag indicating if the block is mutable.
   const bool IsConst = false;
   /// Flag indicating if a field is mutable.
@@ -152,7 +156,11 @@ struct Descriptor final {
   Descriptor(const DeclTy &D, const Record *R, MetadataSize MD, bool IsConst,
              bool IsTemporary, bool IsMutable);
 
-  Descriptor(const DeclTy &D, MetadataSize MD);
+  /// Allocates a dummy descriptor.
+  Descriptor(const DeclTy &D);
+
+  /// Allocates a dummy array descriptor.
+  Descriptor(const DeclTy &D, UnknownSize);
 
   QualType getType() const;
   QualType getElemQualType() const;
@@ -183,6 +191,11 @@ struct Descriptor final {
     return Size;
   }
 
+  PrimType getPrimType() const {
+    assert(isPrimitiveArray() || isPrimitive());
+    return *PrimT;
+  }
+
   /// Returns the allocated size, including metadata.
   unsigned getAllocSize() const { return AllocSize; }
   /// returns the size of an element when the structure is viewed as an array.
diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp
index 315ddb293044b..01ef1c24744a5 100644
--- a/clang/lib/AST/Interp/Disasm.cpp
+++ b/clang/lib/AST/Interp/Disasm.cpp
@@ -10,9 +10,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "Boolean.h"
 #include "Floating.h"
 #include "Function.h"
+#include "FunctionPointer.h"
+#include "Integral.h"
 #include "IntegralAP.h"
+#include "InterpFrame.h"
 #include "Opcode.h"
 #include "PrimType.h"
 #include "Program.h"
@@ -86,6 +90,40 @@ LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const {
 
 LLVM_DUMP_METHOD void Program::dump() const { dump(llvm::errs()); }
 
+static const char *primTypeToString(PrimType T) {
+  switch (T) {
+  case PT_Sint8:
+    return "Sint8";
+  case PT_Uint8:
+    return "Uint8";
+  case PT_Sint16:
+    return "Sint16";
+  case PT_Uint16:
+    return "Uint16";
+  case PT_Sint32:
+    return "Sint32";
+  case PT_Uint32:
+    return "Uint32";
+  case PT_Sint64:
+    return "Sint64";
+  case PT_Uint64:
+    return "Uint64";
+  case PT_IntAP:
+    return "IntAP";
+  case PT_IntAPS:
+    return "IntAPS";
+  case PT_Bool:
+    return "Bool";
+  case PT_Float:
+    return "Float";
+  case PT_Ptr:
+    return "Ptr";
+  case PT_FnPtr:
+    return "FnPtr";
+  }
+  llvm_unreachable("Unhandled PrimType");
+}
+
 LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const {
   {
     ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true});
@@ -100,9 +138,10 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const {
   unsigned GI = 0;
   for (const Global *G : Globals) {
     const Descriptor *Desc = G->block()->getDescriptor();
+    Pointer GP = getPtrGlobal(GI);
+
     OS << GI << ": " << (void *)G->block() << " ";
     {
-      Pointer GP = getPtrGlobal(GI);
       ColorScope SC(OS, true,
                     GP.isInitialized()
                         ? TerminalColor{llvm::raw_ostream::GREEN, false}
@@ -111,6 +150,15 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const {
     }
     Desc->dump(OS);
     OS << "\n";
+    if (Desc->isPrimitive() && !Desc->isDummy()) {
+      OS << "   ";
+      {
+        ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_CYAN, false});
+        OS << primTypeToString(Desc->getPrimType()) << " ";
+      }
+      TYPE_SWITCH(Desc->getPrimType(), { GP.deref<T>().print(OS); });
+      OS << "\n";
+    }
     ++GI;
   }
 
@@ -136,7 +184,7 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
   {
     ColorScope SC(OS, true, {llvm::raw_ostream::BLUE, true});
     if (const auto *ND = dyn_cast_if_present<NamedDecl>(asDecl()))
-      OS << ND->getName();
+      ND->printQualifiedName(OS);
     else if (asExpr())
       OS << "expr (TODO)";
   }
@@ -159,3 +207,29 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
   if (isDummy())
     OS << " dummy";
 }
+
+LLVM_DUMP_METHOD void InterpFrame::dump(llvm::raw_ostream &OS,
+                                        unsigned Indent) const {
+  unsigned Spaces = Indent * 2;
+  {
+    ColorScope SC(OS, true, {llvm::raw_ostream::BLUE, true});
+    OS.indent(Spaces);
+    if (getCallee())
+      describe(OS);
+    else
+      OS << "Frame (Depth: " << getDepth() << ")";
+    OS << "\n";
+  }
+  OS.indent(Spaces) << "Function: " << getFunction();
+  if (const Function *F = getFunction()) {
+    OS << " (" << F->getName() << ")";
+  }
+  OS << "\n";
+  OS.indent(Spaces) << "This: " << getThis() << "\n";
+  OS.indent(Spaces) << "RVO: " << getRVOPtr() << "\n";
+
+  while (const InterpFrame *F = this->Caller) {
+    F->dump(OS, Indent + 1);
+    F = F->Caller;
+  }
+}
diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index 4f3cd6cd21a15..0ce64a572c263 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -254,10 +254,10 @@ bool CheckConstant(InterpState &S, CodePtr OpPC, const Descriptor *Desc) {
     if (VD->isConstexpr())
       return true;
 
+    QualType T = VD->getType();
     if (S.getLangOpts().CPlusPlus && !S.getLangOpts().CPlusPlus11)
-      return false;
+      return T->isSignedIntegerOrEnumerationType() || T->isUnsignedIntegerOrEnumerationType();
 
-    QualType T = VD->getType();
     if (T.isConstQualified())
       return true;
 
@@ -485,7 +485,9 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
         // Don't emit anything if the function isn't defined and we're checking
         // for a constant expression. It might be defined at the point we're
         // actually calling it.
-        if (!DiagDecl->isDefined() && S.checkingPotentialConstantExpression())
+        bool IsExtern = DiagDecl->getStorageClass() == SC_Extern;
+        if (!DiagDecl->isDefined() && !IsExtern &&
+            S.checkingPotentialConstantExpression())
           return false;
 
         // If the declaration is defined _and_ declared 'constexpr', the below
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index db80e2d59753f..405993eb82703 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -122,6 +122,9 @@ bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F,
 bool SetThreeWayComparisonField(InterpState &S, CodePtr OpPC,
                                 const Pointer &Ptr, const APSInt &IntValue);
 
+/// Copy the contents of Src into Dest.
+bool DoMemcpy(InterpState &S, CodePtr OpPC, const Pointer &Src, Pointer &Dest);
+
 /// Checks if the shift operation is legal.
 template <typename LT, typename RT>
 bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
@@ -165,7 +168,8 @@ bool CheckDivRem(InterpState &S, CodePtr OpPC, const T &LHS, const T &RHS) {
     const auto *Op = cast<BinaryOperator>(S.Current->getExpr(OpPC));
     S.FFDiag(Op, diag::note_expr_divide_by_zero)
         << Op->getRHS()->getSourceRange();
-    return false;
+    if constexpr (!std::is_same_v<T, Floating>)
+      return false;
   }
 
   if (LHS.isSigned() && LHS.isMin() && RHS.isNegative() && RHS.isMinusOne()) {
@@ -1487,6 +1491,16 @@ bool InitElemPop(InterpState &S, CodePtr OpPC, uint32_t Idx) {
   return true;
 }
 
+inline bool Memcpy(InterpState &S, CodePtr OpPC) {
+  const Pointer &Src = S.Stk.pop<Pointer>();
+  Pointer &Dest = S.Stk.peek<Pointer>();
+
+  if (!CheckLoad(S, OpPC, Src))
+    return false;
+
+  return DoMemcpy(S, OpPC, Src, Dest);
+}
+
 //===----------------------------------------------------------------------===//
 // AddOffset, SubOffset
 //===----------------------------------------------------------------------===//
@@ -1933,8 +1947,15 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) {
   const T &Offset = S.Stk.pop<T>();
   const Pointer &Ptr = S.Stk.peek<Pointer>();
 
-  if (Ptr.isDummy())
-    return true;
+  if (!Ptr.isZero()) {
+    if (!CheckArray(S, OpPC, Ptr))
+      return false;
+
+    if (Ptr.isDummy()) {
+      S.Stk.push<Pointer>(Ptr);
+      return true;
+    }
+  }
 
   if (!OffsetHelper<T, ArithOp::Add>(S, OpPC, Offset, Ptr))
     return false;
@@ -1947,9 +1968,14 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) {
   const T &Offset = S.Stk.pop<T>();
   const Pointer &Ptr = S.Stk.pop<Pointer>();
 
-  if (Ptr.isDummy()) {
-    S.Stk.push<Pointer>(Ptr);
-    return true;
+  if (!Ptr.isZero()) {
+    if (!CheckArray(S, OpPC, Ptr))
+      return false;
+
+    if (Ptr.isDummy()) {
+      S.Stk.push<Pointer>(Ptr);
+      return true;
+    }
   }
 
   if (!OffsetHelper<T, ArithOp::Add>(S, OpPC, Offset, Ptr))
@@ -2201,6 +2227,9 @@ inline bool Invalid(InterpState &S, CodePtr OpPC) {
   return false;
 }
 
+/// Do nothing and just abort execution.
+inline bool Error(InterpState &S, CodePtr OpPC) { return false; }
+
 /// Same here, but only for casts.
 inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind) {
   const SourceLocation &Loc = S.Current->getLocation(OpPC);
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index c500b9d502d70..1bf5d55314f1f 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -119,6 +119,36 @@ static bool retPrimValue(InterpState &S, CodePtr OpPC, APValue &Result,
 #undef RET_CASE
 }
 
+static bool interp__builtin_is_constant_evaluated(InterpState &S, CodePtr OpPC,
+                                                  const InterpFrame *Frame,
+                                                  const CallExpr *Call) {
+  // The current frame is the one for __builtin_is_constant_evaluated.
+  // The one above that, potentially the one for std::is_constant_evaluated().
+  if (S.inConstantContext() && !S.checkingPotentialConstantExpression() &&
+      Frame->Caller && S.getEvalStatus().Diag) {
+    auto isStdCall = [](const FunctionDecl *F) -> bool {
+      return F && F->isInStdNamespace() && F->getIdentifier() &&
+             F->getIdentifier()->isStr("is_constant_evaluated");
+    };
+    const InterpFrame *Caller = Frame->Caller;
+
+    if (Caller->Caller && isStdCall(Caller->getCallee())) {
+      const Expr *E = Caller->Caller->getExpr(Caller->getRetPC());
+      S.report(E->getExprLoc(),
+               diag::warn_is_constant_evaluated_always_true_constexpr)
+          << "std::is_constant_evaluated";
+    } else {
+      const Expr *E = Frame->Caller->getExpr(Frame->getRetPC());
+      S.report(E->getExprLoc(),
+               diag::warn_is_constant_evaluated_always_true_constexpr)
+          << "__builtin_is_constant_evaluated";
+    }
+  }
+
+  S.Stk.push<Boolean>(Boolean::from(S.inConstantContext()));
+  return true;
+}
+
 static bool interp__builtin_strcmp(InterpState &S, CodePtr OpPC,
                                    const InterpFrame *Frame,
                                    const CallExpr *Call) {
@@ -533,11 +563,12 @@ static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC,
                                    const InterpFrame *Frame,
                                    const Function *Func, const CallExpr *Call,
                                    bool Right) {
-  PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType());
-  assert(ArgT == *S.getContext().classify(Call->getArg(1)->getType()));
+  PrimType AmountT = *S.getContext().classify(Call->getArg(1)->getType());
+  PrimType ValueT = *S.getContext().classify(Call->getArg(0)->getType());
 
-  APSInt Amount = peekToAPSInt(S.Stk, ArgT);
-  APSInt Value = peekToAPSInt(S.Stk, ArgT, align(primSize(ArgT)) * 2);
+  APSInt Amount = peekToAPSInt(S.Stk, AmountT);
+  APSInt Value = peekToAPSInt(
+      S.Stk, ValueT, align(primSize(AmountT)) + align(primSize(ValueT)));
 
   APSInt Result;
   if (Right)
@@ -605,10 +636,9 @@ static bool interp__builtin_eh_return_data_regno(InterpState &S, CodePtr OpPC,
   return true;
 }
 
-static bool interp__builtin_launder(InterpState &S, CodePtr OpPC,
-                                    const InterpFrame *Frame,
-                                    const Function *Func,
-                                    const CallExpr *Call) {
+/// Just takes the first Argument to the call and puts it on the stack.
+static bool noopPointer(InterpState &S, CodePtr OpPC, const InterpFrame *Frame,
+                        const Function *Func, const CallExpr *Call) {
   const Pointer &Arg = S.Stk.peek<Pointer>();
   S.Stk.push<Pointer>(Arg);
   return true;
@@ -917,14 +947,15 @@ static bool interp__builtin_complex(InterpState &S, CodePtr OpPC,
 
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
                       const CallExpr *Call) {
-  InterpFrame *Frame = S.Current;
+  const InterpFrame *Frame = S.Current;
   APValue Dummy;
 
   std::optional<PrimType> ReturnT = S.getContext().classify(Call);
 
   switch (F->getBuiltinID()) {
   case Builtin::BI__builtin_is_constant_evaluated:
-    S.Stk.push<Boolean>(Boolean::from(S.inConstantContext()));
+    if (!interp__builtin_is_constant_evaluated(S, OpPC, Frame, Call))
+      return false;
     break;
   case Builtin::BI__builtin_assume:
   case Builtin::BI__assume:
@@ -1143,7 +1174,9 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
     break;
 
   case Builtin::BI__builtin_launder:
-    if (!interp__builtin_launder(S, OpPC, Frame, F, Call))
+  case Builtin::BI__builtin___CFStringMakeConstantString:
+  case Builtin::BI__builtin___NSStringMakeConstantString:
+    if (!noopPointer(S, OpPC, Frame, F, Call))
       return false;
     break;
 
@@ -1326,5 +1359,50 @@ bool SetThreeWayComparisonField(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+bool DoMemcpy(InterpState &S, CodePtr OpPC, const Pointer &Src, Pointer &Dest) {
+  assert(Src.isLive() && Dest.isLive());
+
+  [[maybe_unused]] const Descriptor *SrcDesc = Src.getFieldDesc();
+  const Descriptor *DestDesc = Dest.getFieldDesc();
+
+  assert(!DestDesc->isPrimitive() && !SrcDesc->isPrimitive());
+
+  if (DestDesc->isPrimitiveArray()) {
+    assert(SrcDesc->isPrimitiveArray());
+    assert(SrcDesc->getNumElems() == DestDesc->getNumElems());
+    PrimType ET = DestDesc->getPrimType();
+    for (unsigned I = 0, N = DestDesc->getNumElems(); I != N; ++I) {
+      Pointer DestElem = Dest.atIndex(I);
+      TYPE_SWITCH(ET, {
+        DestElem.deref<T>() = Src.atIndex(I).deref<T>();
+        DestElem.initialize();
+      });
+    }
+    return true;
+  }
+
+  if (DestDesc->isRecord()) {
+    assert(SrcDesc->isRecord());
+    assert(SrcDesc->ElemRecord == DestDesc->ElemRecord);
+    const Record *R = DestDesc->ElemRecord;
+    for (const Record::Field &F : R->fields()) {
+      Pointer DestField = Dest.atField(F.Offset);
+      if (std::optional<PrimType> FT = S.Ctx.classify(F.Decl->getType())) {
+        TYPE_SWITCH(*FT, {
+          DestField.deref<T>() = Src.atField(F.Offset).deref<T>();
+          DestField.initialize();
+        });
+      } else {
+        return Invalid(S, OpPC);
+      }
+    }
+    return true;
+  }
+
+  // FIXME: Composite types.
+
+  return Invalid(S, OpPC);
+}
+
 } // namespace interp
 } // namespace clang
diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp
index f69ff06b5e81b..12e2e6ff9155b 100644
--- a/clang/lib/AST/Interp/InterpFrame.cpp
+++ b/clang/lib/AST/Interp/InterpFrame.cpp
@@ -190,6 +190,8 @@ SourceRange InterpFrame::getCallRange() const {
 }
 
 const FunctionDecl *InterpFrame::getCallee() const {
+  if (!Func)
+    return nullptr;
   return Func->getDecl();
 }
 
diff --git a/clang/lib/AST/Interp/InterpFrame.h b/clang/lib/AST/Interp/InterpFrame.h
index 322d5dcfa698a..1f80a0a5d2c49 100644
--- a/clang/lib/AST/Interp/InterpFrame.h
+++ b/clang/lib/AST/Interp/InterpFrame.h
@@ -123,6 +123,9 @@ class InterpFrame final : public Frame {
 
   unsigned getDepth() const { return Depth; }
 
+  void dump() const { dump(llvm::errs(), 0); }
+  void dump(llvm::raw_ostream &OS, unsigned Indent = 0) const;
+
 private:
   /// Returns an original argument from the stack.
   template <typename T> const T &stackRef(unsigned Offset) const {
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index 9b99aa0ccb558..cc1310f4c0d52 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -706,6 +706,7 @@ def Dup : Opcode {
 
 // [] -> []
 def Invalid : Opcode {}
+def Error : Opcode {}
 def InvalidCast : Opcode {
   let Args = [ArgCastKind];
 }
@@ -720,3 +721,5 @@ def CheckNonNullArg : Opcode {
   let Types = [PtrTypeClass];
   let HasGroup = 1;
 }
+
+def Memcpy : Opcode;
diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp
index 3f85635f43674..af60ced0e10e9 100644
--- a/clang/lib/AST/Interp/Pointer.cpp
+++ b/clang/lib/AST/Interp/Pointer.cpp
@@ -320,10 +320,10 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx) const {
     // Complex types.
     if (const auto *CT = Ty->getAs<ComplexType>()) {
       QualType ElemTy = CT->getElementType();
-      std::optional<PrimType> ElemT = Ctx.classify(ElemTy);
-      assert(ElemT);
 
       if (ElemTy->isIntegerType()) {
+        std::optional<PrimType> ElemT = Ctx.classify(ElemTy);
+        assert(ElemT);
         INT_TYPE_SWITCH(*ElemT, {
           auto V1 = Ptr.atIndex(0).deref<T>();
           auto V2 = Ptr.atIndex(1).deref<T>();
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index 34ecdb967960d..fffb4aba492fc 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -285,6 +285,11 @@ class Pointer {
   bool inPrimitiveArray() const { return getFieldDesc()->isPrimitiveArray(); }
   /// Checks if the structure is an array of unknown size.
   bool isUnknownSizeArray() const {
+    // If this points inside a dummy block, return true.
+    // FIXME: This might change in the future. If it does, we need
+    // to set the proper Ctor/Dtor functions for dummy Descriptors.
+    if (Base != 0 && Base != sizeof(InlineDescriptor) && isDummy())
+      return true;
     return getFieldDesc()->isUnknownSizeArray();
   }
   /// Checks if the pointer points to an array.
diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp
index 86e18ede63811..da6f72c62115d 100644
--- a/clang/lib/AST/Interp/Program.cpp
+++ b/clang/lib/AST/Interp/Program.cpp
@@ -145,11 +145,20 @@ std::optional<unsigned> Program::getOrCreateGlobal(const ValueDecl *VD,
 
 std::optional<unsigned> Program::getOrCreateDummy(const ValueDecl *VD) {
   // Dedup blocks since they are immutable and pointers cannot be compared.
-  if (auto It = DummyParams.find(VD); It != DummyParams.end())
+  if (auto It = DummyVariables.find(VD); It != DummyVariables.end())
     return It->second;
 
   // Create dummy descriptor.
-  Descriptor *Desc = allocateDescriptor(VD, std::nullopt);
+  // We create desriptors of 'array of unknown size' if the type is an array
+  // type _and_ the size isn't known (it's not a ConstantArrayType). If the size
+  // is known however, we create a regular dummy pointer.
+  Descriptor *Desc;
+  if (const auto *AT = VD->getType()->getAsArrayTypeUnsafe();
+      AT && !isa<ConstantArrayType>(AT))
+    Desc = allocateDescriptor(VD, Descriptor::UnknownSize{});
+  else
+    Desc = allocateDescriptor(VD);
+
   // Allocate a block for storage.
   unsigned I = Globals.size();
 
@@ -158,7 +167,7 @@ std::optional<unsigned> Program::getOrCreateDummy(const ValueDecl *VD) {
   G->block()->invokeCtor();
 
   Globals.push_back(G);
-  DummyParams[VD] = I;
+  DummyVariables[VD] = I;
   return I;
 }
 
diff --git a/clang/lib/AST/Interp/Program.h b/clang/lib/AST/Interp/Program.h
index 50bdb575e805c..36b5a1faa513a 100644
--- a/clang/lib/AST/Interp/Program.h
+++ b/clang/lib/AST/Interp/Program.h
@@ -208,7 +208,7 @@ class Program final {
   llvm::DenseMap<const RecordDecl *, Record *> Records;
 
   /// Dummy parameter to generate pointers from.
-  llvm::DenseMap<const ValueDecl *, unsigned> DummyParams;
+  llvm::DenseMap<const ValueDecl *, unsigned> DummyVariables;
 
   /// Creates a new descriptor.
   template <typename... Ts>
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 67e021a4ff258..06600b6ab8060 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -2431,6 +2431,7 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty,
   case Type::MacroQualified:
   case Type::BitInt:
   case Type::DependentBitInt:
+  case Type::CountAttributed:
     llvm_unreachable("type is illegal as a nested name specifier");
 
   case Type::SubstTemplateTypeParmPack:
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index b26141de7459e..9cccfe8d064fe 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -390,6 +390,7 @@ class MicrosoftCXXNameMangler {
                           const FunctionDecl *D = nullptr,
                           bool ForceThisQuals = false,
                           bool MangleExceptionSpec = true);
+  void mangleSourceName(StringRef Name);
   void mangleNestedName(GlobalDecl GD);
 
 private:
@@ -408,7 +409,6 @@ class MicrosoftCXXNameMangler {
     mangleUnqualifiedName(GD, cast<NamedDecl>(GD.getDecl())->getDeclName());
   }
   void mangleUnqualifiedName(GlobalDecl GD, DeclarationName Name);
-  void mangleSourceName(StringRef Name);
   void mangleOperatorName(OverloadedOperatorKind OO, SourceLocation Loc);
   void mangleCXXDtorType(CXXDtorType T);
   void mangleQualifiers(Qualifiers Quals, bool IsMember);
@@ -3953,7 +3953,8 @@ void MicrosoftMangleContextImpl::mangleThreadSafeStaticGuardVariable(
   msvc_hashing_ostream MHO(Out);
   MicrosoftCXXNameMangler Mangler(*this, MHO);
 
-  Mangler.getStream() << "?$TSS" << GuardNum << '@';
+  Mangler.getStream() << "?";
+  Mangler.mangleSourceName("$TSS" + llvm::utostr(GuardNum));
   Mangler.mangleNestedName(VD);
   Mangler.getStream() << "@4HA";
 }
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 3dea6ca4e3122..20887bab2dd70 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -385,6 +385,26 @@ void DependentBitIntType::Profile(llvm::FoldingSetNodeID &ID,
   NumBitsExpr->Profile(ID, Context, true);
 }
 
+bool BoundsAttributedType::referencesFieldDecls() const {
+  return llvm::any_of(dependent_decls(),
+                      [](const TypeCoupledDeclRefInfo &Info) {
+                        return isa<FieldDecl>(Info.getDecl());
+                      });
+}
+
+void CountAttributedType::Profile(llvm::FoldingSetNodeID &ID,
+                                  QualType WrappedTy, Expr *CountExpr,
+                                  bool CountInBytes, bool OrNull) {
+  ID.AddPointer(WrappedTy.getAsOpaquePtr());
+  ID.AddBoolean(CountInBytes);
+  ID.AddBoolean(OrNull);
+  // We profile it as a pointer as the StmtProfiler considers parameter
+  // expressions on function declaration and function definition as the
+  // same, resulting in count expression being evaluated with ParamDecl
+  // not in the function scope.
+  ID.AddPointer(CountExpr);
+}
+
 /// getArrayElementTypeNoTypeQual - If this is an array type, return the
 /// element type of the array, potentially with type qualifiers missing.
 /// This method should never be used when type qualifiers are meaningful.
@@ -559,6 +579,14 @@ template <> const AttributedType *Type::getAs() const {
   return getAsSugar<AttributedType>(this);
 }
 
+template <> const BoundsAttributedType *Type::getAs() const {
+  return getAsSugar<BoundsAttributedType>(this);
+}
+
+template <> const CountAttributedType *Type::getAs() const {
+  return getAsSugar<CountAttributedType>(this);
+}
+
 /// getUnqualifiedDesugaredType - Pull any qualifiers and syntactic
 /// sugar off the given type.  This should produce an object of the
 /// same dynamic type as the canonical type.
@@ -641,6 +669,10 @@ bool Type::isScopedEnumeralType() const {
   return false;
 }
 
+bool Type::isCountAttributedType() const {
+  return getAs<CountAttributedType>();
+}
+
 const ComplexType *Type::getAsComplexIntegerType() const {
   if (const auto *Complex = getAs<ComplexType>())
     if (Complex->getElementType()->isIntegerType())
@@ -3715,6 +3747,43 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID,
           getExtProtoInfo(), Ctx, isCanonicalUnqualified());
 }
 
+TypeCoupledDeclRefInfo::TypeCoupledDeclRefInfo(ValueDecl *D, bool Deref)
+    : Data(D, Deref << DerefShift) {}
+
+bool TypeCoupledDeclRefInfo::isDeref() const {
+  return Data.getInt() & DerefMask;
+}
+ValueDecl *TypeCoupledDeclRefInfo::getDecl() const { return Data.getPointer(); }
+unsigned TypeCoupledDeclRefInfo::getInt() const { return Data.getInt(); }
+void *TypeCoupledDeclRefInfo::getOpaqueValue() const {
+  return Data.getOpaqueValue();
+}
+bool TypeCoupledDeclRefInfo::operator==(
+    const TypeCoupledDeclRefInfo &Other) const {
+  return getOpaqueValue() == Other.getOpaqueValue();
+}
+void TypeCoupledDeclRefInfo::setFromOpaqueValue(void *V) {
+  Data.setFromOpaqueValue(V);
+}
+
+BoundsAttributedType::BoundsAttributedType(TypeClass TC, QualType Wrapped,
+                                           QualType Canon)
+    : Type(TC, Canon, Wrapped->getDependence()), WrappedTy(Wrapped) {}
+
+CountAttributedType::CountAttributedType(
+    QualType Wrapped, QualType Canon, Expr *CountExpr, bool CountInBytes,
+    bool OrNull, ArrayRef<TypeCoupledDeclRefInfo> CoupledDecls)
+    : BoundsAttributedType(CountAttributed, Wrapped, Canon),
+      CountExpr(CountExpr) {
+  CountAttributedTypeBits.NumCoupledDecls = CoupledDecls.size();
+  CountAttributedTypeBits.CountInBytes = CountInBytes;
+  CountAttributedTypeBits.OrNull = OrNull;
+  auto *DeclSlot = getTrailingObjects<TypeCoupledDeclRefInfo>();
+  Decls = llvm::ArrayRef(DeclSlot, CoupledDecls.size());
+  for (unsigned i = 0; i != CoupledDecls.size(); ++i)
+    DeclSlot[i] = CoupledDecls[i];
+}
+
 TypedefType::TypedefType(TypeClass tc, const TypedefNameDecl *D,
                          QualType Underlying, QualType can)
     : Type(tc, can, toSemanticDependence(can->getDependence())),
diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp
index 90ef2f8025e67..b85c7b802f2ad 100644
--- a/clang/lib/AST/TypeLoc.cpp
+++ b/clang/lib/AST/TypeLoc.cpp
@@ -521,6 +521,10 @@ SourceRange AttributedTypeLoc::getLocalSourceRange() const {
   return getAttr() ? getAttr()->getRange() : SourceRange();
 }
 
+SourceRange CountAttributedTypeLoc::getLocalSourceRange() const {
+  return getCountExpr() ? getCountExpr()->getSourceRange() : SourceRange();
+}
+
 SourceRange BTFTagAttributedTypeLoc::getLocalSourceRange() const {
   return getAttr() ? getAttr()->getRange() : SourceRange();
 }
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index 55c6eccafa56c..3f630f8c94548 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -288,6 +288,7 @@ bool TypePrinter::canPrefixQualifiers(const Type *T,
     case Type::PackExpansion:
     case Type::SubstTemplateTypeParm:
     case Type::MacroQualified:
+    case Type::CountAttributed:
       CanPrefixQualifiers = false;
       break;
 
@@ -1649,6 +1650,17 @@ void TypePrinter::printElaboratedBefore(const ElaboratedType *T,
          OS << " ";
     }
     NestedNameSpecifier *Qualifier = T->getQualifier();
+    if (!Policy.SuppressTagKeyword && Policy.SuppressScope &&
+        !Policy.SuppressUnwrittenScope) {
+      bool OldTagKeyword = Policy.SuppressTagKeyword;
+      bool OldSupressScope = Policy.SuppressScope;
+      Policy.SuppressTagKeyword = true;
+      Policy.SuppressScope = false;
+      printBefore(T->getNamedType(), OS);
+      Policy.SuppressTagKeyword = OldTagKeyword;
+      Policy.SuppressScope = OldSupressScope;
+      return;
+    }
     if (Qualifier && !(Policy.SuppressTypedefs &&
                        T->getNamedType()->getTypeClass() == Type::Typedef))
       Qualifier->print(OS, Policy);
@@ -1732,6 +1744,36 @@ void TypePrinter::printPackExpansionAfter(const PackExpansionType *T,
   OS << "...";
 }
 
+static void printCountAttributedImpl(const CountAttributedType *T,
+                                     raw_ostream &OS,
+                                     const PrintingPolicy &Policy) {
+  if (T->isCountInBytes() && T->isOrNull())
+    OS << " __sized_by_or_null(";
+  else if (T->isCountInBytes())
+    OS << " __sized_by(";
+  else if (T->isOrNull())
+    OS << " __counted_by_or_null(";
+  else
+    OS << " __counted_by(";
+  if (T->getCountExpr())
+    T->getCountExpr()->printPretty(OS, nullptr, Policy);
+  OS << ')';
+}
+
+void TypePrinter::printCountAttributedBefore(const CountAttributedType *T,
+                                             raw_ostream &OS) {
+  printBefore(T->desugar(), OS);
+  if (!T->desugar()->isArrayType())
+    printCountAttributedImpl(T, OS, Policy);
+}
+
+void TypePrinter::printCountAttributedAfter(const CountAttributedType *T,
+                                            raw_ostream &OS) {
+  printAfter(T->desugar(), OS);
+  if (T->desugar()->isArrayType())
+    printCountAttributedImpl(T, OS, Policy);
+}
+
 void TypePrinter::printAttributedBefore(const AttributedType *T,
                                         raw_ostream &OS) {
   // FIXME: Generate this with TableGen.
@@ -1866,6 +1908,7 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
     OS << "pipe";
     break;
 
+  case attr::CountedBy:
   case attr::LifetimeBound:
   case attr::TypeNonNull:
   case attr::TypeNullable:
diff --git a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
similarity index 89%
rename from clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp
rename to clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
index 7c9f8fbb0a700..3813b8c3ee8a2 100644
--- a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp
+++ b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
@@ -1,4 +1,4 @@
-//===- ControlFlowContext.cpp ---------------------------------------------===//
+//===- AdornedCFG.cpp ---------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file defines a ControlFlowContext class that is used by dataflow
-//  analyses that run over Control-Flow Graphs (CFGs).
+//  This file defines an `AdornedCFG` class that is used by dataflow analyses
+//  that run over Control-Flow Graphs (CFGs).
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/Stmt.h"
@@ -126,8 +126,7 @@ buildContainsExprConsumedInDifferentBlock(
   return Result;
 }
 
-llvm::Expected<ControlFlowContext>
-ControlFlowContext::build(const FunctionDecl &Func) {
+llvm::Expected<AdornedCFG> AdornedCFG::build(const FunctionDecl &Func) {
   if (!Func.doesThisDeclarationHaveABody())
     return llvm::createStringError(
         std::make_error_code(std::errc::invalid_argument),
@@ -136,8 +135,8 @@ ControlFlowContext::build(const FunctionDecl &Func) {
   return build(Func, *Func.getBody(), Func.getASTContext());
 }
 
-llvm::Expected<ControlFlowContext>
-ControlFlowContext::build(const Decl &D, Stmt &S, ASTContext &C) {
+llvm::Expected<AdornedCFG> AdornedCFG::build(const Decl &D, Stmt &S,
+                                             ASTContext &C) {
   if (D.isTemplated())
     return llvm::createStringError(
         std::make_error_code(std::errc::invalid_argument),
@@ -175,9 +174,9 @@ ControlFlowContext::build(const Decl &D, Stmt &S, ASTContext &C) {
   llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock =
       buildContainsExprConsumedInDifferentBlock(*Cfg, StmtToBlock);
 
-  return ControlFlowContext(D, std::move(Cfg), std::move(StmtToBlock),
-                            std::move(BlockReachable),
-                            std::move(ContainsExprConsumedInDifferentBlock));
+  return AdornedCFG(D, std::move(Cfg), std::move(StmtToBlock),
+                    std::move(BlockReachable),
+                    std::move(ContainsExprConsumedInDifferentBlock));
 }
 
 } // namespace dataflow
diff --git a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt
index 5af4ecfc9efa5..a3b5d9adc24bd 100644
--- a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt
+++ b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_clang_library(clangAnalysisFlowSensitive
+  AdornedCFG.cpp
   Arena.cpp
-  ControlFlowContext.cpp
   DataflowAnalysisContext.cpp
   DataflowEnvironment.cpp
   Formula.cpp
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
index f4c4af022f51f..d520539dd2535 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
@@ -288,8 +288,8 @@ void DataflowAnalysisContext::dumpFlowCondition(Atom Token,
   }
 }
 
-const ControlFlowContext *
-DataflowAnalysisContext::getControlFlowContext(const FunctionDecl *F) {
+const AdornedCFG *
+DataflowAnalysisContext::getAdornedCFG(const FunctionDecl *F) {
   // Canonicalize the key:
   F = F->getDefinition();
   if (F == nullptr)
@@ -299,10 +299,10 @@ DataflowAnalysisContext::getControlFlowContext(const FunctionDecl *F) {
     return &It->second;
 
   if (F->doesThisDeclarationHaveABody()) {
-    auto CFCtx = ControlFlowContext::build(*F);
+    auto ACFG = AdornedCFG::build(*F);
     // FIXME: Handle errors.
-    assert(CFCtx);
-    auto Result = FunctionContexts.insert({F, std::move(*CFCtx)});
+    assert(ACFG);
+    auto Result = FunctionContexts.insert({F, std::move(*ACFG)});
     return &Result.first->second;
   }
 
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 1d2bd9a9b08af..cc1ebd511191a 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -771,6 +771,7 @@ static bool isOriginalRecordConstructor(const Expr &RecordPRValue) {
     return !Init->isSemanticForm() || !Init->isTransparent();
   return isa<CXXConstructExpr>(RecordPRValue) || isa<CallExpr>(RecordPRValue) ||
          isa<LambdaExpr>(RecordPRValue) ||
+         isa<CXXDefaultArgExpr>(RecordPRValue) ||
          isa<CXXDefaultInitExpr>(RecordPRValue) ||
          // The framework currently does not propagate the objects created in
          // the two branches of a `ConditionalOperator` because there is no way
diff --git a/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp b/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp
index 6afd66d9dc6ac..397a8d87e114d 100644
--- a/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp
+++ b/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp
@@ -54,7 +54,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DebugSupport.h"
 #include "clang/Analysis/FlowSensitive/Logger.h"
 #include "clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h"
@@ -162,7 +162,7 @@ class HTMLLogger : public Logger {
   llvm::raw_string_ostream JStringStream{JSON};
   llvm::json::OStream JOS{JStringStream, /*Indent=*/2};
 
-  const ControlFlowContext *CFC;
+  const AdornedCFG *ACFG;
   // Timeline of iterations of CFG block visitation.
   std::vector<Iteration> Iters;
   // Indexes  in `Iters` of the iterations for each block.
@@ -176,15 +176,15 @@ class HTMLLogger : public Logger {
 
 public:
   explicit HTMLLogger(StreamFactory Streams) : Streams(std::move(Streams)) {}
-  void beginAnalysis(const ControlFlowContext &CFC,
+  void beginAnalysis(const AdornedCFG &ACFG,
                      TypeErasedDataflowAnalysis &A) override {
     OS = Streams();
-    this->CFC = &CFC;
+    this->ACFG = &ACFG;
     *OS << llvm::StringRef(HTMLLogger_html).split("<?INJECT?>").first;
 
-    BlockConverged.resize(CFC.getCFG().getNumBlockIDs());
+    BlockConverged.resize(ACFG.getCFG().getNumBlockIDs());
 
-    const auto &D = CFC.getDecl();
+    const auto &D = ACFG.getDecl();
     const auto &SM = A.getASTContext().getSourceManager();
     *OS << "<title>";
     if (const auto *ND = dyn_cast<NamedDecl>(&D))
@@ -345,7 +345,7 @@ class HTMLLogger : public Logger {
   // tokens are associated with, and even which BB element (so that clicking
   // can select the right element).
   void writeCode() {
-    const auto &AST = CFC->getDecl().getASTContext();
+    const auto &AST = ACFG->getDecl().getASTContext();
     bool Invalid = false;
 
     // Extract the source code from the original file.
@@ -353,7 +353,7 @@ class HTMLLogger : public Logger {
     // indentation to worry about), but we need the boundaries of particular
     // AST nodes and the printer doesn't provide this.
     auto Range = clang::Lexer::makeFileCharRange(
-        CharSourceRange::getTokenRange(CFC->getDecl().getSourceRange()),
+        CharSourceRange::getTokenRange(ACFG->getDecl().getSourceRange()),
         AST.getSourceManager(), AST.getLangOpts());
     if (Range.isInvalid())
       return;
@@ -419,7 +419,7 @@ class HTMLLogger : public Logger {
     // Construct one TokenInfo per character in a flat array.
     // This is inefficient (chars in a token all have the same info) but simple.
     std::vector<TokenInfo> State(Code.size());
-    for (const auto *Block : CFC->getCFG()) {
+    for (const auto *Block : ACFG->getCFG()) {
       unsigned EltIndex = 0;
       for (const auto& Elt : *Block) {
         ++EltIndex;
@@ -480,7 +480,7 @@ class HTMLLogger : public Logger {
   // out to `dot` to turn it into an SVG.
   void writeCFG() {
     *OS << "<template data-copy='cfg'>\n";
-    if (auto SVG = renderSVG(buildCFGDot(CFC->getCFG())))
+    if (auto SVG = renderSVG(buildCFGDot(ACFG->getCFG())))
       *OS << *SVG;
     else
       *OS << "Can't draw CFG: " << toString(SVG.takeError());
diff --git a/clang/lib/Analysis/FlowSensitive/Logger.cpp b/clang/lib/Analysis/FlowSensitive/Logger.cpp
index 8c401df62e445..8f40768171c94 100644
--- a/clang/lib/Analysis/FlowSensitive/Logger.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Logger.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Analysis/FlowSensitive/Logger.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h"
 #include "llvm/Support/WithColor.h"
 
@@ -33,17 +33,17 @@ struct TextualLogger final : Logger {
   TextualLogger(llvm::raw_ostream &OS)
       : OS(OS), ShowColors(llvm::WithColor::defaultAutoDetectFunction()(OS)) {}
 
-  virtual void beginAnalysis(const ControlFlowContext &CFG,
+  virtual void beginAnalysis(const AdornedCFG &ACFG,
                              TypeErasedDataflowAnalysis &Analysis) override {
     {
       llvm::WithColor Header(OS, llvm::raw_ostream::Colors::RED, /*Bold=*/true);
       OS << "=== Beginning data flow analysis ===\n";
     }
-    auto &D = CFG.getDecl();
+    auto &D = ACFG.getDecl();
     D.print(OS);
     OS << "\n";
     D.dump(OS);
-    CurrentCFG = &CFG.getCFG();
+    CurrentCFG = &ACFG.getCFG();
     CurrentCFG->print(OS, Analysis.getASTContext().getLangOpts(), ShowColors);
     CurrentAnalysis = &Analysis;
   }
diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
index 1d31b22b6d25f..dbf4878622eba 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
@@ -64,39 +64,125 @@ static bool hasOptionalClassName(const CXXRecordDecl &RD) {
   return false;
 }
 
+static const CXXRecordDecl *getOptionalBaseClass(const CXXRecordDecl *RD) {
+  if (RD == nullptr)
+    return nullptr;
+  if (hasOptionalClassName(*RD))
+    return RD;
+
+  if (!RD->hasDefinition())
+    return nullptr;
+
+  for (const CXXBaseSpecifier &Base : RD->bases())
+    if (const CXXRecordDecl *BaseClass =
+            getOptionalBaseClass(Base.getType()->getAsCXXRecordDecl()))
+      return BaseClass;
+
+  return nullptr;
+}
+
 namespace {
 
 using namespace ::clang::ast_matchers;
 using LatticeTransferState = TransferState<NoopLattice>;
 
-AST_MATCHER(CXXRecordDecl, hasOptionalClassNameMatcher) {
-  return hasOptionalClassName(Node);
+AST_MATCHER(CXXRecordDecl, optionalClass) { return hasOptionalClassName(Node); }
+
+AST_MATCHER(CXXRecordDecl, optionalOrDerivedClass) {
+  return getOptionalBaseClass(&Node) != nullptr;
 }
 
-DeclarationMatcher optionalClass() {
-  return classTemplateSpecializationDecl(
-      hasOptionalClassNameMatcher(),
-      hasTemplateArgument(0, refersToType(type().bind("T"))));
+auto desugarsToOptionalType() {
+  return hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(cxxRecordDecl(optionalClass()))));
 }
 
-auto optionalOrAliasType() {
+auto desugarsToOptionalOrDerivedType() {
   return hasUnqualifiedDesugaredType(
-      recordType(hasDeclaration(optionalClass())));
+      recordType(hasDeclaration(cxxRecordDecl(optionalOrDerivedClass()))));
+}
+
+auto hasOptionalType() { return hasType(desugarsToOptionalType()); }
+
+/// Matches any of the spellings of the optional types and sugar, aliases,
+/// derived classes, etc.
+auto hasOptionalOrDerivedType() {
+  return hasType(desugarsToOptionalOrDerivedType());
+}
+
+QualType getPublicType(const Expr *E) {
+  auto *Cast = dyn_cast<ImplicitCastExpr>(E->IgnoreParens());
+  if (Cast == nullptr || Cast->getCastKind() != CK_UncheckedDerivedToBase) {
+    QualType Ty = E->getType();
+    if (Ty->isPointerType())
+      return Ty->getPointeeType();
+    return Ty;
+  }
+
+  // Is the derived type that we're casting from the type of `*this`? In this
+  // special case, we can upcast to the base class even if the base is
+  // non-public.
+  bool CastingFromThis = isa<CXXThisExpr>(Cast->getSubExpr());
+
+  // Find the least-derived type in the path (i.e. the last entry in the list)
+  // that we can access.
+  const CXXBaseSpecifier *PublicBase = nullptr;
+  for (const CXXBaseSpecifier *Base : Cast->path()) {
+    if (Base->getAccessSpecifier() != AS_public && !CastingFromThis)
+      break;
+    PublicBase = Base;
+    CastingFromThis = false;
+  }
+
+  if (PublicBase != nullptr)
+    return PublicBase->getType();
+
+  // We didn't find any public type that we could cast to. There may be more
+  // casts in `getSubExpr()`, so recurse. (If there aren't any more casts, this
+  // will return the type of `getSubExpr()`.)
+  return getPublicType(Cast->getSubExpr());
 }
 
-/// Matches any of the spellings of the optional types and sugar, aliases, etc.
-auto hasOptionalType() { return hasType(optionalOrAliasType()); }
+// Returns the least-derived type for the receiver of `MCE` that
+// `MCE.getImplicitObjectArgument()->IgnoreParentImpCasts()` can be downcast to.
+// Effectively, we upcast until we reach a non-public base class, unless that
+// base is a base of `*this`.
+//
+// This is needed to correctly match methods called on types derived from
+// `std::optional`.
+//
+// Say we have a `struct Derived : public std::optional<int> {} d;` For a call
+// `d.has_value()`, the `getImplicitObjectArgument()` looks like this:
+//
+//   ImplicitCastExpr 'const std::__optional_storage_base<int>' lvalue
+//   |            <UncheckedDerivedToBase (optional -> __optional_storage_base)>
+//   `-DeclRefExpr 'Derived' lvalue Var 'd' 'Derived'
+//
+// The type of the implicit object argument is `__optional_storage_base`
+// (since this is the internal type that `has_value()` is declared on). If we
+// call `IgnoreParenImpCasts()` on the implicit object argument, we get the
+// `DeclRefExpr`, which has type `Derived`. Neither of these types is
+// `optional`, and hence neither is sufficient for querying whether we are
+// calling a method on `optional`.
+//
+// Instead, starting with the most derived type, we need to follow the chain of
+// casts
+QualType getPublicReceiverType(const CXXMemberCallExpr &MCE) {
+  return getPublicType(MCE.getImplicitObjectArgument());
+}
+
+AST_MATCHER_P(CXXMemberCallExpr, publicReceiverType,
+              ast_matchers::internal::Matcher<QualType>, InnerMatcher) {
+  return InnerMatcher.matches(getPublicReceiverType(Node), Finder, Builder);
+}
 
 auto isOptionalMemberCallWithNameMatcher(
     ast_matchers::internal::Matcher<NamedDecl> matcher,
     const std::optional<StatementMatcher> &Ignorable = std::nullopt) {
-  auto Exception = unless(Ignorable ? expr(anyOf(*Ignorable, cxxThisExpr()))
-                                    : cxxThisExpr());
-  return cxxMemberCallExpr(
-      on(expr(Exception,
-              anyOf(hasOptionalType(),
-                    hasType(pointerType(pointee(optionalOrAliasType())))))),
-      callee(cxxMethodDecl(matcher)));
+  return cxxMemberCallExpr(Ignorable ? on(expr(unless(*Ignorable)))
+                                     : anything(),
+                           publicReceiverType(desugarsToOptionalType()),
+                           callee(cxxMethodDecl(matcher)));
 }
 
 auto isOptionalOperatorCallWithName(
@@ -129,49 +215,51 @@ auto inPlaceClass() {
 
 auto isOptionalNulloptConstructor() {
   return cxxConstructExpr(
-      hasOptionalType(),
       hasDeclaration(cxxConstructorDecl(parameterCountIs(1),
-                                        hasParameter(0, hasNulloptType()))));
+                                        hasParameter(0, hasNulloptType()))),
+      hasOptionalOrDerivedType());
 }
 
 auto isOptionalInPlaceConstructor() {
-  return cxxConstructExpr(hasOptionalType(),
-                          hasArgument(0, hasType(inPlaceClass())));
+  return cxxConstructExpr(hasArgument(0, hasType(inPlaceClass())),
+                          hasOptionalOrDerivedType());
 }
 
 auto isOptionalValueOrConversionConstructor() {
   return cxxConstructExpr(
-      hasOptionalType(),
       unless(hasDeclaration(
           cxxConstructorDecl(anyOf(isCopyConstructor(), isMoveConstructor())))),
-      argumentCountIs(1), hasArgument(0, unless(hasNulloptType())));
+      argumentCountIs(1), hasArgument(0, unless(hasNulloptType())),
+      hasOptionalOrDerivedType());
 }
 
 auto isOptionalValueOrConversionAssignment() {
   return cxxOperatorCallExpr(
       hasOverloadedOperatorName("="),
-      callee(cxxMethodDecl(ofClass(optionalClass()))),
+      callee(cxxMethodDecl(ofClass(optionalOrDerivedClass()))),
       unless(hasDeclaration(cxxMethodDecl(
           anyOf(isCopyAssignmentOperator(), isMoveAssignmentOperator())))),
       argumentCountIs(2), hasArgument(1, unless(hasNulloptType())));
 }
 
 auto isOptionalNulloptAssignment() {
-  return cxxOperatorCallExpr(hasOverloadedOperatorName("="),
-                             callee(cxxMethodDecl(ofClass(optionalClass()))),
-                             argumentCountIs(2),
-                             hasArgument(1, hasNulloptType()));
+  return cxxOperatorCallExpr(
+      hasOverloadedOperatorName("="),
+      callee(cxxMethodDecl(ofClass(optionalOrDerivedClass()))),
+      argumentCountIs(2), hasArgument(1, hasNulloptType()));
 }
 
 auto isStdSwapCall() {
   return callExpr(callee(functionDecl(hasName("std::swap"))),
-                  argumentCountIs(2), hasArgument(0, hasOptionalType()),
-                  hasArgument(1, hasOptionalType()));
+                  argumentCountIs(2),
+                  hasArgument(0, hasOptionalOrDerivedType()),
+                  hasArgument(1, hasOptionalOrDerivedType()));
 }
 
 auto isStdForwardCall() {
   return callExpr(callee(functionDecl(hasName("std::forward"))),
-                  argumentCountIs(1), hasArgument(0, hasOptionalType()));
+                  argumentCountIs(1),
+                  hasArgument(0, hasOptionalOrDerivedType()));
 }
 
 constexpr llvm::StringLiteral ValueOrCallID = "ValueOrCall";
@@ -212,8 +300,9 @@ auto isValueOrNotEqX() {
 }
 
 auto isCallReturningOptional() {
-  return callExpr(hasType(qualType(anyOf(
-      optionalOrAliasType(), referenceType(pointee(optionalOrAliasType()))))));
+  return callExpr(hasType(qualType(
+      anyOf(desugarsToOptionalOrDerivedType(),
+            referenceType(pointee(desugarsToOptionalOrDerivedType()))))));
 }
 
 template <typename L, typename R>
@@ -275,12 +364,9 @@ BoolValue *getHasValue(Environment &Env, RecordStorageLocation *OptionalLoc) {
   return HasValueVal;
 }
 
-/// Returns true if and only if `Type` is an optional type.
-bool isOptionalType(QualType Type) {
-  if (!Type->isRecordType())
-    return false;
-  const CXXRecordDecl *D = Type->getAsCXXRecordDecl();
-  return D != nullptr && hasOptionalClassName(*D);
+QualType valueTypeFromOptionalDecl(const CXXRecordDecl &RD) {
+  auto &CTSD = cast<ClassTemplateSpecializationDecl>(RD);
+  return CTSD.getTemplateArgs()[0].getAsType();
 }
 
 /// Returns the number of optional wrappers in `Type`.
@@ -288,15 +374,13 @@ bool isOptionalType(QualType Type) {
 /// For example, if `Type` is `optional<optional<int>>`, the result of this
 /// function will be 2.
 int countOptionalWrappers(const ASTContext &ASTCtx, QualType Type) {
-  if (!isOptionalType(Type))
+  const CXXRecordDecl *Optional =
+      getOptionalBaseClass(Type->getAsCXXRecordDecl());
+  if (Optional == nullptr)
     return 0;
   return 1 + countOptionalWrappers(
                  ASTCtx,
-                 cast<ClassTemplateSpecializationDecl>(Type->getAsRecordDecl())
-                     ->getTemplateArgs()
-                     .get(0)
-                     .getAsType()
-                     .getDesugaredType(ASTCtx));
+                 valueTypeFromOptionalDecl(*Optional).getDesugaredType(ASTCtx));
 }
 
 StorageLocation *getLocBehindPossiblePointer(const Expr &E,
@@ -843,13 +927,7 @@ auto buildDiagnoseMatchSwitch(
 
 ast_matchers::DeclarationMatcher
 UncheckedOptionalAccessModel::optionalClassDecl() {
-  return optionalClass();
-}
-
-static QualType valueTypeFromOptionalType(QualType OptionalTy) {
-  auto *CTSD =
-      cast<ClassTemplateSpecializationDecl>(OptionalTy->getAsCXXRecordDecl());
-  return CTSD->getTemplateArgs()[0].getAsType();
+  return cxxRecordDecl(optionalClass());
 }
 
 UncheckedOptionalAccessModel::UncheckedOptionalAccessModel(ASTContext &Ctx,
@@ -858,9 +936,11 @@ UncheckedOptionalAccessModel::UncheckedOptionalAccessModel(ASTContext &Ctx,
       TransferMatchSwitch(buildTransferMatchSwitch()) {
   Env.getDataflowAnalysisContext().setSyntheticFieldCallback(
       [&Ctx](QualType Ty) -> llvm::StringMap<QualType> {
-        if (!isOptionalType(Ty))
+        const CXXRecordDecl *Optional =
+            getOptionalBaseClass(Ty->getAsCXXRecordDecl());
+        if (Optional == nullptr)
           return {};
-        return {{"value", valueTypeFromOptionalType(Ty)},
+        return {{"value", valueTypeFromOptionalDecl(*Optional)},
                 {"has_value", Ctx.BoolTy}};
       });
 }
diff --git a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
index da4dd6dc07851..2f0b0e5c5640c 100644
--- a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
+++ b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
@@ -14,18 +14,52 @@
 
 #define DEBUG_TYPE "dataflow"
 
-void clang::dataflow::copyRecord(RecordStorageLocation &Src,
-                                 RecordStorageLocation &Dst, Environment &Env) {
+namespace clang::dataflow {
+
+static void copyField(const ValueDecl &Field, StorageLocation *SrcFieldLoc,
+                      StorageLocation *DstFieldLoc, RecordStorageLocation &Dst,
+                      Environment &Env) {
+  assert(Field.getType()->isReferenceType() ||
+         (SrcFieldLoc != nullptr && DstFieldLoc != nullptr));
+
+  if (Field.getType()->isRecordType()) {
+    copyRecord(cast<RecordStorageLocation>(*SrcFieldLoc),
+               cast<RecordStorageLocation>(*DstFieldLoc), Env);
+  } else if (Field.getType()->isReferenceType()) {
+    Dst.setChild(Field, SrcFieldLoc);
+  } else {
+    if (Value *Val = Env.getValue(*SrcFieldLoc))
+      Env.setValue(*DstFieldLoc, *Val);
+    else
+      Env.clearValue(*DstFieldLoc);
+  }
+}
+
+static void copySyntheticField(QualType FieldType, StorageLocation &SrcFieldLoc,
+                               StorageLocation &DstFieldLoc, Environment &Env) {
+  if (FieldType->isRecordType()) {
+    copyRecord(cast<RecordStorageLocation>(SrcFieldLoc),
+               cast<RecordStorageLocation>(DstFieldLoc), Env);
+  } else {
+    if (Value *Val = Env.getValue(SrcFieldLoc))
+      Env.setValue(DstFieldLoc, *Val);
+    else
+      Env.clearValue(DstFieldLoc);
+  }
+}
+
+void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst,
+                Environment &Env) {
   auto SrcType = Src.getType().getCanonicalType().getUnqualifiedType();
   auto DstType = Dst.getType().getCanonicalType().getUnqualifiedType();
 
   auto SrcDecl = SrcType->getAsCXXRecordDecl();
   auto DstDecl = DstType->getAsCXXRecordDecl();
 
-  bool compatibleTypes =
+  [[maybe_unused]] bool compatibleTypes =
       SrcType == DstType ||
-      (SrcDecl && DstDecl && SrcDecl->isDerivedFrom(DstDecl));
-  (void)compatibleTypes;
+      (SrcDecl != nullptr && DstDecl != nullptr &&
+       (SrcDecl->isDerivedFrom(DstDecl) || DstDecl->isDerivedFrom(SrcDecl)));
 
   LLVM_DEBUG({
     if (!compatibleTypes) {
@@ -35,45 +69,27 @@ void clang::dataflow::copyRecord(RecordStorageLocation &Src,
   });
   assert(compatibleTypes);
 
-  for (auto [Field, DstFieldLoc] : Dst.children()) {
-    StorageLocation *SrcFieldLoc = Src.getChild(*Field);
-
-    assert(Field->getType()->isReferenceType() ||
-           (SrcFieldLoc != nullptr && DstFieldLoc != nullptr));
-
-    if (Field->getType()->isRecordType()) {
-      copyRecord(cast<RecordStorageLocation>(*SrcFieldLoc),
-                 cast<RecordStorageLocation>(*DstFieldLoc), Env);
-    } else if (Field->getType()->isReferenceType()) {
-      Dst.setChild(*Field, SrcFieldLoc);
-    } else {
-      if (Value *Val = Env.getValue(*SrcFieldLoc))
-        Env.setValue(*DstFieldLoc, *Val);
-      else
-        Env.clearValue(*DstFieldLoc);
-    }
-  }
-
-  for (const auto &[Name, SynthFieldLoc] : Src.synthetic_fields()) {
-    if (SynthFieldLoc->getType()->isRecordType()) {
-      copyRecord(*cast<RecordStorageLocation>(SynthFieldLoc),
-                 cast<RecordStorageLocation>(Dst.getSyntheticField(Name)), Env);
-    } else {
-      if (Value *Val = Env.getValue(*SynthFieldLoc))
-        Env.setValue(Dst.getSyntheticField(Name), *Val);
-      else
-        Env.clearValue(Dst.getSyntheticField(Name));
-    }
+  if (SrcType == DstType || (SrcDecl != nullptr && DstDecl != nullptr &&
+                             SrcDecl->isDerivedFrom(DstDecl))) {
+    for (auto [Field, DstFieldLoc] : Dst.children())
+      copyField(*Field, Src.getChild(*Field), DstFieldLoc, Dst, Env);
+    for (const auto &[Name, DstFieldLoc] : Dst.synthetic_fields())
+      copySyntheticField(DstFieldLoc->getType(), Src.getSyntheticField(Name),
+                         *DstFieldLoc, Env);
+  } else {
+    for (auto [Field, SrcFieldLoc] : Src.children())
+      copyField(*Field, SrcFieldLoc, Dst.getChild(*Field), Dst, Env);
+    for (const auto &[Name, SrcFieldLoc] : Src.synthetic_fields())
+      copySyntheticField(SrcFieldLoc->getType(), *SrcFieldLoc,
+                         Dst.getSyntheticField(Name), Env);
   }
 
   RecordValue *DstVal = &Env.create<RecordValue>(Dst);
   Env.setValue(Dst, *DstVal);
 }
 
-bool clang::dataflow::recordsEqual(const RecordStorageLocation &Loc1,
-                                   const Environment &Env1,
-                                   const RecordStorageLocation &Loc2,
-                                   const Environment &Env2) {
+bool recordsEqual(const RecordStorageLocation &Loc1, const Environment &Env1,
+                  const RecordStorageLocation &Loc2, const Environment &Env2) {
   LLVM_DEBUG({
     if (Loc2.getType().getCanonicalType().getUnqualifiedType() !=
         Loc1.getType().getCanonicalType().getUnqualifiedType()) {
@@ -116,3 +132,5 @@ bool clang::dataflow::recordsEqual(const RecordStorageLocation &Loc1,
 
   return true;
 }
+
+} // namespace clang::dataflow
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index 04aa2831df055..960e9688ffb72 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -20,7 +20,7 @@
 #include "clang/AST/OperationKinds.h"
 #include "clang/AST/Stmt.h"
 #include "clang/AST/StmtVisitor.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/NoopAnalysis.h"
 #include "clang/Analysis/FlowSensitive/RecordOps.h"
@@ -38,9 +38,9 @@ namespace clang {
 namespace dataflow {
 
 const Environment *StmtToEnvMap::getEnvironment(const Stmt &S) const {
-  auto BlockIt = CFCtx.getStmtToBlock().find(&ignoreCFGOmittedNodes(S));
-  assert(BlockIt != CFCtx.getStmtToBlock().end());
-  if (!CFCtx.isBlockReachable(*BlockIt->getSecond()))
+  auto BlockIt = ACFG.getStmtToBlock().find(&ignoreCFGOmittedNodes(S));
+  assert(BlockIt != ACFG.getStmtToBlock().end());
+  if (!ACFG.isBlockReachable(*BlockIt->getSecond()))
     return nullptr;
   if (BlockIt->getSecond()->getBlockID() == CurBlockID)
     return &CurState.Env;
@@ -450,6 +450,25 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> {
     Env.setStorageLocation(*S, *MemberLoc);
   }
 
+  void VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *S) {
+    const Expr *ArgExpr = S->getExpr();
+    assert(ArgExpr != nullptr);
+    propagateValueOrStorageLocation(*ArgExpr, *S, Env);
+
+    // If this is a prvalue of record type, we consider it to be an "original
+    // record constructor", which we always require to have a `RecordValue`.
+    // So make sure we have a value if we didn't propagate one above.
+    if (S->isPRValue() && S->getType()->isRecordType()) {
+      if (Env.getValue(*S) == nullptr) {
+        Value *Val = Env.createValue(S->getType());
+        // We're guaranteed to always be able to create a value for record
+        // types.
+        assert(Val != nullptr);
+        Env.setValue(*S, *Val);
+      }
+    }
+  }
+
   void VisitCXXDefaultInitExpr(const CXXDefaultInitExpr *S) {
     const Expr *InitExpr = S->getExpr();
     assert(InitExpr != nullptr);
@@ -525,15 +544,6 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> {
       if (LocSrc == nullptr || LocDst == nullptr)
         return;
 
-      // The assignment operators are different from the type of the destination
-      // in this model (i.e. in one of their base classes). This must be very
-      // rare and we just bail.
-      if (Method->getFunctionObjectParameterType()
-              .getCanonicalType()
-              .getUnqualifiedType() !=
-          LocDst->getType().getCanonicalType().getUnqualifiedType())
-        return;
-
       copyRecord(*LocSrc, *LocDst, Env);
 
       // If the expr is a glvalue, we can reasonably assume the operator is
@@ -836,27 +846,26 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> {
           Env.canDescend(Options.ContextSensitiveOpts->Depth, F)))
       return;
 
-    const ControlFlowContext *CFCtx =
-        Env.getDataflowAnalysisContext().getControlFlowContext(F);
-    if (!CFCtx)
+    const AdornedCFG *ACFG = Env.getDataflowAnalysisContext().getAdornedCFG(F);
+    if (!ACFG)
       return;
 
     // FIXME: We don't support context-sensitive analysis of recursion, so
     // we should return early here if `F` is the same as the `FunctionDecl`
     // holding `S` itself.
 
-    auto ExitBlock = CFCtx->getCFG().getExit().getBlockID();
+    auto ExitBlock = ACFG->getCFG().getExit().getBlockID();
 
     auto CalleeEnv = Env.pushCall(S);
 
     // FIXME: Use the same analysis as the caller for the callee. Note,
     // though, that doing so would require support for changing the analysis's
     // ASTContext.
-    auto Analysis = NoopAnalysis(CFCtx->getDecl().getASTContext(),
+    auto Analysis = NoopAnalysis(ACFG->getDecl().getASTContext(),
                                  DataflowAnalysisOptions{Options});
 
     auto BlockToOutputState =
-        dataflow::runDataflowAnalysis(*CFCtx, Analysis, CalleeEnv);
+        dataflow::runDataflowAnalysis(*ACFG, Analysis, CalleeEnv);
     assert(BlockToOutputState);
     assert(ExitBlock < BlockToOutputState->size());
 
diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
index 939247c047c66..595f70f819ddb 100644
--- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
+++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <algorithm>
 #include <optional>
 #include <system_error>
 #include <utility>
@@ -33,7 +32,6 @@
 #include "clang/Analysis/FlowSensitive/Value.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Error.h"
 
@@ -64,106 +62,44 @@ static bool isBackedgeNode(const CFGBlock &B) {
 
 namespace {
 
-// The return type of the visit functions in TerminatorVisitor. The first
-// element represents the terminator expression (that is the conditional
-// expression in case of a path split in the CFG). The second element
-// represents whether the condition was true or false.
-using TerminatorVisitorRetTy = std::pair<const Expr *, bool>;
-
-/// Extends the flow condition of an environment based on a terminator
-/// statement.
+/// Extracts the terminator's condition expression.
 class TerminatorVisitor
-    : public ConstStmtVisitor<TerminatorVisitor, TerminatorVisitorRetTy> {
+    : public ConstStmtVisitor<TerminatorVisitor, const Expr *> {
 public:
-  TerminatorVisitor(Environment &Env, int BlockSuccIdx)
-      : Env(Env), BlockSuccIdx(BlockSuccIdx) {}
-
-  TerminatorVisitorRetTy VisitIfStmt(const IfStmt *S) {
-    auto *Cond = S->getCond();
-    assert(Cond != nullptr);
-    return extendFlowCondition(*Cond);
-  }
-
-  TerminatorVisitorRetTy VisitWhileStmt(const WhileStmt *S) {
-    auto *Cond = S->getCond();
-    assert(Cond != nullptr);
-    return extendFlowCondition(*Cond);
-  }
-
-  TerminatorVisitorRetTy VisitDoStmt(const DoStmt *S) {
-    auto *Cond = S->getCond();
-    assert(Cond != nullptr);
-    return extendFlowCondition(*Cond);
-  }
-
-  TerminatorVisitorRetTy VisitForStmt(const ForStmt *S) {
-    auto *Cond = S->getCond();
-    if (Cond != nullptr)
-      return extendFlowCondition(*Cond);
-    return {nullptr, false};
-  }
-
-  TerminatorVisitorRetTy VisitCXXForRangeStmt(const CXXForRangeStmt *) {
+  TerminatorVisitor() = default;
+  const Expr *VisitIfStmt(const IfStmt *S) { return S->getCond(); }
+  const Expr *VisitWhileStmt(const WhileStmt *S) { return S->getCond(); }
+  const Expr *VisitDoStmt(const DoStmt *S) { return S->getCond(); }
+  const Expr *VisitForStmt(const ForStmt *S) { return S->getCond(); }
+  const Expr *VisitCXXForRangeStmt(const CXXForRangeStmt *) {
     // Don't do anything special for CXXForRangeStmt, because the condition
     // (being implicitly generated) isn't visible from the loop body.
-    return {nullptr, false};
+    return nullptr;
   }
-
-  TerminatorVisitorRetTy VisitBinaryOperator(const BinaryOperator *S) {
+  const Expr *VisitBinaryOperator(const BinaryOperator *S) {
     assert(S->getOpcode() == BO_LAnd || S->getOpcode() == BO_LOr);
-    auto *LHS = S->getLHS();
-    assert(LHS != nullptr);
-    return extendFlowCondition(*LHS);
+    return S->getLHS();
   }
-
-  TerminatorVisitorRetTy
-  VisitConditionalOperator(const ConditionalOperator *S) {
-    auto *Cond = S->getCond();
-    assert(Cond != nullptr);
-    return extendFlowCondition(*Cond);
+  const Expr *VisitConditionalOperator(const ConditionalOperator *S) {
+    return S->getCond();
   }
-
-private:
-  TerminatorVisitorRetTy extendFlowCondition(const Expr &Cond) {
-    auto *Val = Env.get<BoolValue>(Cond);
-    // In transferCFGBlock(), we ensure that we always have a `Value` for the
-    // terminator condition, so assert this.
-    // We consciously assert ourselves instead of asserting via `cast()` so
-    // that we get a more meaningful line number if the assertion fails.
-    assert(Val != nullptr);
-
-    bool ConditionValue = true;
-    // The condition must be inverted for the successor that encompasses the
-    // "else" branch, if such exists.
-    if (BlockSuccIdx == 1) {
-      Val = &Env.makeNot(*Val);
-      ConditionValue = false;
-    }
-
-    Env.assume(Val->formula());
-    return {&Cond, ConditionValue};
-  }
-
-  Environment &Env;
-  int BlockSuccIdx;
 };
 
 /// Holds data structures required for running dataflow analysis.
 struct AnalysisContext {
-  AnalysisContext(const ControlFlowContext &CFCtx,
-                  TypeErasedDataflowAnalysis &Analysis,
+  AnalysisContext(const AdornedCFG &ACFG, TypeErasedDataflowAnalysis &Analysis,
                   const Environment &InitEnv,
                   llvm::ArrayRef<std::optional<TypeErasedDataflowAnalysisState>>
                       BlockStates)
-      : CFCtx(CFCtx), Analysis(Analysis), InitEnv(InitEnv),
+      : ACFG(ACFG), Analysis(Analysis), InitEnv(InitEnv),
         Log(*InitEnv.getDataflowAnalysisContext().getOptions().Log),
         BlockStates(BlockStates) {
-    Log.beginAnalysis(CFCtx, Analysis);
+    Log.beginAnalysis(ACFG, Analysis);
   }
   ~AnalysisContext() { Log.endAnalysis(); }
 
   /// Contains the CFG being analyzed.
-  const ControlFlowContext &CFCtx;
+  const AdornedCFG &ACFG;
   /// The analysis to be run.
   TypeErasedDataflowAnalysis &Analysis;
   /// Initial state to start the analysis.
@@ -176,19 +112,19 @@ struct AnalysisContext {
 
 class PrettyStackTraceAnalysis : public llvm::PrettyStackTraceEntry {
 public:
-  PrettyStackTraceAnalysis(const ControlFlowContext &CFCtx, const char *Message)
-      : CFCtx(CFCtx), Message(Message) {}
+  PrettyStackTraceAnalysis(const AdornedCFG &ACFG, const char *Message)
+      : ACFG(ACFG), Message(Message) {}
 
   void print(raw_ostream &OS) const override {
     OS << Message << "\n";
     OS << "Decl:\n";
-    CFCtx.getDecl().dump(OS);
+    ACFG.getDecl().dump(OS);
     OS << "CFG:\n";
-    CFCtx.getCFG().print(OS, LangOptions(), false);
+    ACFG.getCFG().print(OS, LangOptions(), false);
   }
 
 private:
-  const ControlFlowContext &CFCtx;
+  const AdornedCFG &ACFG;
   const char *Message;
 };
 
@@ -264,9 +200,13 @@ class JoinedStateBuilder {
     return Result;
   }
 };
-
 } // namespace
 
+static const Expr *getTerminatorCondition(const Stmt *TerminatorStmt) {
+  return TerminatorStmt == nullptr ? nullptr
+                                   : TerminatorVisitor().Visit(TerminatorStmt);
+}
+
 /// Computes the input state for a given basic block by joining the output
 /// states of its predecessors.
 ///
@@ -303,7 +243,7 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) {
     // See `NoreturnDestructorTest` for concrete examples.
     if (Block.succ_begin()->getReachableBlock() != nullptr &&
         Block.succ_begin()->getReachableBlock()->hasNoReturnElement()) {
-      auto &StmtToBlock = AC.CFCtx.getStmtToBlock();
+      auto &StmtToBlock = AC.ACFG.getStmtToBlock();
       auto StmtBlock = StmtToBlock.find(Block.getTerminatorStmt());
       assert(StmtBlock != StmtToBlock.end());
       llvm::erase(Preds, StmtBlock->getSecond());
@@ -319,7 +259,7 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) {
   // all predecessors have expression state consumed in a different block.
   Environment::ExprJoinBehavior JoinBehavior = Environment::DiscardExprState;
   for (const CFGBlock *Pred : Preds) {
-    if (Pred && AC.CFCtx.containsExprConsumedInDifferentBlock(*Pred)) {
+    if (Pred && AC.ACFG.containsExprConsumedInDifferentBlock(*Pred)) {
       JoinBehavior = Environment::KeepExprState;
       break;
     }
@@ -338,25 +278,32 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) {
     if (!MaybePredState)
       continue;
 
+    const TypeErasedDataflowAnalysisState &PredState = *MaybePredState;
+    const Expr *Cond = getTerminatorCondition(Pred->getTerminatorStmt());
+    if (Cond == nullptr) {
+      Builder.addUnowned(PredState);
+      continue;
+    }
+
+    bool BranchVal = blockIndexInPredecessor(*Pred, Block) == 0;
+
+    // `transferBranch` may need to mutate the environment to describe the
+    // dynamic effect of the terminator for a given branch.  Copy now.
+    TypeErasedDataflowAnalysisState Copy = MaybePredState->fork();
     if (AC.Analysis.builtinOptions()) {
-      if (const Stmt *PredTerminatorStmt = Pred->getTerminatorStmt()) {
-        // We have a terminator: we need to mutate an environment to describe
-        // when the terminator is taken. Copy now.
-        TypeErasedDataflowAnalysisState Copy = MaybePredState->fork();
-
-        auto [Cond, CondValue] =
-            TerminatorVisitor(Copy.Env, blockIndexInPredecessor(*Pred, Block))
-                .Visit(PredTerminatorStmt);
-        if (Cond != nullptr)
-          // FIXME: Call transferBranchTypeErased even if BuiltinTransferOpts
-          // are not set.
-          AC.Analysis.transferBranchTypeErased(CondValue, Cond, Copy.Lattice,
-                                               Copy.Env);
-        Builder.addOwned(std::move(Copy));
-        continue;
-      }
+      auto *CondVal = Copy.Env.get<BoolValue>(*Cond);
+      // In transferCFGBlock(), we ensure that we always have a `Value`
+      // for the terminator condition, so assert this. We consciously
+      // assert ourselves instead of asserting via `cast()` so that we get
+      // a more meaningful line number if the assertion fails.
+      assert(CondVal != nullptr);
+      BoolValue *AssertedVal =
+          BranchVal ? CondVal : &Copy.Env.makeNot(*CondVal);
+      Copy.Env.assume(AssertedVal->formula());
     }
-    Builder.addUnowned(*MaybePredState);
+    AC.Analysis.transferBranchTypeErased(BranchVal, Cond, Copy.Lattice,
+                                         Copy.Env);
+    Builder.addOwned(std::move(Copy));
   }
   return std::move(Builder).take();
 }
@@ -368,7 +315,7 @@ builtinTransferStatement(unsigned CurBlockID, const CFGStmt &Elt,
                          AnalysisContext &AC) {
   const Stmt *S = Elt.getStmt();
   assert(S != nullptr);
-  transfer(StmtToEnvMap(AC.CFCtx, AC.BlockStates, CurBlockID, InputState), *S,
+  transfer(StmtToEnvMap(AC.ACFG, AC.BlockStates, CurBlockID, InputState), *S,
            InputState.Env);
 }
 
@@ -511,9 +458,8 @@ transferCFGBlock(const CFGBlock &Block, AnalysisContext &AC,
       // takes a `CFGElement` as input, but some expressions only show up as a
       // terminator condition, but not as a `CFGElement`. The condition of an if
       // statement is one such example.
-      transfer(
-          StmtToEnvMap(AC.CFCtx, AC.BlockStates, Block.getBlockID(), State),
-          *TerminatorCond, State.Env);
+      transfer(StmtToEnvMap(AC.ACFG, AC.BlockStates, Block.getBlockID(), State),
+               *TerminatorCond, State.Env);
 
     // If the transfer function didn't produce a value, create an atom so that
     // we have *some* value for the condition expression. This ensures that
@@ -528,13 +474,13 @@ transferCFGBlock(const CFGBlock &Block, AnalysisContext &AC,
 
 llvm::Expected<std::vector<std::optional<TypeErasedDataflowAnalysisState>>>
 runTypeErasedDataflowAnalysis(
-    const ControlFlowContext &CFCtx, TypeErasedDataflowAnalysis &Analysis,
+    const AdornedCFG &ACFG, TypeErasedDataflowAnalysis &Analysis,
     const Environment &InitEnv,
     std::function<void(const CFGElement &,
                        const TypeErasedDataflowAnalysisState &)>
         PostVisitCFG,
     std::int32_t MaxBlockVisits) {
-  PrettyStackTraceAnalysis CrashInfo(CFCtx, "runTypeErasedDataflowAnalysis");
+  PrettyStackTraceAnalysis CrashInfo(ACFG, "runTypeErasedDataflowAnalysis");
 
   std::optional<Environment> MaybeStartingEnv;
   if (InitEnv.callStackSize() == 1) {
@@ -544,7 +490,7 @@ runTypeErasedDataflowAnalysis(
   const Environment &StartingEnv =
       MaybeStartingEnv ? *MaybeStartingEnv : InitEnv;
 
-  const clang::CFG &CFG = CFCtx.getCFG();
+  const clang::CFG &CFG = ACFG.getCFG();
   PostOrderCFGView POV(&CFG);
   ForwardDataflowWorklist Worklist(CFG, &POV);
 
@@ -557,7 +503,7 @@ runTypeErasedDataflowAnalysis(
                                      StartingEnv.fork()};
   Worklist.enqueueSuccessors(&Entry);
 
-  AnalysisContext AC(CFCtx, Analysis, StartingEnv, BlockStates);
+  AnalysisContext AC(ACFG, Analysis, StartingEnv, BlockStates);
   std::int32_t BlockVisits = 0;
   while (const CFGBlock *Block = Worklist.dequeue()) {
     LLVM_DEBUG(llvm::dbgs()
@@ -615,7 +561,7 @@ runTypeErasedDataflowAnalysis(
   // state set to `std::nullopt` at this point) to also analyze dead code.
 
   if (PostVisitCFG) {
-    for (const CFGBlock *Block : CFCtx.getCFG()) {
+    for (const CFGBlock *Block : ACFG.getCFG()) {
       // Skip blocks that were not evaluated.
       if (!BlockStates[Block->getBlockID()])
         continue;
diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp
index b353a6627f298..cd42573968b21 100644
--- a/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/clang/lib/Basic/DiagnosticIDs.cpp
@@ -49,6 +49,7 @@ struct StaticDiagInfoDescriptionStringTable {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
   // clang-format on
 #undef DIAG
 };
@@ -70,7 +71,8 @@ const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
-  // clang-format on
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
+// clang-format on
 #undef DIAG
 };
 
@@ -95,7 +97,8 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
-  // clang-format on
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
+// clang-format on
 #undef DIAG
 };
 
@@ -173,6 +176,7 @@ VALIDATE_DIAG_SIZE(CROSSTU)
 VALIDATE_DIAG_SIZE(SEMA)
 VALIDATE_DIAG_SIZE(ANALYSIS)
 VALIDATE_DIAG_SIZE(REFACTORING)
+VALIDATE_DIAG_SIZE(INSTALLAPI)
 #undef VALIDATE_DIAG_SIZE
 #undef STRINGIFY_NAME
 
@@ -204,6 +208,7 @@ const StaticDiagInfoRec StaticDiagInfo[] = {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
 // clang-format on
 #undef DIAG
 };
@@ -246,6 +251,7 @@ CATEGORY(CROSSTU, COMMENT)
 CATEGORY(SEMA, CROSSTU)
 CATEGORY(ANALYSIS, SEMA)
 CATEGORY(REFACTORING, ANALYSIS)
+CATEGORY(INSTALLAPI, REFACTORING)
 #undef CATEGORY
 
   // Avoid out of bounds reads.
@@ -855,6 +861,9 @@ bool DiagnosticIDs::isUnrecoverable(unsigned DiagID) const {
   if (isARCDiagnostic(DiagID))
     return false;
 
+  if (isCodegenABICheckDiagnostic(DiagID))
+    return false;
+
   return true;
 }
 
@@ -862,3 +871,8 @@ bool DiagnosticIDs::isARCDiagnostic(unsigned DiagID) {
   unsigned cat = getCategoryNumberForDiag(DiagID);
   return DiagnosticIDs::getCategoryNameFromID(cat).starts_with("ARC ");
 }
+
+bool DiagnosticIDs::isCodegenABICheckDiagnostic(unsigned DiagID) {
+  unsigned cat = getCategoryNumberForDiag(DiagID);
+  return DiagnosticIDs::getCategoryNameFromID(cat) == "Codegen ABI Check";
+}
diff --git a/clang/lib/Basic/LangStandards.cpp b/clang/lib/Basic/LangStandards.cpp
index ab09c7221dda9..cb2c077234998 100644
--- a/clang/lib/Basic/LangStandards.cpp
+++ b/clang/lib/Basic/LangStandards.cpp
@@ -69,6 +69,16 @@ LangStandard::Kind LangStandard::getLangKind(StringRef Name) {
       .Default(lang_unspecified);
 }
 
+LangStandard::Kind LangStandard::getHLSLLangKind(StringRef Name) {
+  return llvm::StringSwitch<LangStandard::Kind>(Name)
+      .Case("2016", LangStandard::lang_hlsl2016)
+      .Case("2017", LangStandard::lang_hlsl2017)
+      .Case("2018", LangStandard::lang_hlsl2018)
+      .Case("2021", LangStandard::lang_hlsl2021)
+      .Case("202x", LangStandard::lang_hlsl202x)
+      .Default(LangStandard::lang_unspecified);
+}
+
 const LangStandard *LangStandard::getLangStandardForName(StringRef Name) {
   Kind K = getLangKind(Name);
   if (K == lang_unspecified)
diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp
index 9f597dcf8b0f5..256365d66bb90 100644
--- a/clang/lib/Basic/Module.cpp
+++ b/clang/lib/Basic/Module.cpp
@@ -301,10 +301,9 @@ bool Module::directlyUses(const Module *Requested) {
     if (Requested->isSubModuleOf(Use))
       return true;
 
-  // Anyone is allowed to use our builtin stdarg.h and stddef.h and their
-  // accompanying modules.
-  if (Requested->getTopLevelModuleName() == "_Builtin_stdarg" ||
-      Requested->getTopLevelModuleName() == "_Builtin_stddef")
+  // Anyone is allowed to use our builtin stddef.h and its accompanying modules.
+  if (Requested->fullModuleNameIs({"_Builtin_stddef", "max_align_t"}) ||
+      Requested->fullModuleNameIs({"_Builtin_stddef_wint_t"}))
     return true;
 
   if (NoUndeclaredIncludes)
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 0bb2a7e4234df..79690f643dc2e 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -959,6 +959,10 @@ bool TargetInfo::validateInputConstraint(
   return true;
 }
 
+bool TargetInfo::validatePointerAuthKey(const llvm::APSInt &value) const {
+  return false;
+}
+
 void TargetInfo::CheckFixedPointBits() const {
   // Check that the number of fractional and integral bits (and maybe sign) can
   // fit into the bits given for a fixed point type.
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 5abb060073c51..1c3199bd76eed 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -11,9 +11,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
+#include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -199,13 +201,23 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple,
 StringRef AArch64TargetInfo::getABI() const { return ABI; }
 
 bool AArch64TargetInfo::setABI(const std::string &Name) {
-  if (Name != "aapcs" && Name != "darwinpcs")
+  if (Name != "aapcs" && Name != "aapcs-soft" && Name != "darwinpcs")
     return false;
 
   ABI = Name;
   return true;
 }
 
+bool AArch64TargetInfo::validateTarget(DiagnosticsEngine &Diags) const {
+  if (hasFeature("fp") && ABI == "aapcs-soft") {
+    // aapcs-soft is not allowed for targets with an FPU, to avoid there being
+    // two incomatible ABIs.
+    Diags.Report(diag::err_target_unsupported_abi_with_fpu) << ABI;
+    return false;
+  }
+  return true;
+}
+
 bool AArch64TargetInfo::validateBranchProtection(StringRef Spec, StringRef,
                                                  BranchProtectionInfo &BPI,
                                                  StringRef &Err) const {
@@ -680,7 +692,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Cases("aarch64", "arm64", "arm", true)
       .Case("fmv", HasFMV)
-      .Cases("neon", "fp", "simd", FPU & NeonMode)
+      .Case("fp", FPU & FPUMode)
+      .Cases("neon", "simd", FPU & NeonMode)
       .Case("jscvt", HasJSCVT)
       .Case("fcma", HasFCMA)
       .Case("rng", HasRandGen)
@@ -1450,6 +1463,11 @@ int AArch64TargetInfo::getEHDataRegisterNumber(unsigned RegNo) const {
   return -1;
 }
 
+bool AArch64TargetInfo::validatePointerAuthKey(
+    const llvm::APSInt &value) const {
+  return 0 <= value && value <= 3;
+}
+
 bool AArch64TargetInfo::hasInt128Type() const { return true; }
 
 AArch64leTargetInfo::AArch64leTargetInfo(const llvm::Triple &Triple,
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index c1ba156860a12..542894c66412d 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -195,10 +195,14 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
 
   int getEHDataRegisterNumber(unsigned RegNo) const override;
 
+  bool validatePointerAuthKey(const llvm::APSInt &value) const override;
+
   const char *getBFloat16Mangling() const override { return "u6__bf16"; };
   bool hasInt128Type() const override;
 
   bool hasBitIntType() const override { return true; }
+
+  bool validateTarget(DiagnosticsEngine &Diags) const override;
 };
 
 class LLVM_LIBRARY_VISIBILITY AArch64leTargetInfo : public AArch64TargetInfo {
diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp
index 26a54f631fcfc..b5ba11a3bdca9 100644
--- a/clang/lib/Basic/Targets/BPF.cpp
+++ b/clang/lib/Basic/Targets/BPF.cpp
@@ -36,7 +36,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts,
     return;
   }
 
-  Builder.defineMacro("__BPF_FEATURE_ARENA_CAST");
+  Builder.defineMacro("__BPF_FEATURE_ADDR_SPACE_CAST");
 
   if (CPU.empty() || CPU == "generic" || CPU == "v1") {
     Builder.defineMacro("__BPF_CPU_VERSION__", "1");
@@ -45,6 +45,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   std::string CpuVerNumStr = CPU.substr(1);
   Builder.defineMacro("__BPF_CPU_VERSION__", CpuVerNumStr);
+  Builder.defineMacro("__BPF_FEATURE_MAY_GOTO");
 
   int CpuVerNum = std::stoi(CpuVerNumStr);
   if (CpuVerNum >= 2)
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index ceda6e13ffe4f..84edc8a36d7d2 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -1541,7 +1541,10 @@ llvm::Function *CodeGenFunction::GenerateBlockFunction(
   llvm::BasicBlock *resume = Builder.GetInsertBlock();
 
   // Go back to the entry.
-  ++entry_ptr;
+  if (entry_ptr->getNextNonDebugInstruction())
+    entry_ptr = entry_ptr->getNextNonDebugInstruction()->getIterator();
+  else
+    entry_ptr = entry->end();
   Builder.SetInsertPoint(entry, entry_ptr);
 
   // Emit debug information for all the DeclRefExprs.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6409658f3ced4..db606569224e4 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -935,7 +935,7 @@ static unsigned CountCountedByAttrs(const RecordDecl *RD) {
 
   for (const Decl *D : RD->decls()) {
     if (const auto *FD = dyn_cast<FieldDecl>(D);
-        FD && FD->hasAttr<CountedByAttr>()) {
+        FD && FD->getType()->isCountAttributedType()) {
       return ++Num;
     }
 
@@ -1033,7 +1033,7 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
     //         };
     //    };
     //
-    // We don't konw which 'count' to use in this scenario:
+    // We don't know which 'count' to use in this scenario:
     //
     //     size_t get_size(struct union_of_fams *p) {
     //         return __builtin_dynamic_object_size(p, 1);
@@ -1052,7 +1052,7 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
       FindFlexibleArrayMemberField(Ctx, OuterRD, FAMName, Offset);
   Offset = Ctx.toCharUnitsFromBits(Offset).getQuantity();
 
-  if (!FAMDecl || !FAMDecl->hasAttr<CountedByAttr>())
+  if (!FAMDecl || !FAMDecl->getType()->isCountAttributedType())
     // No flexible array member found or it doesn't have the "counted_by"
     // attribute.
     return nullptr;
@@ -5288,6 +5288,73 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__iso_volatile_store64:
     return RValue::get(EmitISOVolatileStore(*this, E));
 
+  case Builtin::BI__builtin_ptrauth_auth:
+  case Builtin::BI__builtin_ptrauth_auth_and_resign:
+  case Builtin::BI__builtin_ptrauth_blend_discriminator:
+  case Builtin::BI__builtin_ptrauth_sign_generic_data:
+  case Builtin::BI__builtin_ptrauth_sign_unauthenticated:
+  case Builtin::BI__builtin_ptrauth_strip: {
+    // Emit the arguments.
+    SmallVector<llvm::Value *, 5> Args;
+    for (auto argExpr : E->arguments())
+      Args.push_back(EmitScalarExpr(argExpr));
+
+    // Cast the value to intptr_t, saving its original type.
+    llvm::Type *OrigValueType = Args[0]->getType();
+    if (OrigValueType->isPointerTy())
+      Args[0] = Builder.CreatePtrToInt(Args[0], IntPtrTy);
+
+    switch (BuiltinID) {
+    case Builtin::BI__builtin_ptrauth_auth_and_resign:
+      if (Args[4]->getType()->isPointerTy())
+        Args[4] = Builder.CreatePtrToInt(Args[4], IntPtrTy);
+      LLVM_FALLTHROUGH;
+
+    case Builtin::BI__builtin_ptrauth_auth:
+    case Builtin::BI__builtin_ptrauth_sign_unauthenticated:
+      if (Args[2]->getType()->isPointerTy())
+        Args[2] = Builder.CreatePtrToInt(Args[2], IntPtrTy);
+      break;
+
+    case Builtin::BI__builtin_ptrauth_sign_generic_data:
+      if (Args[1]->getType()->isPointerTy())
+        Args[1] = Builder.CreatePtrToInt(Args[1], IntPtrTy);
+      break;
+
+    case Builtin::BI__builtin_ptrauth_blend_discriminator:
+    case Builtin::BI__builtin_ptrauth_strip:
+      break;
+    }
+
+    // Call the intrinsic.
+    auto IntrinsicID = [&]() -> unsigned {
+      switch (BuiltinID) {
+      case Builtin::BI__builtin_ptrauth_auth:
+        return llvm::Intrinsic::ptrauth_auth;
+      case Builtin::BI__builtin_ptrauth_auth_and_resign:
+        return llvm::Intrinsic::ptrauth_resign;
+      case Builtin::BI__builtin_ptrauth_blend_discriminator:
+        return llvm::Intrinsic::ptrauth_blend;
+      case Builtin::BI__builtin_ptrauth_sign_generic_data:
+        return llvm::Intrinsic::ptrauth_sign_generic;
+      case Builtin::BI__builtin_ptrauth_sign_unauthenticated:
+        return llvm::Intrinsic::ptrauth_sign;
+      case Builtin::BI__builtin_ptrauth_strip:
+        return llvm::Intrinsic::ptrauth_strip;
+      }
+      llvm_unreachable("bad ptrauth intrinsic");
+    }();
+    auto Intrinsic = CGM.getIntrinsic(IntrinsicID);
+    llvm::Value *Result = EmitRuntimeCall(Intrinsic, Args);
+
+    if (BuiltinID != Builtin::BI__builtin_ptrauth_sign_generic_data &&
+        BuiltinID != Builtin::BI__builtin_ptrauth_blend_discriminator &&
+        OrigValueType->isPointerTy()) {
+      Result = Builder.CreateIntToPtr(Result, OrigValueType);
+    }
+    return RValue::get(Result);
+  }
+
   case Builtin::BI__exception_code:
   case Builtin::BI_exception_code:
     return RValue::get(EmitSEHExceptionCode());
@@ -10779,6 +10846,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
   }
 
+  if (BuiltinID == clang::AArch64::BI__builtin_arm_trap) {
+    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_break);
+    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
+    return Builder.CreateCall(F, Builder.CreateZExt(Arg, CGM.Int32Ty));
+  }
+
   if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) {
     // Create call to __arm_sme_state and store the results to the two pointers.
     CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction(
@@ -18062,6 +18135,17 @@ llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
   return Arg;
 }
 
+Intrinsic::ID getDotProductIntrinsic(QualType QT) {
+  if (QT->hasSignedIntegerRepresentation())
+    return Intrinsic::dx_sdot;
+  if (QT->hasUnsignedIntegerRepresentation())
+    return Intrinsic::dx_udot;
+
+  assert(QT->hasFloatingRepresentation());
+  return Intrinsic::dx_dot;
+  ;
+}
+
 Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
                                             const CallExpr *E) {
   if (!getLangOpts().HLSL)
@@ -18074,6 +18158,21 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType=*/llvm::Type::getInt1Ty(getLLVMContext()),
         Intrinsic::dx_any, ArrayRef<Value *>{Op0}, nullptr, "dx.any");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_clamp: {
+    Value *OpX = EmitScalarExpr(E->getArg(0));
+    Value *OpMin = EmitScalarExpr(E->getArg(1));
+    Value *OpMax = EmitScalarExpr(E->getArg(2));
+
+    QualType Ty = E->getArg(0)->getType();
+    bool IsUnsigned = false;
+    if (auto *VecTy = Ty->getAs<VectorType>())
+      Ty = VecTy->getElementType();
+    IsUnsigned = Ty->isUnsignedIntegerType();
+    return Builder.CreateIntrinsic(
+        /*ReturnType=*/OpX->getType(),
+        IsUnsigned ? Intrinsic::dx_uclamp : Intrinsic::dx_clamp,
+        ArrayRef<Value *>{OpX, OpMin, OpMax}, nullptr, "dx.clamp");
+  }
   case Builtin::BI__builtin_hlsl_dot: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     Value *Op1 = EmitScalarExpr(E->getArg(1));
@@ -18107,45 +18206,19 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
            "Dot product requires vectors to be of the same size.");
 
     return Builder.CreateIntrinsic(
-        /*ReturnType=*/T0->getScalarType(), Intrinsic::dx_dot,
+        /*ReturnType=*/T0->getScalarType(),
+        getDotProductIntrinsic(E->getArg(0)->getType()),
         ArrayRef<Value *>{Op0, Op1}, nullptr, "dx.dot");
   } break;
   case Builtin::BI__builtin_hlsl_lerp: {
     Value *X = EmitScalarExpr(E->getArg(0));
     Value *Y = EmitScalarExpr(E->getArg(1));
     Value *S = EmitScalarExpr(E->getArg(2));
-    llvm::Type *Xty = X->getType();
-    llvm::Type *Yty = Y->getType();
-    llvm::Type *Sty = S->getType();
-    if (!Xty->isVectorTy() && !Yty->isVectorTy() && !Sty->isVectorTy()) {
-      if (Xty->isFloatingPointTy()) {
-        auto V = Builder.CreateFSub(Y, X);
-        V = Builder.CreateFMul(S, V);
-        return Builder.CreateFAdd(X, V, "dx.lerp");
-      }
-      llvm_unreachable("Scalar Lerp is only supported on floats.");
-    }
-    // A VectorSplat should have happened
-    assert(Xty->isVectorTy() && Yty->isVectorTy() && Sty->isVectorTy() &&
-           "Lerp of vector and scalar is not supported.");
-
-    [[maybe_unused]] auto *XVecTy =
-        E->getArg(0)->getType()->getAs<VectorType>();
-    [[maybe_unused]] auto *YVecTy =
-        E->getArg(1)->getType()->getAs<VectorType>();
-    [[maybe_unused]] auto *SVecTy =
-        E->getArg(2)->getType()->getAs<VectorType>();
-    // A HLSLVectorTruncation should have happend
-    assert(XVecTy->getNumElements() == YVecTy->getNumElements() &&
-           XVecTy->getNumElements() == SVecTy->getNumElements() &&
-           "Lerp requires vectors to be of the same size.");
-    assert(XVecTy->getElementType()->isRealFloatingType() &&
-           XVecTy->getElementType() == YVecTy->getElementType() &&
-           XVecTy->getElementType() == SVecTy->getElementType() &&
-           "Lerp requires float vectors to be of the same type.");
+    if (!E->getArg(0)->getType()->hasFloatingRepresentation())
+      llvm_unreachable("lerp operand must have a float representation");
     return Builder.CreateIntrinsic(
-        /*ReturnType=*/Xty, Intrinsic::dx_lerp, ArrayRef<Value *>{X, Y, S},
-        nullptr, "dx.lerp");
+        /*ReturnType=*/X->getType(), Intrinsic::dx_lerp,
+        ArrayRef<Value *>{X, Y, S}, nullptr, "dx.lerp");
   }
   case Builtin::BI__builtin_hlsl_elementwise_frac: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
@@ -18155,6 +18228,20 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType=*/Op0->getType(), Intrinsic::dx_frac,
         ArrayRef<Value *>{Op0}, nullptr, "dx.frac");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_isinf: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    llvm::Type *Xty = Op0->getType();
+    llvm::Type *retType = llvm::Type::getInt1Ty(this->getLLVMContext());
+    if (Xty->isVectorTy()) {
+      auto *XVecTy = E->getArg(0)->getType()->getAs<VectorType>();
+      retType = llvm::VectorType::get(
+          retType, ElementCount::getFixed(XVecTy->getNumElements()));
+    }
+    if (!E->getArg(0)->getType()->hasFloatingRepresentation())
+      llvm_unreachable("isinf operand must have a float representation");
+    return Builder.CreateIntrinsic(retType, Intrinsic::dx_isinf,
+                                   ArrayRef<Value *>{Op0}, nullptr, "dx.isinf");
+  }
   case Builtin::BI__builtin_hlsl_mad: {
     Value *M = EmitScalarExpr(E->getArg(0));
     Value *A = EmitScalarExpr(E->getArg(1));
@@ -18182,6 +18269,14 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType=*/Op0->getType(), Intrinsic::dx_rcp,
         ArrayRef<Value *>{Op0}, nullptr, "dx.rcp");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_rsqrt: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    if (!E->getArg(0)->getType()->hasFloatingRepresentation())
+      llvm_unreachable("rsqrt operand must have a float representation");
+    return Builder.CreateIntrinsic(
+        /*ReturnType=*/Op0->getType(), Intrinsic::dx_rsqrt,
+        ArrayRef<Value *>{Op0}, nullptr, "dx.rsqrt");
+  }
   }
   return nullptr;
 }
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 2a0c50f05b545..14246f3a7d237 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -493,7 +493,8 @@ static void replaceManagedVar(llvm::GlobalVariable *Var,
       // variable with instructions.
       for (auto &&Op : WorkItem) {
         auto *CE = cast<llvm::ConstantExpr>(Op);
-        auto *NewInst = CE->getAsInstruction(I);
+        auto *NewInst = CE->getAsInstruction();
+        NewInst->insertBefore(*I->getParent(), I->getIterator());
         NewInst->replaceUsesOfWith(OldV, NewV);
         OldV = CE;
         NewV = NewInst;
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 3d69abdc8a513..1d0e27d9fa371 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -3713,6 +3713,7 @@ llvm::DIType *CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile *Unit) {
   case Type::TemplateSpecialization:
     return CreateType(cast<TemplateSpecializationType>(Ty), Unit);
 
+  case Type::CountAttributed:
   case Type::Auto:
   case Type::Attributed:
   case Type::BTFTagAttributed:
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index e339ede397f56..2b3b8f1d805ad 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1137,38 +1137,19 @@ llvm::Value *CodeGenFunction::EmitCountedByFieldExpr(
 }
 
 const FieldDecl *CodeGenFunction::FindCountedByField(const FieldDecl *FD) {
-  if (!FD || !FD->hasAttr<CountedByAttr>())
+  if (!FD)
     return nullptr;
 
-  const auto *CBA = FD->getAttr<CountedByAttr>();
-  if (!CBA)
+  const auto *CAT = FD->getType()->getAs<CountAttributedType>();
+  if (!CAT)
     return nullptr;
 
-  auto GetNonAnonStructOrUnion =
-      [](const RecordDecl *RD) -> const RecordDecl * {
-    while (RD && RD->isAnonymousStructOrUnion()) {
-      const auto *R = dyn_cast<RecordDecl>(RD->getDeclContext());
-      if (!R)
-        return nullptr;
-      RD = R;
-    }
-    return RD;
-  };
-  const RecordDecl *EnclosingRD = GetNonAnonStructOrUnion(FD->getParent());
-  if (!EnclosingRD)
-    return nullptr;
-
-  DeclarationName DName(CBA->getCountedByField());
-  DeclContext::lookup_result Lookup = EnclosingRD->lookup(DName);
-
-  if (Lookup.empty())
-    return nullptr;
-
-  const NamedDecl *ND = Lookup.front();
-  if (const auto *IFD = dyn_cast<IndirectFieldDecl>(ND))
-    ND = IFD->getAnonField();
+  const auto *CountDRE = cast<DeclRefExpr>(CAT->getCountExpr());
+  const auto *CountDecl = CountDRE->getDecl();
+  if (const auto *IFD = dyn_cast<IndirectFieldDecl>(CountDecl))
+    CountDecl = IFD->getAnonField();
 
-  return dyn_cast<FieldDecl>(ND);
+  return dyn_cast<FieldDecl>(CountDecl);
 }
 
 void CodeGenFunction::EmitBoundsCheck(const Expr *E, const Expr *Base,
@@ -4286,7 +4267,7 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E,
       if (const auto *ME = dyn_cast<MemberExpr>(Array);
           ME &&
           ME->isFlexibleArrayMemberLike(getContext(), StrictFlexArraysLevel) &&
-          ME->getMemberDecl()->hasAttr<CountedByAttr>()) {
+          ME->getMemberDecl()->getType()->isCountAttributedType()) {
         const FieldDecl *FAMDecl = dyn_cast<FieldDecl>(ME->getMemberDecl());
         if (const FieldDecl *CountFD = FindCountedByField(FAMDecl)) {
           if (std::optional<int64_t> Diff =
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index c03336839e952..ce3004473b903 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -4749,10 +4749,10 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
           if (CGF.CGM.getCodeGenOpts().hasReducedDebugInfo())
             (void)DI->EmitDeclareOfAutoVariable(SharedVar, ContextValue,
                                                 CGF.Builder, false);
-          llvm::Instruction &Last = CGF.Builder.GetInsertBlock()->back();
           // Get the call dbg.declare instruction we just created and update
           // its DIExpression to add offset to base address.
-          if (auto DDI = dyn_cast<llvm::DbgVariableIntrinsic>(&Last)) {
+          auto UpdateExpr = [](llvm::LLVMContext &Ctx, auto *Declare,
+                               unsigned Offset) {
             SmallVector<uint64_t, 8> Ops;
             // Add offset to the base address if non zero.
             if (Offset) {
@@ -4760,9 +4760,21 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
               Ops.push_back(Offset);
             }
             Ops.push_back(llvm::dwarf::DW_OP_deref);
-            auto &Ctx = DDI->getContext();
-            llvm::DIExpression *DIExpr = llvm::DIExpression::get(Ctx, Ops);
-            Last.setOperand(2, llvm::MetadataAsValue::get(Ctx, DIExpr));
+            Declare->setExpression(llvm::DIExpression::get(Ctx, Ops));
+          };
+          llvm::Instruction &Last = CGF.Builder.GetInsertBlock()->back();
+          if (auto DDI = dyn_cast<llvm::DbgVariableIntrinsic>(&Last))
+            UpdateExpr(DDI->getContext(), DDI, Offset);
+          // If we're emitting using the new debug info format into a block
+          // without a terminator, the record will be "trailing".
+          assert(!Last.isTerminator() && "unexpected terminator");
+          if (auto *Marker =
+                  CGF.Builder.GetInsertBlock()->getTrailingDbgRecords()) {
+            for (llvm::DbgVariableRecord &DVR : llvm::reverse(
+                     llvm::filterDbgVars(Marker->getDbgRecordRange()))) {
+              UpdateExpr(Last.getContext(), &DVR, Offset);
+              break;
+            }
           }
         }
       }
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 12853557aa99c..49ca152f35d66 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1600,6 +1600,8 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
   FunctionArgList Args;
   QualType ResTy = BuildFunctionArgList(GD, Args);
 
+  CGM.getTargetCodeGenInfo().checkFunctionABI(CGM, FD);
+
   if (FD->isInlineBuiltinDeclaration()) {
     // When generating code for a builtin with an inline declaration, use a
     // mangled name to hold the actual body, while keeping an external
@@ -2686,6 +2688,7 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) {
     case Type::BTFTagAttributed:
     case Type::SubstTemplateTypeParm:
     case Type::MacroQualified:
+    case Type::CountAttributed:
       // Keep walking after single level desugaring.
       type = type.getSingleStepDesugaredType(getContext());
       break;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 3c0d5e859808d..a9131ceca7241 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -156,6 +156,8 @@ createTargetCodeGenInfo(CodeGenModule &CGM) {
       Kind = AArch64ABIKind::DarwinPCS;
     else if (Triple.isOSWindows())
       return createWindowsAArch64TargetCodeGenInfo(CGM, AArch64ABIKind::Win64);
+    else if (Target.getABI() == "aapcs-soft")
+      Kind = AArch64ABIKind::AAPCSSoft;
 
     return createAArch64TargetCodeGenInfo(CGM, Kind);
   }
diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp
index 18a701dc47cef..468d869cea2aa 100644
--- a/clang/lib/CodeGen/ModuleBuilder.cpp
+++ b/clang/lib/CodeGen/ModuleBuilder.cpp
@@ -180,7 +180,7 @@ namespace {
 
     bool HandleTopLevelDecl(DeclGroupRef DG) override {
       // FIXME: Why not return false and abort parsing?
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return true;
 
       HandlingTopLevelDeclRAII HandlingDecl(*this);
@@ -206,7 +206,7 @@ namespace {
     }
 
     void HandleInlineFunctionDefinition(FunctionDecl *D) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       assert(D->doesThisDeclarationHaveABody());
@@ -233,7 +233,7 @@ namespace {
     /// client hack on the type, which can occur at any point in the file
     /// (because these can be defined in declspecs).
     void HandleTagDeclDefinition(TagDecl *D) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       // Don't allow re-entrant calls to CodeGen triggered by PCH
@@ -269,7 +269,7 @@ namespace {
     }
 
     void HandleTagDeclRequiredDefinition(const TagDecl *D) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       // Don't allow re-entrant calls to CodeGen triggered by PCH
@@ -283,7 +283,7 @@ namespace {
 
     void HandleTranslationUnit(ASTContext &Ctx) override {
       // Release the Builder when there is no error.
-      if (!Diags.hasErrorOccurred() && Builder)
+      if (!Diags.hasUnrecoverableErrorOccurred() && Builder)
         Builder->Release();
 
       // If there are errors before or when releasing the Builder, reset
@@ -297,14 +297,14 @@ namespace {
     }
 
     void AssignInheritanceModel(CXXRecordDecl *RD) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       Builder->RefreshTypeCacheForClass(RD);
     }
 
     void CompleteTentativeDefinition(VarDecl *D) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       Builder->EmitTentativeDefinition(D);
@@ -315,7 +315,7 @@ namespace {
     }
 
     void HandleVTable(CXXRecordDecl *RD) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       // No VTable usage is legal in SYCL, so don't bother marking them used.
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 7682f197041c7..6893b50a3cfe9 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -84,6 +84,11 @@ class TargetCodeGenInfo {
   /// Provides a convenient hook to handle extra target-specific globals.
   virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
 
+  /// Any further codegen related checks that need to be done on a function
+  /// signature in a target specific manner.
+  virtual void checkFunctionABI(CodeGenModule &CGM,
+                                const FunctionDecl *Decl) const {}
+
   /// Any further codegen related checks that need to be done on a function call
   /// in a target specific manner.
   virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
@@ -416,6 +421,7 @@ enum class AArch64ABIKind {
   AAPCS = 0,
   DarwinPCS,
   Win64,
+  AAPCSSoft,
 };
 
 std::unique_ptr<TargetCodeGenInfo>
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 85117366de0ee..4c32f510101f0 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -27,6 +27,8 @@ class AArch64ABIInfo : public ABIInfo {
   AArch64ABIInfo(CodeGenTypes &CGT, AArch64ABIKind Kind)
       : ABIInfo(CGT), Kind(Kind) {}
 
+  bool isSoftFloat() const { return Kind == AArch64ABIKind::AAPCSSoft; }
+
 private:
   AArch64ABIKind getABIKind() const { return Kind; }
   bool isDarwinPCS() const { return Kind == AArch64ABIKind::DarwinPCS; }
@@ -55,8 +57,8 @@ class AArch64ABIInfo : public ABIInfo {
   Address EmitDarwinVAArg(Address VAListAddr, QualType Ty,
                           CodeGenFunction &CGF) const;
 
-  Address EmitAAPCSVAArg(Address VAListAddr, QualType Ty,
-                         CodeGenFunction &CGF) const;
+  Address EmitAAPCSVAArg(Address VAListAddr, QualType Ty, CodeGenFunction &CGF,
+                         AArch64ABIKind Kind) const;
 
   Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                     QualType Ty) const override {
@@ -67,7 +69,7 @@ class AArch64ABIInfo : public ABIInfo {
 
     return Kind == AArch64ABIKind::Win64 ? EmitMSVAArg(CGF, VAListAddr, Ty)
            : isDarwinPCS()               ? EmitDarwinVAArg(VAListAddr, Ty, CGF)
-                                         : EmitAAPCSVAArg(VAListAddr, Ty, CGF);
+                           : EmitAAPCSVAArg(VAListAddr, Ty, CGF, Kind);
   }
 
   Address EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr,
@@ -163,6 +165,9 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo {
     return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty);
   }
 
+  void checkFunctionABI(CodeGenModule &CGM,
+                        const FunctionDecl *Decl) const override;
+
   void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
                             const FunctionDecl *Caller,
                             const FunctionDecl *Callee,
@@ -494,6 +499,11 @@ bool AArch64SwiftABIInfo::isLegalVectorType(CharUnits VectorSize,
 }
 
 bool AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
+  // For the soft-float ABI variant, no types are considered to be homogeneous
+  // aggregates.
+  if (Kind == AArch64ABIKind::AAPCSSoft)
+    return false;
+
   // Homogeneous aggregates for AAPCS64 must have base types of a floating
   // point type or a short-vector type. This is the same as the 32-bit ABI,
   // but with the difference that any floating-point type is allowed,
@@ -525,7 +535,8 @@ bool AArch64ABIInfo::isZeroLengthBitfieldPermittedInHomogeneousAggregate()
 }
 
 Address AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty,
-                                       CodeGenFunction &CGF) const {
+                                       CodeGenFunction &CGF,
+                                       AArch64ABIKind Kind) const {
   ABIArgInfo AI = classifyArgumentType(Ty, /*IsVariadic=*/true,
                                        CGF.CurFnInfo->getCallingConvention());
   // Empty records are ignored for parameter passing purposes.
@@ -550,7 +561,8 @@ Address AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty,
     BaseTy = ArrTy->getElementType();
     NumRegs = ArrTy->getNumElements();
   }
-  bool IsFPR = BaseTy->isFloatingPointTy() || BaseTy->isVectorTy();
+  bool IsFPR = Kind != AArch64ABIKind::AAPCSSoft &&
+               (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy());
 
   // The AArch64 va_list type and handling is specified in the Procedure Call
   // Standard, section B.4:
@@ -841,6 +853,34 @@ static bool isStreamingCompatible(const FunctionDecl *F) {
   return false;
 }
 
+void AArch64TargetCodeGenInfo::checkFunctionABI(
+    CodeGenModule &CGM, const FunctionDecl *FuncDecl) const {
+  const AArch64ABIInfo &ABIInfo = getABIInfo<AArch64ABIInfo>();
+  const TargetInfo &TI = ABIInfo.getContext().getTargetInfo();
+
+  // If we are using a hard-float ABI, but do not have floating point
+  // registers, then report an error for any function arguments or returns
+  // which would be passed in floating-pint registers.
+  auto CheckType = [&CGM, &TI, &ABIInfo](const QualType &Ty,
+                                         const NamedDecl *D) {
+    const Type *HABase = nullptr;
+    uint64_t HAMembers = 0;
+    if (Ty->isFloatingType() || Ty->isVectorType() ||
+        ABIInfo.isHomogeneousAggregate(Ty, HABase, HAMembers)) {
+      CGM.getDiags().Report(D->getLocation(),
+                            diag::err_target_unsupported_type_for_abi)
+          << D->getDeclName() << Ty << TI.getABI();
+    }
+  };
+
+  if (!TI.hasFeature("fp") && !ABIInfo.isSoftFloat()) {
+    CheckType(FuncDecl->getReturnType(), FuncDecl);
+    for (ParmVarDecl *PVD : FuncDecl->parameters()) {
+      CheckType(PVD->getType(), PVD);
+    }
+  }
+}
+
 void AArch64TargetCodeGenInfo::checkFunctionCallABI(
     CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller,
     const FunctionDecl *Callee, const CallArgList &Args) const {
diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp
index dec6540230a60..9a79424c4612c 100644
--- a/clang/lib/CodeGen/Targets/RISCV.cpp
+++ b/clang/lib/CodeGen/Targets/RISCV.cpp
@@ -529,7 +529,10 @@ class RISCVTargetCodeGenInfo : public TargetCodeGenInfo {
   RISCVTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT, unsigned XLen,
                          unsigned FLen, bool EABI)
       : TargetCodeGenInfo(
-            std::make_unique<RISCVABIInfo>(CGT, XLen, FLen, EABI)) {}
+            std::make_unique<RISCVABIInfo>(CGT, XLen, FLen, EABI)) {
+    SwiftInfo =
+        std::make_unique<SwiftABIInfo>(CGT, /*SwiftErrorInRegister=*/false);
+  }
 
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &CGM) const override {
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 961455eb5267b..f082f65011ff0 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -92,6 +92,7 @@
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/RISCVISAInfo.h"
+#include "llvm/Support/Regex.h"
 #include "llvm/Support/StringSaver.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
@@ -2681,6 +2682,12 @@ bool Driver::HandleImmediateArgs(const Compilation &C) {
     return false;
   }
 
+  if (C.getArgs().hasArg(options::OPT_print_std_module_manifest_path)) {
+    llvm::outs() << GetStdModuleManifestPath(C, C.getDefaultToolChain())
+                 << '\n';
+    return false;
+  }
+
   if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) {
     if (std::optional<std::string> RuntimePath = TC.getRuntimePath())
       llvm::outs() << *RuntimePath << '\n';
@@ -7927,7 +7934,8 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
 
   // All kinds exit now in device-only mode except for non-RDC mode HIP.
   if (offloadDeviceOnly() &&
-      (!C.isOffloadingHostKind(Action::OFK_HIP) ||
+      (getFinalPhase(Args) == phases::Preprocess ||
+       !C.isOffloadingHostKind(Action::OFK_HIP) ||
        !Args.hasFlag(options::OPT_gpu_bundle_output,
                      options::OPT_no_gpu_bundle_output, true) ||
        Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)))
@@ -9793,6 +9801,44 @@ std::string Driver::GetProgramPath(StringRef Name, const ToolChain &TC) const {
   return std::string(Name);
 }
 
+std::string Driver::GetStdModuleManifestPath(const Compilation &C,
+                                             const ToolChain &TC) const {
+  std::string error = "<NOT PRESENT>";
+
+  switch (TC.GetCXXStdlibType(C.getArgs())) {
+  case ToolChain::CST_Libcxx: {
+    std::string lib = GetFilePath("libc++.so", TC);
+
+    // Note when there are multiple flavours of libc++ the module json needs to
+    // look at the command-line arguments for the proper json.
+    // These flavours do not exist at the moment, but there are plans to
+    // provide a variant that is built with sanitizer instrumentation enabled.
+
+    // For example
+    //  StringRef modules = [&] {
+    //    const SanitizerArgs &Sanitize = TC.getSanitizerArgs(C.getArgs());
+    //    if (Sanitize.needsAsanRt())
+    //      return "modules-asan.json";
+    //    return "modules.json";
+    //  }();
+
+    SmallString<128> path(lib.begin(), lib.end());
+    llvm::sys::path::remove_filename(path);
+    llvm::sys::path::append(path, "modules.json");
+    if (TC.getVFS().exists(path))
+      return static_cast<std::string>(path);
+
+    return error;
+  }
+
+  case ToolChain::CST_Libstdcxx:
+    // libstdc++ does not provide Standard library modules yet.
+    return error;
+  }
+
+  return error;
+}
+
 std::string Driver::GetTemporaryPath(StringRef Prefix, StringRef Suffix) const {
   SmallString<128> Path;
   std::error_code EC = llvm::sys::fs::createTemporaryFile(Prefix, Suffix, Path);
@@ -10441,3 +10487,131 @@ void Driver::populateSYCLDeviceTraitsMacrosArgs(
     SYCLDeviceTraitsMacrosArgs.push_back(Args.MakeArgString(MacroAllDevices));
   }
 }
+
+static const char *GetStableCStr(llvm::StringSet<> &SavedStrings, StringRef S) {
+  return SavedStrings.insert(S).first->getKeyData();
+}
+
+/// Apply a list of edits to the input argument lists.
+///
+/// The input string is a space separated list of edits to perform,
+/// they are applied in order to the input argument lists. Edits
+/// should be one of the following forms:
+///
+///  '#': Silence information about the changes to the command line arguments.
+///
+///  '^': Add FOO as a new argument at the beginning of the command line.
+///
+///  '+': Add FOO as a new argument at the end of the command line.
+///
+///  's/XXX/YYY/': Substitute the regular expression XXX with YYY in the command
+///  line.
+///
+///  'xOPTION': Removes all instances of the literal argument OPTION.
+///
+///  'XOPTION': Removes all instances of the literal argument OPTION,
+///  and the following argument.
+///
+///  'Ox': Removes all flags matching 'O' or 'O[sz0-9]' and adds 'Ox'
+///  at the end of the command line.
+///
+/// \param OS - The stream to write edit information to.
+/// \param Args - The vector of command line arguments.
+/// \param Edit - The override command to perform.
+/// \param SavedStrings - Set to use for storing string representations.
+static void applyOneOverrideOption(raw_ostream &OS,
+                                   SmallVectorImpl<const char *> &Args,
+                                   StringRef Edit,
+                                   llvm::StringSet<> &SavedStrings) {
+  // This does not need to be efficient.
+
+  if (Edit[0] == '^') {
+    const char *Str = GetStableCStr(SavedStrings, Edit.substr(1));
+    OS << "### Adding argument " << Str << " at beginning\n";
+    Args.insert(Args.begin() + 1, Str);
+  } else if (Edit[0] == '+') {
+    const char *Str = GetStableCStr(SavedStrings, Edit.substr(1));
+    OS << "### Adding argument " << Str << " at end\n";
+    Args.push_back(Str);
+  } else if (Edit[0] == 's' && Edit[1] == '/' && Edit.ends_with("/") &&
+             Edit.slice(2, Edit.size() - 1).contains('/')) {
+    StringRef MatchPattern = Edit.substr(2).split('/').first;
+    StringRef ReplPattern = Edit.substr(2).split('/').second;
+    ReplPattern = ReplPattern.slice(0, ReplPattern.size() - 1);
+
+    for (unsigned i = 1, e = Args.size(); i != e; ++i) {
+      // Ignore end-of-line response file markers
+      if (Args[i] == nullptr)
+        continue;
+      std::string Repl = llvm::Regex(MatchPattern).sub(ReplPattern, Args[i]);
+
+      if (Repl != Args[i]) {
+        OS << "### Replacing '" << Args[i] << "' with '" << Repl << "'\n";
+        Args[i] = GetStableCStr(SavedStrings, Repl);
+      }
+    }
+  } else if (Edit[0] == 'x' || Edit[0] == 'X') {
+    auto Option = Edit.substr(1);
+    for (unsigned i = 1; i < Args.size();) {
+      if (Option == Args[i]) {
+        OS << "### Deleting argument " << Args[i] << '\n';
+        Args.erase(Args.begin() + i);
+        if (Edit[0] == 'X') {
+          if (i < Args.size()) {
+            OS << "### Deleting argument " << Args[i] << '\n';
+            Args.erase(Args.begin() + i);
+          } else
+            OS << "### Invalid X edit, end of command line!\n";
+        }
+      } else
+        ++i;
+    }
+  } else if (Edit[0] == 'O') {
+    for (unsigned i = 1; i < Args.size();) {
+      const char *A = Args[i];
+      // Ignore end-of-line response file markers
+      if (A == nullptr)
+        continue;
+      if (A[0] == '-' && A[1] == 'O' &&
+          (A[2] == '\0' || (A[3] == '\0' && (A[2] == 's' || A[2] == 'z' ||
+                                             ('0' <= A[2] && A[2] <= '9'))))) {
+        OS << "### Deleting argument " << Args[i] << '\n';
+        Args.erase(Args.begin() + i);
+      } else
+        ++i;
+    }
+    OS << "### Adding argument " << Edit << " at end\n";
+    Args.push_back(GetStableCStr(SavedStrings, '-' + Edit.str()));
+  } else {
+    OS << "### Unrecognized edit: " << Edit << "\n";
+  }
+}
+
+void driver::applyOverrideOptions(SmallVectorImpl<const char *> &Args,
+                                  const char *OverrideStr,
+                                  llvm::StringSet<> &SavedStrings,
+                                  raw_ostream *OS) {
+  if (!OS)
+    OS = &llvm::nulls();
+
+  if (OverrideStr[0] == '#') {
+    ++OverrideStr;
+    OS = &llvm::nulls();
+  }
+
+  *OS << "### CCC_OVERRIDE_OPTIONS: " << OverrideStr << "\n";
+
+  // This does not need to be efficient.
+
+  const char *S = OverrideStr;
+  while (*S) {
+    const char *End = ::strchr(S, ' ');
+    if (!End)
+      End = S + strlen(S);
+    if (End != S)
+      applyOneOverrideOption(*OS, Args, std::string(S, End), SavedStrings);
+    S = End;
+    if (*S != '\0')
+      ++S;
+  }
+}
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 11a4d817f06f5..0c8f8a9960cbd 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -487,6 +487,14 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
         Add &= ~NotAllowedWithExecuteOnly;
       if (CfiCrossDso)
         Add &= ~SanitizerKind::CFIMFCall;
+      // -fsanitize=undefined does not expand to signed-integer-overflow in
+      // -fwrapv (implied by -fno-strict-overflow) mode.
+      if (Add & SanitizerKind::UndefinedGroup) {
+        bool S = Args.hasFlagNoClaim(options::OPT_fno_strict_overflow,
+                                     options::OPT_fstrict_overflow, false);
+        if (Args.hasFlagNoClaim(options::OPT_fwrapv, options::OPT_fno_wrapv, S))
+          Add &= ~SanitizerKind::SignedIntegerOverflow;
+      }
       Add &= Supported;
 
       if (Add & SanitizerKind::Fuzzer)
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index aceadec7f1314..fd23f57e2c542 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -454,12 +454,6 @@ ToolChain::getDefaultUnwindTableLevel(const ArgList &Args) const {
   return UnwindTableLevel::None;
 }
 
-unsigned ToolChain::GetDefaultDwarfVersion() const {
-  // TODO: Remove the RISC-V special case when R_RISCV_SET_ULEB128 linker
-  // support becomes more widely available.
-  return getTriple().isRISCV() ? 4 : 5;
-}
-
 Tool *ToolChain::getClang() const {
   if (!Clang)
     Clang.reset(new tools::Clang(*this, useIntegratedBackend()));
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
index aa3b80cb16e55..3e6e29584df3a 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
@@ -321,9 +321,11 @@ void aarch64::getAArch64TargetFeatures(const Driver &D,
     }
   }
 
-  if (Arg *A = Args.getLastArg(options::OPT_mno_unaligned_access,
-                               options::OPT_munaligned_access)) {
-    if (A->getOption().matches(options::OPT_mno_unaligned_access))
+  if (Arg *A = Args.getLastArg(
+          options::OPT_mstrict_align, options::OPT_mno_strict_align,
+          options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) {
+    if (A->getOption().matches(options::OPT_mstrict_align) ||
+        A->getOption().matches(options::OPT_mno_unaligned_access))
       Features.push_back("+strict-align");
   } else if (Triple.isOSOpenBSD())
     Features.push_back("+strict-align");
diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
index ba158b92bb44b..a68368c475865 100644
--- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
@@ -868,12 +868,16 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver &D,
     }
   }
 
-  // Kernel code has more strict alignment requirements.
-  if (KernelOrKext) {
-    Features.push_back("+strict-align");
-  } else if (Arg *A = Args.getLastArg(options::OPT_mno_unaligned_access,
-                                      options::OPT_munaligned_access)) {
-    if (A->getOption().matches(options::OPT_munaligned_access)) {
+  if (Arg *A = Args.getLastArg(options::OPT_mno_unaligned_access,
+                                      options::OPT_munaligned_access,
+                                      options::OPT_mstrict_align,
+                                      options::OPT_mno_strict_align)) {
+    // Kernel code has more strict alignment requirements.
+    if (KernelOrKext ||
+        A->getOption().matches(options::OPT_mno_unaligned_access) ||
+        A->getOption().matches(options::OPT_mstrict_align)) {
+      Features.push_back("+strict-align");
+    } else {
       // No v6M core supports unaligned memory access (v6M ARM ARM A3.2).
       if (Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v6m)
         D.Diag(diag::err_target_unsupported_unaligned) << "v6m";
@@ -881,8 +885,7 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver &D,
       // access either.
       else if (Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v8m_baseline)
         D.Diag(diag::err_target_unsupported_unaligned) << "v8m.base";
-    } else
-      Features.push_back("+strict-align");
+    }
   } else {
     // Assume pre-ARMv6 doesn't support unaligned accesses.
     //
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 31153a67ad284..d23f9b36efb9a 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -165,10 +165,9 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     }
   }
 
-  // Select the `ual` feature determined by -m[no-]unaligned-access
-  // or the alias -m[no-]strict-align.
-  AddTargetFeature(Args, Features, options::OPT_munaligned_access,
-                   options::OPT_mno_unaligned_access, "ual");
+  // Select the `ual` feature determined by -m[no-]strict-align.
+  AddTargetFeature(Args, Features, options::OPT_mno_strict_align,
+                   options::OPT_mstrict_align, "ual");
 
   // Accept but warn about these TargetSpecific options.
   if (Arg *A = Args.getLastArgNoClaim(options::OPT_mabi_EQ))
diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
index fe9d112b8800b..74a8874a3ea2b 100644
--- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
@@ -341,6 +341,15 @@ void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple,
                    "dspr2");
   AddTargetFeature(Args, Features, options::OPT_mmsa, options::OPT_mno_msa,
                    "msa");
+  if (Arg *A = Args.getLastArg(
+          options::OPT_mstrict_align, options::OPT_mno_strict_align,
+          options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) {
+    if (A->getOption().matches(options::OPT_mstrict_align) ||
+        A->getOption().matches(options::OPT_mno_unaligned_access))
+      Features.push_back(Args.MakeArgString("+strict-align"));
+    else
+      Features.push_back(Args.MakeArgString("-strict-align"));
+  }
 
   // Add the last -mfp32/-mfpxx/-mfp64, if none are given and the ABI is O32
   // pass -mfpxx, or if none are given and fp64a is default, pass fp64 and
diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
index a46b44f9ad2b2..5165bccc6d7e3 100644
--- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
@@ -167,9 +167,9 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     Features.push_back("-relax");
   }
 
-  // -mno-unaligned-access is default, unless -munaligned-access is specified.
-  AddTargetFeature(Args, Features, options::OPT_munaligned_access,
-                   options::OPT_mno_unaligned_access, "fast-unaligned-access");
+  // -mstrict-align is default, unless -mno-strict-align is specified.
+  AddTargetFeature(Args, Features, options::OPT_mno_strict_align,
+                   options::OPT_mstrict_align, "fast-unaligned-access");
 
   // Now add any that the user explicitly requested on the command line,
   // which may override the defaults.
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index ea652823950cc..fa954c6ea70ca 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7992,6 +7992,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // -fno-common is the default, set -fcommon only when that flag is set.
   Args.addOptInFlag(CmdArgs, options::OPT_fcommon, options::OPT_fno_common);
 
+  if (Args.hasFlag(options::OPT_fptrauth_intrinsics,
+                   options::OPT_fno_ptrauth_intrinsics, false))
+    CmdArgs.push_back("-fptrauth-intrinsics");
+
   // -fsigned-bitfields is default, and clang doesn't yet support
   // -funsigned-bitfields.
   if (!Args.hasFlag(options::OPT_fsigned_bitfields,
@@ -9296,6 +9300,14 @@ void ClangAs::ConstructJob(Compilation &C, const JobAction &JA,
   case llvm::Triple::riscv64:
     AddRISCVTargetArgs(Args, CmdArgs);
     break;
+
+  case llvm::Triple::hexagon:
+    if (Args.hasFlag(options::OPT_mdefault_build_attributes,
+                     options::OPT_mno_default_build_attributes, true)) {
+      CmdArgs.push_back("-mllvm");
+      CmdArgs.push_back("-hexagon-add-build-attributes");
+    }
+    break;
   }
 
   // Consume all the warning flags. Usually this would be handled more
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 9aa0249998735..52ec7dbac7570 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -748,7 +748,8 @@ bool tools::isTLSDESCEnabled(const ToolChain &TC,
     SupportedArgument = V == "desc" || V == "trad";
     EnableTLSDESC = V == "desc";
   } else if (Triple.isX86()) {
-    SupportedArgument = V == "gnu";
+    SupportedArgument = V == "gnu" || V == "gnu2";
+    EnableTLSDESC = V == "gnu2";
   } else {
     Unsupported = true;
   }
@@ -2857,7 +2858,7 @@ void tools::addHIPRuntimeLibArgs(const ToolChain &TC, Compilation &C,
                                  llvm::opt::ArgStringList &CmdArgs) {
   if ((C.getActiveOffloadKinds() & Action::OFK_HIP) &&
       !Args.hasArg(options::OPT_nostdlib) &&
-      !Args.hasArg(options::OPT_no_hip_rt)) {
+      !Args.hasArg(options::OPT_no_hip_rt) && !Args.hasArg(options::OPT_r)) {
     TC.AddHIPRuntimeLibArgs(Args, CmdArgs);
   } else {
     // Claim "no HIP libraries" arguments if any
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 8cd9d34bb4c5a..2746a308b7575 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -838,10 +838,12 @@ NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
     if (!llvm::is_contained(*DAL, A))
       DAL->append(A);
 
-  // TODO: We should accept 'generic' as a valid architecture.
   if (!DAL->hasArg(options::OPT_march_EQ) && OffloadKind != Action::OFK_None) {
     DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
                       CudaArchToString(CudaArch::CudaDefault));
+  } else if (DAL->getLastArgValue(options::OPT_march_EQ) == "generic" &&
+             OffloadKind == Action::OFK_None) {
+    DAL->eraseArg(options::OPT_march_EQ);
   } else if (DAL->getLastArgValue(options::OPT_march_EQ) == "native") {
     auto GPUsOrErr = getSystemGPUArchs(Args);
     if (!GPUsOrErr) {
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 6168b42dc7829..70daa699e3a94 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -148,7 +148,6 @@ void Flang::addCodegenOptions(const ArgList &Args,
 
   Args.addAllArgs(CmdArgs, {options::OPT_flang_experimental_hlfir,
                             options::OPT_flang_deprecated_no_hlfir,
-                            options::OPT_flang_experimental_polymorphism,
                             options::OPT_fno_ppc_native_vec_elem_order,
                             options::OPT_fppc_native_vec_elem_order});
 }
diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp
index c6ad862b22942..05aac9caa7fb2 100644
--- a/clang/lib/Driver/ToolChains/HLSL.cpp
+++ b/clang/lib/Driver/ToolChains/HLSL.cpp
@@ -226,6 +226,21 @@ HLSLToolChain::TranslateArgs(const DerivedArgList &Args, StringRef BoundArch,
       A->claim();
       continue;
     }
+    if (A->getOption().getID() == options::OPT_dxc_hlsl_version) {
+      // Translate -HV into -std for llvm
+      // depending on the value given
+      LangStandard::Kind LangStd = LangStandard::getHLSLLangKind(A->getValue());
+      if (LangStd != LangStandard::lang_unspecified) {
+        LangStandard l = LangStandard::getLangStandardForKind(LangStd);
+        DAL->AddSeparateArg(nullptr, Opts.getOption(options::OPT_std_EQ),
+                            l.getName());
+      } else {
+        getDriver().Diag(diag::err_drv_invalid_value) << "HV" << A->getValue();
+      }
+
+      A->claim();
+      continue;
+    }
     DAL->append(A);
   }
 
diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
index b8c2573d6265f..b7c6efab83e80 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
@@ -44,8 +44,15 @@ std::string wasm::Linker::getLinkerPath(const ArgList &Args) const {
           llvm::sys::fs::can_execute(UseLinker))
         return std::string(UseLinker);
 
-      // Accept 'lld', and 'ld' as aliases for the default linker
-      if (UseLinker != "lld" && UseLinker != "ld")
+      // Interpret 'lld' as explicitly requesting `wasm-ld`, so look for that
+      // linker. Note that for `wasm32-wasip2` this overrides the default linker
+      // of `wasm-component-ld`.
+      if (UseLinker == "lld") {
+        return ToolChain.GetProgramPath("wasm-ld");
+      }
+
+      // Allow 'ld' as an alias for the default linker
+      if (UseLinker != "ld")
         ToolChain.getDriver().Diag(diag::err_drv_invalid_linker_name)
             << A->getAsString(Args);
     }
@@ -73,6 +80,16 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   if (Args.hasArg(options::OPT_s))
     CmdArgs.push_back("--strip-all");
 
+  // On `wasip2` the default linker is `wasm-component-ld` which wraps the
+  // execution of `wasm-ld`. Find `wasm-ld` and pass it as an argument of where
+  // to find it to avoid it needing to hunt and rediscover or search `PATH` for
+  // where it is.
+  if (llvm::sys::path::stem(Linker).ends_with_insensitive(
+          "wasm-component-ld")) {
+    CmdArgs.push_back("--wasm-ld-path");
+    CmdArgs.push_back(Args.MakeArgString(ToolChain.GetProgramPath("wasm-ld")));
+  }
+
   Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_u});
 
   ToolChain.AddFilePathLibArgs(Args, CmdArgs);
@@ -221,6 +238,12 @@ WebAssembly::WebAssembly(const Driver &D, const llvm::Triple &Triple,
   }
 }
 
+const char *WebAssembly::getDefaultLinker() const {
+  if (getOS() == "wasip2")
+    return "wasm-component-ld";
+  return "wasm-ld";
+}
+
 bool WebAssembly::IsMathErrnoDefault() const { return false; }
 
 bool WebAssembly::IsObjCNonFragileABIDefault() const { return true; }
diff --git a/clang/lib/Driver/ToolChains/WebAssembly.h b/clang/lib/Driver/ToolChains/WebAssembly.h
index ae60f464c1081..76e0ca39bd748 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.h
+++ b/clang/lib/Driver/ToolChains/WebAssembly.h
@@ -67,7 +67,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssembly final : public ToolChain {
                            llvm::opt::ArgStringList &CmdArgs) const override;
   SanitizerMask getSupportedSanitizers() const override;
 
-  const char *getDefaultLinker() const override { return "wasm-ld"; }
+  const char *getDefaultLinker() const override;
 
   CXXStdlibType GetDefaultCXXStdlibType() const override {
     return ToolChain::CST_Libcxx;
diff --git a/clang/lib/Format/.clang-format b/clang/lib/Format/.clang-format
index f95602cab0f7f..d7331b3c8cf02 100644
--- a/clang/lib/Format/.clang-format
+++ b/clang/lib/Format/.clang-format
@@ -1,6 +1 @@
-BasedOnStyle: LLVM
-InsertBraces: true
-InsertNewlineAtEOF: true
-LineEnding: LF
-RemoveBracesLLVM: true
-RemoveParentheses: ReturnStatement
+BasedOnStyle: clang-format
diff --git a/clang/lib/Format/BreakableToken.h b/clang/lib/Format/BreakableToken.h
index e7c0680641e29..8b9360a3335ef 100644
--- a/clang/lib/Format/BreakableToken.h
+++ b/clang/lib/Format/BreakableToken.h
@@ -18,11 +18,8 @@
 #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
 
 #include "Encoding.h"
-#include "TokenAnnotator.h"
 #include "WhitespaceManager.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/Support/Regex.h"
-#include <utility>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index df44e6994c478..700bce35c8683 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -822,6 +822,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       !CurrentState.IsCSharpGenericTypeConstraint && Previous.opensScope() &&
       Previous.isNot(TT_ObjCMethodExpr) && Previous.isNot(TT_RequiresClause) &&
       Previous.isNot(TT_TableGenDAGArgOpener) &&
+      Previous.isNot(TT_TableGenDAGArgOpenerToBreak) &&
       !(Current.MacroParent && Previous.MacroParent) &&
       (Current.isNot(TT_LineComment) ||
        Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen))) {
@@ -1444,7 +1445,9 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) {
       Style.BreakInheritanceList == FormatStyle::BILS_AfterColon) {
     return CurrentState.Indent;
   }
-  if (Previous.is(tok::r_paren) && !Current.isBinaryOperator() &&
+  if (Previous.is(tok::r_paren) &&
+      Previous.isNot(TT_TableGenDAGArgOperatorToBreak) &&
+      !Current.isBinaryOperator() &&
       !Current.isOneOf(tok::colon, tok::comment)) {
     return ContinuationIndent;
   }
@@ -1705,7 +1708,8 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State,
         (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign ||
          PrecedenceLevel != prec::Comma || Current.NestingLevel == 0) &&
         (!Style.isTableGen() ||
-         (Previous && Previous->is(TT_TableGenDAGArgListComma)))) {
+         (Previous && Previous->isOneOf(TT_TableGenDAGArgListComma,
+                                        TT_TableGenDAGArgListCommaToBreak)))) {
       NewParenState.Indent = std::max(
           std::max(State.Column, NewParenState.Indent), CurrentState.LastSpace);
     }
@@ -1842,6 +1846,17 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State,
         Style.ContinuationIndentWidth +
         std::max(CurrentState.LastSpace, CurrentState.StartOfFunctionCall);
 
+    if (Style.isTableGen() && Current.is(TT_TableGenDAGArgOpenerToBreak) &&
+        Style.TableGenBreakInsideDAGArg == FormatStyle::DAS_BreakElements) {
+      // For the case the next token is a TableGen DAGArg operator identifier
+      // that is not marked to have a line break after it.
+      // In this case the option DAS_BreakElements requires to align the
+      // DAGArg elements to the operator.
+      const FormatToken *Next = Current.Next;
+      if (Next && Next->is(TT_TableGenDAGArgOperatorID))
+        NewIndent = State.Column + Next->TokenText.size() + 2;
+    }
+
     // Ensure that different different brackets force relative alignment, e.g.:
     // void SomeFunction(vector<  // break
     //                       int> v);
diff --git a/clang/lib/Format/ContinuationIndenter.h b/clang/lib/Format/ContinuationIndenter.h
index 2598947bb624c..18441e10a1249 100644
--- a/clang/lib/Format/ContinuationIndenter.h
+++ b/clang/lib/Format/ContinuationIndenter.h
@@ -17,11 +17,6 @@
 
 #include "Encoding.h"
 #include "FormatToken.h"
-#include "clang/Format/Format.h"
-#include "llvm/Support/Regex.h"
-#include <map>
-#include <optional>
-#include <tuple>
 
 namespace clang {
 class SourceManager;
diff --git a/clang/lib/Format/Encoding.h b/clang/lib/Format/Encoding.h
index a0d664121b2bb..12f9043bb95ad 100644
--- a/clang/lib/Format/Encoding.h
+++ b/clang/lib/Format/Encoding.h
@@ -16,7 +16,6 @@
 #define LLVM_CLANG_LIB_FORMAT_ENCODING_H
 
 #include "clang/Basic/LLVM.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Unicode.h"
 
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index e64ba7eebc1ce..63ec3a88978dd 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -13,44 +13,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Format/Format.h"
-#include "AffectedRangeManager.h"
-#include "BreakableToken.h"
-#include "ContinuationIndenter.h"
 #include "DefinitionBlockSeparator.h"
-#include "FormatInternal.h"
-#include "FormatToken.h"
-#include "FormatTokenLexer.h"
 #include "IntegerLiteralSeparatorFixer.h"
 #include "NamespaceEndCommentsFixer.h"
 #include "ObjCPropertyAttributeOrderFixer.h"
 #include "QualifierAlignmentFixer.h"
 #include "SortJavaScriptImports.h"
-#include "TokenAnalyzer.h"
-#include "TokenAnnotator.h"
 #include "UnwrappedLineFormatter.h"
-#include "UnwrappedLineParser.h"
 #include "UsingDeclarationsSorter.h"
-#include "WhitespaceManager.h"
-#include "clang/Basic/Diagnostic.h"
-#include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Lex/Lexer.h"
 #include "clang/Tooling/Inclusions/HeaderIncludes.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include "llvm/Support/YAMLTraits.h"
-#include <algorithm>
-#include <memory>
-#include <mutex>
-#include <optional>
-#include <string>
-#include <unordered_map>
 
 #define DEBUG_TYPE "format-formatter"
 
@@ -307,6 +279,14 @@ struct ScalarEnumerationTraits<FormatStyle::BreakTemplateDeclarationsStyle> {
   }
 };
 
+template <> struct ScalarEnumerationTraits<FormatStyle::DAGArgStyle> {
+  static void enumeration(IO &IO, FormatStyle::DAGArgStyle &Value) {
+    IO.enumCase(Value, "DontBreak", FormatStyle::DAS_DontBreak);
+    IO.enumCase(Value, "BreakElements", FormatStyle::DAS_BreakElements);
+    IO.enumCase(Value, "BreakAll", FormatStyle::DAS_BreakAll);
+  }
+};
+
 template <>
 struct ScalarEnumerationTraits<FormatStyle::DefinitionReturnTypeBreakingStyle> {
   static void
@@ -1120,6 +1100,10 @@ template <> struct MappingTraits<FormatStyle> {
     IO.mapOptional("StatementAttributeLikeMacros",
                    Style.StatementAttributeLikeMacros);
     IO.mapOptional("StatementMacros", Style.StatementMacros);
+    IO.mapOptional("TableGenBreakingDAGArgOperators",
+                   Style.TableGenBreakingDAGArgOperators);
+    IO.mapOptional("TableGenBreakInsideDAGArg",
+                   Style.TableGenBreakInsideDAGArg);
     IO.mapOptional("TabWidth", Style.TabWidth);
     IO.mapOptional("TypeNames", Style.TypeNames);
     IO.mapOptional("TypenameMacros", Style.TypenameMacros);
@@ -1580,6 +1564,8 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
   LLVMStyle.StatementAttributeLikeMacros.push_back("Q_EMIT");
   LLVMStyle.StatementMacros.push_back("Q_UNUSED");
   LLVMStyle.StatementMacros.push_back("QT_REQUIRE_VERSION");
+  LLVMStyle.TableGenBreakingDAGArgOperators = {};
+  LLVMStyle.TableGenBreakInsideDAGArg = FormatStyle::DAS_DontBreak;
   LLVMStyle.TabWidth = 8;
   LLVMStyle.UseTab = FormatStyle::UT_Never;
   LLVMStyle.VerilogBreakBetweenInstancePorts = true;
diff --git a/clang/lib/Format/FormatInternal.h b/clang/lib/Format/FormatInternal.h
index 9043ce32e9e32..60c5bf6b786b0 100644
--- a/clang/lib/Format/FormatInternal.h
+++ b/clang/lib/Format/FormatInternal.h
@@ -15,7 +15,9 @@
 #ifndef LLVM_CLANG_LIB_FORMAT_FORMATINTERNAL_H
 #define LLVM_CLANG_LIB_FORMAT_FORMATINTERNAL_H
 
-#include "BreakableToken.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Format/Format.h"
+#include "clang/Tooling/Core/Replacement.h"
 #include <utility>
 
 namespace clang {
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index f4566e4a33513..06f567059c357 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -19,8 +19,6 @@
 #include "clang/Basic/OperatorPrecedence.h"
 #include "clang/Format/Format.h"
 #include "clang/Lex/Lexer.h"
-#include <memory>
-#include <optional>
 #include <unordered_set>
 
 namespace clang {
@@ -155,7 +153,11 @@ namespace format {
   TYPE(TableGenDAGArgCloser)                                                   \
   TYPE(TableGenDAGArgListColon)                                                \
   TYPE(TableGenDAGArgListComma)                                                \
+  TYPE(TableGenDAGArgListCommaToBreak)                                         \
   TYPE(TableGenDAGArgOpener)                                                   \
+  TYPE(TableGenDAGArgOpenerToBreak)                                            \
+  TYPE(TableGenDAGArgOperatorID)                                               \
+  TYPE(TableGenDAGArgOperatorToBreak)                                          \
   TYPE(TableGenListCloser)                                                     \
   TYPE(TableGenListOpener)                                                     \
   TYPE(TableGenMultiLineString)                                                \
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 65dd733bd5335..277cc0a2dfde6 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -17,14 +17,9 @@
 
 #include "Encoding.h"
 #include "FormatToken.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/SourceLocation.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Format/Format.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/Support/Regex.h"
 
 #include <stack>
 
diff --git a/clang/lib/Format/FormatTokenSource.h b/clang/lib/Format/FormatTokenSource.h
index 7819244eb7d19..cce19f527a923 100644
--- a/clang/lib/Format/FormatTokenSource.h
+++ b/clang/lib/Format/FormatTokenSource.h
@@ -15,10 +15,7 @@
 #ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENSOURCE_H
 #define LLVM_CLANG_LIB_FORMAT_FORMATTOKENSOURCE_H
 
-#include "FormatToken.h"
 #include "UnwrappedLineParser.h"
-#include "llvm/ADT/DenseMap.h"
-#include <cstddef>
 
 #define DEBUG_TYPE "format-token-source"
 
diff --git a/clang/lib/Format/Macros.h b/clang/lib/Format/Macros.h
index d2f7fe502364c..fb12d22299dea 100644
--- a/clang/lib/Format/Macros.h
+++ b/clang/lib/Format/Macros.h
@@ -39,15 +39,9 @@
 #define CLANG_LIB_FORMAT_MACROS_H
 
 #include <list>
-#include <map>
-#include <string>
-#include <vector>
 
 #include "FormatToken.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/SortJavaScriptImports.h b/clang/lib/Format/SortJavaScriptImports.h
index 7336db9537b0d..b55b149aab4cd 100644
--- a/clang/lib/Format/SortJavaScriptImports.h
+++ b/clang/lib/Format/SortJavaScriptImports.h
@@ -14,10 +14,7 @@
 #ifndef LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H
 #define LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H
 
-#include "clang/Basic/LLVM.h"
 #include "clang/Format/Format.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h
index 4086dab1c94c3..b7494c395c8a2 100644
--- a/clang/lib/Format/TokenAnalyzer.h
+++ b/clang/lib/Format/TokenAnalyzer.h
@@ -17,19 +17,8 @@
 #define LLVM_CLANG_LIB_FORMAT_TOKENANALYZER_H
 
 #include "AffectedRangeManager.h"
-#include "Encoding.h"
-#include "FormatToken.h"
 #include "FormatTokenLexer.h"
 #include "TokenAnnotator.h"
-#include "UnwrappedLineParser.h"
-#include "clang/Basic/Diagnostic.h"
-#include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/FileManager.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Format/Format.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Debug.h"
-#include <memory>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index d7b84e309e096..94d2266555f6b 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -989,16 +989,59 @@ class AnnotatingParser {
     return false;
   }
 
+  // Judge if the token is a operator ID to insert line break in DAGArg.
+  // That is, TableGenBreakingDAGArgOperators is empty (by the definition of the
+  // option) or the token is in the list.
+  bool isTableGenDAGArgBreakingOperator(const FormatToken &Tok) {
+    auto &Opes = Style.TableGenBreakingDAGArgOperators;
+    // If the list is empty, all operators are breaking operators.
+    if (Opes.empty())
+      return true;
+    // Otherwise, the operator is limited to normal identifiers.
+    if (Tok.isNot(tok::identifier) ||
+        Tok.isOneOf(TT_TableGenBangOperator, TT_TableGenCondOperator)) {
+      return false;
+    }
+    // The case next is colon, it is not a operator of identifier.
+    if (!Tok.Next || Tok.Next->is(tok::colon))
+      return false;
+    return std::find(Opes.begin(), Opes.end(), Tok.TokenText.str()) !=
+           Opes.end();
+  }
+
   // SimpleValue6 ::=  "(" DagArg [DagArgList] ")"
   // This parses SimpleValue 6's inside part of "(" ")"
   bool parseTableGenDAGArgAndList(FormatToken *Opener) {
+    FormatToken *FirstTok = CurrentToken;
     if (!parseTableGenDAGArg())
       return false;
+    bool BreakInside = false;
+    if (Style.TableGenBreakInsideDAGArg != FormatStyle::DAS_DontBreak) {
+      // Specialized detection for DAGArgOperator, that determines the way of
+      // line break for this DAGArg elements.
+      if (isTableGenDAGArgBreakingOperator(*FirstTok)) {
+        // Special case for identifier DAGArg operator.
+        BreakInside = true;
+        Opener->setType(TT_TableGenDAGArgOpenerToBreak);
+        if (FirstTok->isOneOf(TT_TableGenBangOperator,
+                              TT_TableGenCondOperator)) {
+          // Special case for bang/cond operators. Set the whole operator as
+          // the DAGArg operator. Always break after it.
+          CurrentToken->Previous->setType(TT_TableGenDAGArgOperatorToBreak);
+        } else if (FirstTok->is(tok::identifier)) {
+          if (Style.TableGenBreakInsideDAGArg == FormatStyle::DAS_BreakAll)
+            FirstTok->setType(TT_TableGenDAGArgOperatorToBreak);
+          else
+            FirstTok->setType(TT_TableGenDAGArgOperatorID);
+        }
+      }
+    }
     // Parse the [DagArgList] part
     bool FirstDAGArgListElm = true;
     while (CurrentToken) {
       if (!FirstDAGArgListElm && CurrentToken->is(tok::comma)) {
-        CurrentToken->setType(TT_TableGenDAGArgListComma);
+        CurrentToken->setType(BreakInside ? TT_TableGenDAGArgListCommaToBreak
+                                          : TT_TableGenDAGArgListComma);
         skipToNextNonComment();
       }
       if (CurrentToken && CurrentToken->is(tok::r_paren)) {
@@ -3596,6 +3639,13 @@ static bool isFunctionDeclarationName(bool IsCpp, const FormatToken &Current,
   if (!Current.Tok.getIdentifierInfo())
     return false;
 
+  const auto &Previous = *Current.Previous;
+
+  if (const auto *PrevPrev = Previous.Previous;
+      PrevPrev && PrevPrev->is(TT_ObjCDecl)) {
+    return false;
+  }
+
   auto skipOperatorName =
       [IsCpp](const FormatToken *Next) -> const FormatToken * {
     for (; Next; Next = Next->Next) {
@@ -3635,18 +3685,17 @@ static bool isFunctionDeclarationName(bool IsCpp, const FormatToken &Current,
   // Find parentheses of parameter list.
   const FormatToken *Next = Current.Next;
   if (Current.is(tok::kw_operator)) {
-    const auto *Previous = Current.Previous;
-    if (Previous->Tok.getIdentifierInfo() &&
-        !Previous->isOneOf(tok::kw_return, tok::kw_co_return)) {
+    if (Previous.Tok.getIdentifierInfo() &&
+        !Previous.isOneOf(tok::kw_return, tok::kw_co_return)) {
       return true;
     }
-    if (Previous->is(tok::r_paren) && Previous->is(TT_TypeDeclarationParen)) {
-      assert(Previous->MatchingParen);
-      assert(Previous->MatchingParen->is(tok::l_paren));
-      assert(Previous->MatchingParen->is(TT_TypeDeclarationParen));
+    if (Previous.is(tok::r_paren) && Previous.is(TT_TypeDeclarationParen)) {
+      assert(Previous.MatchingParen);
+      assert(Previous.MatchingParen->is(tok::l_paren));
+      assert(Previous.MatchingParen->is(TT_TypeDeclarationParen));
       return true;
     }
-    if (!Previous->isPointerOrReference() && Previous->isNot(TT_TemplateCloser))
+    if (!Previous.isPointerOrReference() && Previous.isNot(TT_TemplateCloser))
       return false;
     Next = skipOperatorName(Next);
   } else {
@@ -4268,6 +4317,10 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
     return Left.is(tok::hash);
   if (Left.isOneOf(tok::hashhash, tok::hash))
     return Right.is(tok::hash);
+  if (Left.is(BK_Block) && Right.is(tok::r_brace) &&
+      Right.MatchingParen == &Left && Line.Children.empty()) {
+    return Style.SpaceInEmptyBlock;
+  }
   if ((Left.is(tok::l_paren) && Right.is(tok::r_paren)) ||
       (Left.is(tok::l_brace) && Left.isNot(BK_Block) &&
        Right.is(tok::r_brace) && Right.isNot(BK_Block))) {
@@ -4610,7 +4663,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
       return Style.SpaceBeforeParensOptions.AfterFunctionDefinitionName ||
              spaceRequiredBeforeParens(Right);
     }
-    if (!Left.Previous || Left.Previous->isNot(tok::period)) {
+    if (!Left.Previous || !Left.Previous->isOneOf(tok::period, tok::arrow)) {
       if (Left.isOneOf(tok::kw_try, Keywords.kw___except, tok::kw_catch)) {
         return Style.SpaceBeforeParensOptions.AfterControlStatements ||
                spaceRequiredBeforeParens(Right);
@@ -5083,6 +5136,11 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
     }
     if (Right.is(TT_TableGenCondOperatorColon))
       return false;
+    if (Left.isOneOf(TT_TableGenDAGArgOperatorID,
+                     TT_TableGenDAGArgOperatorToBreak) &&
+        Right.isNot(TT_TableGenDAGArgCloser)) {
+      return true;
+    }
     // Do not insert bang operators and consequent openers.
     if (Right.isOneOf(tok::l_paren, tok::less) &&
         Left.isOneOf(TT_TableGenBangOperator, TT_TableGenCondOperator)) {
@@ -5459,6 +5517,18 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line,
     //       case2:0);
     if (Left.is(TT_TableGenCondOperatorComma))
       return true;
+    if (Left.is(TT_TableGenDAGArgOperatorToBreak) &&
+        Right.isNot(TT_TableGenDAGArgCloser)) {
+      return true;
+    }
+    if (Left.is(TT_TableGenDAGArgListCommaToBreak))
+      return true;
+    if (Right.is(TT_TableGenDAGArgCloser) && Right.MatchingParen &&
+        Right.MatchingParen->is(TT_TableGenDAGArgOpenerToBreak) &&
+        &Left != Right.MatchingParen->Next) {
+      // Check to avoid empty DAGArg such as (ins).
+      return Style.TableGenBreakInsideDAGArg == FormatStyle::DAS_BreakAll;
+    }
   }
 
   if (Line.startsWith(tok::kw_asm) && Right.is(TT_InlineASMColon) &&
@@ -5873,6 +5943,8 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
     // Avoid to break around paste operator.
     if (Left.is(tok::hash) || Right.is(tok::hash))
       return false;
+    if (Left.isOneOf(TT_TableGenBangOperator, TT_TableGenCondOperator))
+      return false;
   }
 
   if (Left.is(tok::at))
diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h
index a631e5f52bc60..25a24dccb1b83 100644
--- a/clang/lib/Format/TokenAnnotator.h
+++ b/clang/lib/Format/TokenAnnotator.h
@@ -16,7 +16,6 @@
 #define LLVM_CLANG_LIB_FORMAT_TOKENANNOTATOR_H
 
 #include "UnwrappedLineParser.h"
-#include "clang/Format/Format.h"
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/UnwrappedLineFormatter.h b/clang/lib/Format/UnwrappedLineFormatter.h
index ee6d31de8c422..9b8acf427a2a0 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.h
+++ b/clang/lib/Format/UnwrappedLineFormatter.h
@@ -16,8 +16,6 @@
 #define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEFORMATTER_H
 
 #include "ContinuationIndenter.h"
-#include "clang/Format/Format.h"
-#include <map>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index e3f0af176b567..98ae1c8f62bbc 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -1222,7 +1222,6 @@ void UnwrappedLineParser::parsePPUnknown() {
 static bool tokenCanStartNewLine(const FormatToken &Tok) {
   // Semicolon can be a null-statement, l_square can be a start of a macro or
   // a C++11 attribute, but this doesn't seem to be common.
-  assert(Tok.isNot(TT_AttributeSquare));
   return !Tok.isOneOf(tok::semi, tok::l_brace,
                       // Tokens that can only be used as binary operators and a
                       // part of overloaded operator names.
@@ -3710,14 +3709,19 @@ bool UnwrappedLineParser::parseEnum() {
   if (Style.Language == FormatStyle::LK_Proto && FormatTok->is(tok::equal))
     return false;
 
-  // Eat up enum class ...
-  if (FormatTok->isOneOf(tok::kw_class, tok::kw_struct))
-    nextToken();
+  if (IsCpp) {
+    // Eat up enum class ...
+    if (FormatTok->isOneOf(tok::kw_class, tok::kw_struct))
+      nextToken();
+    while (FormatTok->is(tok::l_square))
+      if (!handleCppAttributes())
+        return false;
+  }
 
   while (FormatTok->Tok.getIdentifierInfo() ||
          FormatTok->isOneOf(tok::colon, tok::coloncolon, tok::less,
                             tok::greater, tok::comma, tok::question,
-                            tok::l_square, tok::r_square)) {
+                            tok::l_square)) {
     if (Style.isVerilog()) {
       FormatTok->setFinalizedType(TT_VerilogDimensionedTypeName);
       nextToken();
@@ -3730,7 +3734,6 @@ bool UnwrappedLineParser::parseEnum() {
     // We can have macros or attributes in between 'enum' and the enum name.
     if (FormatTok->is(tok::l_paren))
       parseParens();
-    assert(FormatTok->isNot(TT_AttributeSquare));
     if (FormatTok->is(tok::identifier)) {
       nextToken();
       // If there are two identifiers in a row, this is likely an elaborate
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index 619fbb217882b..e2cf28c0c065d 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -15,17 +15,8 @@
 #ifndef LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
 #define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
 
-#include "Encoding.h"
-#include "FormatToken.h"
 #include "Macros.h"
-#include "clang/Basic/IdentifierTable.h"
-#include "clang/Format/Format.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/Support/Regex.h"
-#include <list>
 #include <stack>
-#include <vector>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/WhitespaceManager.h b/clang/lib/Format/WhitespaceManager.h
index 9942e0f35738f..0ebc6cf8377cd 100644
--- a/clang/lib/Format/WhitespaceManager.h
+++ b/clang/lib/Format/WhitespaceManager.h
@@ -17,11 +17,6 @@
 
 #include "TokenAnnotator.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Format/Format.h"
-#include "llvm/ADT/SmallVector.h"
-#include <algorithm>
-#include <string>
-#include <tuple>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 83d11dd32bf3a..de229c5488fd4 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3207,6 +3207,22 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
   bool IsIndexHeaderMap = false;
   bool IsSysrootSpecified =
       Args.hasArg(OPT__sysroot_EQ) || Args.hasArg(OPT_isysroot);
+
+  // Expand a leading `=` to the sysroot if one was passed (and it's not a
+  // framework flag).
+  auto PrefixHeaderPath = [IsSysrootSpecified,
+                           &Opts](const llvm::opt::Arg *A,
+                                  bool IsFramework = false) -> std::string {
+    assert(A->getNumValues() && "Unexpected empty search path flag!");
+    if (IsSysrootSpecified && !IsFramework && A->getValue()[0] == '=') {
+      SmallString<32> Buffer;
+      llvm::sys::path::append(Buffer, Opts.Sysroot,
+                              llvm::StringRef(A->getValue()).substr(1));
+      return std::string(Buffer);
+    }
+    return A->getValue();
+  };
+
   for (const auto *A : Args.filtered(OPT_I, OPT_F, OPT_index_header_map)) {
     if (A->getOption().matches(OPT_index_header_map)) {
       // -index-header-map applies to the next -I or -F.
@@ -3218,16 +3234,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
         IsIndexHeaderMap ? frontend::IndexHeaderMap : frontend::Angled;
 
     bool IsFramework = A->getOption().matches(OPT_F);
-    std::string Path = A->getValue();
-
-    if (IsSysrootSpecified && !IsFramework && A->getValue()[0] == '=') {
-      SmallString<32> Buffer;
-      llvm::sys::path::append(Buffer, Opts.Sysroot,
-                              llvm::StringRef(A->getValue()).substr(1));
-      Path = std::string(Buffer);
-    }
-
-    Opts.AddPath(Path, Group, IsFramework,
+    Opts.AddPath(PrefixHeaderPath(A, IsFramework), Group, IsFramework,
                  /*IgnoreSysroot*/ true);
     IsIndexHeaderMap = false;
   }
@@ -3245,12 +3252,18 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
   }
 
   for (const auto *A : Args.filtered(OPT_idirafter))
-    Opts.AddPath(A->getValue(), frontend::After, false, true);
+    Opts.AddPath(PrefixHeaderPath(A), frontend::After, false, true);
   for (const auto *A : Args.filtered(OPT_iquote))
-    Opts.AddPath(A->getValue(), frontend::Quoted, false, true);
-  for (const auto *A : Args.filtered(OPT_isystem, OPT_iwithsysroot))
-    Opts.AddPath(A->getValue(), frontend::System, false,
-                 !A->getOption().matches(OPT_iwithsysroot));
+    Opts.AddPath(PrefixHeaderPath(A), frontend::Quoted, false, true);
+
+  for (const auto *A : Args.filtered(OPT_isystem, OPT_iwithsysroot)) {
+    if (A->getOption().matches(OPT_iwithsysroot)) {
+      Opts.AddPath(A->getValue(), frontend::System, false,
+                   /*IgnoreSysRoot=*/false);
+      continue;
+    }
+    Opts.AddPath(PrefixHeaderPath(A), frontend::System, false, true);
+  }
   for (const auto *A : Args.filtered(OPT_iframework))
     Opts.AddPath(A->getValue(), frontend::System, true, true);
   for (const auto *A : Args.filtered(OPT_iframeworkwithsysroot))
@@ -3309,6 +3322,17 @@ static void ParseAPINotesArgs(APINotesOptions &Opts, ArgList &Args,
     Opts.ModuleSearchPaths.push_back(A->getValue());
 }
 
+static void GeneratePointerAuthArgs(const LangOptions &Opts,
+                                    ArgumentConsumer Consumer) {
+  if (Opts.PointerAuthIntrinsics)
+    GenerateArg(Consumer, OPT_fptrauth_intrinsics);
+}
+
+static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
+                                 DiagnosticsEngine &Diags) {
+  Opts.PointerAuthIntrinsics = Args.hasArg(OPT_fptrauth_intrinsics);
+}
+
 /// Check if input file kind and language standard are compatible.
 static bool IsInputCompatibleWithStandard(InputKind IK,
                                           const LangStandard &S) {
@@ -4191,6 +4215,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
 
       if (TT.getArch() == llvm::Triple::UnknownArch ||
           !(TT.getArch() == llvm::Triple::aarch64 || TT.isPPC() ||
+            TT.getArch() == llvm::Triple::systemz ||
             TT.getArch() == llvm::Triple::nvptx ||
             TT.getArch() == llvm::Triple::nvptx64 ||
             TT.getArch() == llvm::Triple::amdgcn ||
@@ -4810,6 +4835,8 @@ bool CompilerInvocation::CreateFromArgsImpl(
                         Res.getFileSystemOpts().WorkingDir);
   ParseAPINotesArgs(Res.getAPINotesOpts(), Args, Diags);
 
+  ParsePointerAuthArgs(LangOpts, Args, Diags);
+
   ParseLangArgs(LangOpts, Args, DashX, T, Res.getPreprocessorOpts().Includes,
                 Diags);
   if (Res.getFrontendOpts().ProgramAction == frontend::RewriteObjC)
@@ -5048,6 +5075,7 @@ void CompilerInvocationBase::generateCC1CommandLine(
   GenerateTargetArgs(getTargetOpts(), Consumer);
   GenerateHeaderSearchArgs(getHeaderSearchOpts(), Consumer);
   GenerateAPINotesArgs(getAPINotesOpts(), Consumer);
+  GeneratePointerAuthArgs(getLangOpts(), Consumer);
   GenerateLangArgs(getLangOpts(), Consumer, T, getFrontendOpts().DashX);
   GenerateCodeGenArgs(getCodeGenOpts(), Consumer, T,
                       getFrontendOpts().OutputFile, &getLangOpts());
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index b9a966be73ee9..97104ccd8db59 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -214,6 +214,7 @@ set(x86_files
   popcntintrin.h
   prfchiintrin.h
   prfchwintrin.h
+  ptrauth.h
   ptwriteintrin.h
   raointintrin.h
   rdpruintrin.h
@@ -253,8 +254,10 @@ set(x86_files
   )
 
 set(windows_only_files
+  intrin0.h
   intrin.h
   vadefs.h
+  yvals_core.h
 )
 
 set(utility_files
diff --git a/clang/lib/Headers/__stddef_null.h b/clang/lib/Headers/__stddef_null.h
index 7336fdab38972..c10bd2d7d9887 100644
--- a/clang/lib/Headers/__stddef_null.h
+++ b/clang/lib/Headers/__stddef_null.h
@@ -7,7 +7,7 @@
  *===-----------------------------------------------------------------------===
  */
 
-#if !defined(NULL) || !__has_feature(modules)
+#if !defined(NULL) || !__building_module(_Builtin_stddef)
 
 /* linux/stddef.h will define NULL to 0. glibc (and other) headers then define
  * __need_NULL and rely on stddef.h to redefine NULL to the correct value again.
diff --git a/clang/lib/Headers/__stddef_nullptr_t.h b/clang/lib/Headers/__stddef_nullptr_t.h
index 183d394d56c1b..7f3fbe6fe0d3a 100644
--- a/clang/lib/Headers/__stddef_nullptr_t.h
+++ b/clang/lib/Headers/__stddef_nullptr_t.h
@@ -7,7 +7,12 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef _NULLPTR_T
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_NULLPTR_T) ||                                                    \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
 #define _NULLPTR_T
 
 #ifdef __cplusplus
diff --git a/clang/lib/Headers/__stddef_offsetof.h b/clang/lib/Headers/__stddef_offsetof.h
index 3b347b3b92f62..84172c6cd2735 100644
--- a/clang/lib/Headers/__stddef_offsetof.h
+++ b/clang/lib/Headers/__stddef_offsetof.h
@@ -7,6 +7,11 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef offsetof
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(offsetof) ||                                                      \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
 #define offsetof(t, d) __builtin_offsetof(t, d)
 #endif
diff --git a/clang/lib/Headers/__stddef_ptrdiff_t.h b/clang/lib/Headers/__stddef_ptrdiff_t.h
index 3ea6d7d2852e1..fd3c893c66c97 100644
--- a/clang/lib/Headers/__stddef_ptrdiff_t.h
+++ b/clang/lib/Headers/__stddef_ptrdiff_t.h
@@ -7,7 +7,12 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef _PTRDIFF_T
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_PTRDIFF_T) ||                                                    \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
 #define _PTRDIFF_T
 
 typedef __PTRDIFF_TYPE__ ptrdiff_t;
diff --git a/clang/lib/Headers/__stddef_rsize_t.h b/clang/lib/Headers/__stddef_rsize_t.h
index b6428d0c12b62..dd433d40d9733 100644
--- a/clang/lib/Headers/__stddef_rsize_t.h
+++ b/clang/lib/Headers/__stddef_rsize_t.h
@@ -7,7 +7,12 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef _RSIZE_T
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_RSIZE_T) ||                                                      \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
 #define _RSIZE_T
 
 typedef __SIZE_TYPE__ rsize_t;
diff --git a/clang/lib/Headers/__stddef_size_t.h b/clang/lib/Headers/__stddef_size_t.h
index e4a389510bcdb..3dd7b1f379294 100644
--- a/clang/lib/Headers/__stddef_size_t.h
+++ b/clang/lib/Headers/__stddef_size_t.h
@@ -7,7 +7,12 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef _SIZE_T
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_SIZE_T) ||                                                       \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
 #define _SIZE_T
 
 typedef __SIZE_TYPE__ size_t;
diff --git a/clang/lib/Headers/__stddef_unreachable.h b/clang/lib/Headers/__stddef_unreachable.h
index 3e7fe01979662..518580c92d3f5 100644
--- a/clang/lib/Headers/__stddef_unreachable.h
+++ b/clang/lib/Headers/__stddef_unreachable.h
@@ -7,6 +7,11 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef unreachable
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(unreachable) ||                                                   \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
 #define unreachable() __builtin_unreachable()
 #endif
diff --git a/clang/lib/Headers/__stddef_wchar_t.h b/clang/lib/Headers/__stddef_wchar_t.h
index 16a6186512c0c..bd69f63225416 100644
--- a/clang/lib/Headers/__stddef_wchar_t.h
+++ b/clang/lib/Headers/__stddef_wchar_t.h
@@ -9,7 +9,12 @@
 
 #if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED)
 
-#ifndef _WCHAR_T
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_WCHAR_T) ||                                                      \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
 #define _WCHAR_T
 
 #ifdef _MSC_EXTENSIONS
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index ecd9bf18a41ca..a8882e82e171a 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -2201,6 +2201,10 @@ _mm256_cvtpd_ps(__m256d __a)
 
 /// Converts a vector of [8 x float] into a vector of [8 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
@@ -2230,9 +2234,13 @@ _mm256_cvtps_pd(__m128 __a)
   return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
 }
 
-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
-///    x i32], truncating the result by rounding towards zero when it is
-///    inexact.
+/// Converts a 256-bit vector of [4 x double] into four signed truncated
+///    (rounded toward zero) 32-bit integers returned in a 128-bit vector of
+///    [4 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2247,9 +2255,12 @@ _mm256_cvttpd_epi32(__m256d __a)
   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
 }
 
-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
-///    x i32]. When a conversion is inexact, the value returned is rounded
-///    according to the rounding control bits in the MXCSR register.
+/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+///    [4 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2264,8 +2275,12 @@ _mm256_cvtpd_epi32(__m256d __a)
   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
 }
 
-/// Converts a vector of [8 x float] into a vector of [8 x i32],
-///    truncating the result by rounding towards zero when it is inexact.
+/// Converts a vector of [8 x float] into eight signed truncated (rounded
+///    toward zero) 32-bit integers returned in a vector of [8 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/Headers/bmiintrin.h b/clang/lib/Headers/bmiintrin.h
index d8e57c0cb4940..78bffe68e221a 100644
--- a/clang/lib/Headers/bmiintrin.h
+++ b/clang/lib/Headers/bmiintrin.h
@@ -161,8 +161,7 @@ _mm_tzcnt_64(unsigned long long __X)
 
 #undef __RELAXED_FN_ATTRS
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__BMI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__)
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
@@ -610,7 +609,6 @@ __blsr_u64(unsigned long long __X)
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules)   \
-          || defined(__BMI__) */
+#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__BMI__) */
 
 #endif /* __BMIINTRIN_H */
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 984f0cf917e99..f0c2db752195a 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -1294,6 +1294,10 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
 ///    64 bits of the result vector are set to zero.
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
@@ -1309,6 +1313,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
 /// Converts the low-order element of a 128-bit vector of [2 x double]
 ///    into a 32-bit signed integer value.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
@@ -1394,12 +1402,13 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
 }
 
 /// Converts the two double-precision floating-point elements of a
-///    128-bit vector of [2 x double] into two signed 32-bit integer values,
-///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
+///    128-bit vector of [2 x double] into two signed truncated (rounded
+///    toward zero) 32-bit integer values, returned in the lower 64 bits
+///    of a 128-bit vector of [4 x i32].
 ///
-///    If the result of either conversion is inexact, the result is truncated
-///    (rounded towards zero) regardless of the current MXCSR setting. The upper
-///    64 bits of the result vector are set to zero.
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1415,7 +1424,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
 }
 
 /// Converts the low-order element of a [2 x double] vector into a 32-bit
-///    signed integer value, truncating the result when it is inexact.
+///    signed truncated (rounded toward zero) integer value.
+///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1434,6 +1447,10 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
 ///    returned in a 64-bit vector of [2 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
@@ -1446,11 +1463,12 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
 }
 
 /// Converts the two double-precision floating-point elements of a
-///    128-bit vector of [2 x double] into two signed 32-bit integer values,
-///    returned in a 64-bit vector of [2 x i32].
+///    128-bit vector of [2 x double] into two signed truncated (rounded toward
+///    zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
 ///
-///    If the result of either conversion is inexact, the result is truncated
-///    (rounded towards zero) regardless of the current MXCSR setting.
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3216,7 +3234,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
 }
 
 /// Converts the first (lower) element of a vector of [2 x double] into a
-///    64-bit signed integer value, according to the current rounding mode.
+///    64-bit signed integer value.
+///
+///    If the converted value does not fit in a 64-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3231,7 +3253,11 @@ static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
 }
 
 /// Converts the first (lower) element of a vector of [2 x double] into a
-///    64-bit signed integer value, truncating the result when it is inexact.
+///    64-bit signed truncated (rounded toward zero) integer value.
+///
+///    If a converted value does not fit in a 64-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3262,6 +3288,10 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
 
 /// Converts a vector of [4 x float] into a vector of [4 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
@@ -3274,8 +3304,12 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
 }
 
-/// Converts a vector of [4 x float] into a vector of [4 x i32],
-///    truncating the result when it is inexact.
+/// Converts a vector of [4 x float] into four signed truncated (rounded toward
+///    zero) 32-bit integers, returned in a vector of [4 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 45f8544392584..5e703772b7ee4 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -252,6 +252,116 @@ double3 ceil(double3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_ceil)
 double4 ceil(double4);
 
+//===----------------------------------------------------------------------===//
+// clamp builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T clamp(T X, T Min, T Max)
+/// \brief Clamps the specified value \a X to the specified
+/// minimum ( \a Min) and maximum ( \a Max) range.
+/// \param X A value to clamp.
+/// \param Min The specified minimum range.
+/// \param Max The specified maximum range.
+///
+/// Returns The clamped value for the \a X parameter.
+/// For values of -INF or INF, clamp will behave as expected.
+/// However for values of NaN, the results are undefined.
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+half clamp(half, half, half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+half2 clamp(half2, half2, half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+half3 clamp(half3, half3, half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+half4 clamp(half4, half4, half4);
+
+#ifdef __HLSL_ENABLE_16_BIT
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int16_t clamp(int16_t, int16_t, int16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int16_t2 clamp(int16_t2, int16_t2, int16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int16_t3 clamp(int16_t3, int16_t3, int16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int16_t4 clamp(int16_t4, int16_t4, int16_t4);
+
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint16_t clamp(uint16_t, uint16_t, uint16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint16_t2 clamp(uint16_t2, uint16_t2, uint16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint16_t3 clamp(uint16_t3, uint16_t3, uint16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint16_t4 clamp(uint16_t4, uint16_t4, uint16_t4);
+#endif
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int clamp(int, int, int);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int2 clamp(int2, int2, int2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int3 clamp(int3, int3, int3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int4 clamp(int4, int4, int4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint clamp(uint, uint, uint);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint2 clamp(uint2, uint2, uint2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint3 clamp(uint3, uint3, uint3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint4 clamp(uint4, uint4, uint4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int64_t clamp(int64_t, int64_t, int64_t);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int64_t2 clamp(int64_t2, int64_t2, int64_t2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int64_t3 clamp(int64_t3, int64_t3, int64_t3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int64_t4 clamp(int64_t4, int64_t4, int64_t4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint64_t clamp(uint64_t, uint64_t, uint64_t);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint64_t2 clamp(uint64_t2, uint64_t2, uint64_t2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint64_t3 clamp(uint64_t3, uint64_t3, uint64_t3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint64_t4 clamp(uint64_t4, uint64_t4, uint64_t4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+float clamp(float, float, float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+float2 clamp(float2, float2, float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+float3 clamp(float3, float3, float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+float4 clamp(float4, float4, float4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+double clamp(double, double, double);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+double2 clamp(double2, double2, double2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+double3 clamp(double3, double3, double3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+double4 clamp(double4, double4, double4);
+
 //===----------------------------------------------------------------------===//
 // cos builtins
 //===----------------------------------------------------------------------===//
@@ -525,6 +635,39 @@ float3 frac(float3);
 _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
 float4 frac(float4);
 
+//===----------------------------------------------------------------------===//
+// isinf builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T isinf(T x)
+/// \brief Determines if the specified value \a x  is infinite.
+/// \param x The specified input value.
+///
+/// Returns a value of the same size as the input, with a value set
+/// to True if the x parameter is +INF or -INF. Otherwise, False.
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool isinf(half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool2 isinf(half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool3 isinf(half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool4 isinf(half4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool isinf(float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool2 isinf(float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool3 isinf(float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool4 isinf(float4);
+
 //===----------------------------------------------------------------------===//
 // lerp builtins
 //===----------------------------------------------------------------------===//
@@ -1153,6 +1296,39 @@ double3 rcp(double3);
 _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rcp)
 double4 rcp(double4);
 
+//===----------------------------------------------------------------------===//
+// rsqrt builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T rsqrt(T x)
+/// \brief Returns the reciprocal of the square root of the specified value.
+/// ie 1 / sqrt( \a x).
+/// \param x The specified input value.
+///
+/// This function uses the following formula: 1 / sqrt(x).
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+half rsqrt(half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+half2 rsqrt(half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+half3 rsqrt(half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+half4 rsqrt(half4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+float rsqrt(float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+float2 rsqrt(float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+float3 rsqrt(float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+float4 rsqrt(float4);
+
 //===----------------------------------------------------------------------===//
 // round builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 27800f7a8202c..508696d3725b9 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -16,281 +16,239 @@
 
 #include <x86gprintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MMX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MMX__)
 #include <mmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE__)
 #include <xmmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE2__)
 #include <emmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE3__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE3__)
 #include <pmmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSSE3__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSSE3__)
 #include <tmmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__SSE4_2__) || defined(__SSE4_1__))
 #include <smmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AES__) || defined(__PCLMUL__))
 #include <wmmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLFLUSHOPT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLFLUSHOPT__)
 #include <clflushoptintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLWB__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLWB__)
 #include <clwbintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX__)
 #include <avxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX2__)
 #include <avx2intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__F16C__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__F16C__)
 #include <f16cintrin.h>
 #endif
 
 /* No feature check desired due to internal checks */
 #include <bmiintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__BMI2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI2__)
 #include <bmi2intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__LZCNT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__LZCNT__)
 #include <lzcntintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__POPCNT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__POPCNT__)
 #include <popcntintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FMA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA__)
 #include <fmaintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512F__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512F__)
 #include <avx512fintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VL__)
 #include <avx512vlintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512BW__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BW__)
 #include <avx512bwintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512BITALG__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BITALG__)
 #include <avx512bitalgintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512CD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512CD__)
 #include <avx512cdintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VPOPCNTDQ__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
 #include <avx512vpopcntdqintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
 #include <avx512vpopcntdqvlintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VNNI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VNNI__)
 #include <avx512vnniintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512VNNI__))
 #include <avx512vlvnniintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXVNNI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNI__)
 #include <avxvnniintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512DQ__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512BITALG__))
 #include <avx512vlbitalgintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512BW__))
 #include <avx512vlbwintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512CD__))
 #include <avx512vlcdintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512DQ__))
 #include <avx512vldqintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512ER__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512ER__)
 #include <avx512erintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512IFMA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
 #include <avx512ifmaintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512IFMA__) && defined(__AVX512VL__))
 #include <avx512ifmavlintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXIFMA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXIFMA__)
 #include <avxifmaintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VBMI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI__)
 #include <avx512vbmiintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VBMI__) && defined(__AVX512VL__))
 #include <avx512vbmivlintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VBMI2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI2__)
 #include <avx512vbmi2intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
 #include <avx512vlvbmi2intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512PF__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512PF__)
 #include <avx512pfintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512FP16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
 #include <avx512fp16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512FP16__))
 #include <avx512vlfp16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512BF16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BF16__)
 #include <avx512bf16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512BF16__))
 #include <avx512vlbf16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PKU__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PKU__)
 #include <pkuintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__VPCLMULQDQ__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__VPCLMULQDQ__)
 #include <vpclmulqdqintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__VAES__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__VAES__)
 #include <vaesintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__GFNI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__GFNI__)
 #include <gfniintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXVNNIINT8__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT8__)
 #include <avxvnniint8intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXNECONVERT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXNECONVERT__)
 #include <avxneconvertintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SHA512__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA512__)
 #include <sha512intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SM3__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SM3__)
 #include <sm3intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SM4__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SM4__)
 #include <sm4intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXVNNIINT16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT16__)
 #include <avxvnniint16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDPID__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPID__)
 /// Reads the value of the IA32_TSC_AUX MSR (0xc0000103).
 ///
 /// \headerfile <immintrin.h>
@@ -304,8 +262,7 @@ _rdpid_u32(void) {
 }
 #endif // __RDPID__
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDRND__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDRND__)
 /// Returns a 16-bit hardware-generated random value.
 ///
 /// \headerfile <immintrin.h>
@@ -367,8 +324,7 @@ _rdrand64_step(unsigned long long *__p)
 }
 #endif /* __RDRND__ */
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FSGSBASE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FSGSBASE__)
 #ifdef __x86_64__
 /// Reads the FS base register.
 ///
@@ -481,8 +437,7 @@ _writegsbase_u64(unsigned long long __V)
 #endif
 #endif /* __FSGSBASE__ */
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MOVBE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVBE__)
 
 /* The structs used below are to force the load/store to be unaligned. This
  * is accomplished with the __packed__ attribute. The __may_alias__ prevents
@@ -598,139 +553,118 @@ _storebe_i64(void * __P, long long __D) {
 #endif
 #endif /* __MOVBE */
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RTM__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RTM__)
 #include <rtmintrin.h>
 #include <xtestintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SHA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA__)
 #include <shaintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FXSR__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FXSR__)
 #include <fxsrintrin.h>
 #endif
 
 /* No feature check desired due to internal MSC_VER checks */
 #include <xsaveintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XSAVEOPT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEOPT__)
 #include <xsaveoptintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XSAVEC__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEC__)
 #include <xsavecintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XSAVES__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVES__)
 #include <xsavesintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SHSTK__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SHSTK__)
 #include <cetintrin.h>
 #endif
 
 /* Intrinsics inside adcintrin.h are available at all times. */
 #include <adcintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__ADX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__ADX__)
 #include <adxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDSEED__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDSEED__)
 #include <rdseedintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__WBNOINVD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__WBNOINVD__)
 #include <wbnoinvdintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLDEMOTE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLDEMOTE__)
 #include <cldemoteintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__WAITPKG__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__WAITPKG__)
 #include <waitpkgintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MOVDIRI__) || defined(__MOVDIR64B__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVDIRI__) ||     \
+    defined(__MOVDIR64B__)
 #include <movdirintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PCONFIG__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
 #include <pconfigintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SGX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SGX__)
 #include <sgxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PTWRITE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PTWRITE__)
 #include <ptwriteintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__INVPCID__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
 #include <invpcidintrin.h>
 #endif
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMX_FP16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
 #include <amxfp16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__KL__) || defined(__WIDEKL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) ||          \
+    defined(__WIDEKL__)
 #include <keylockerintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMX_TILE__) || defined(__AMX_INT8__) || defined(__AMX_BF16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TILE__) ||    \
+    defined(__AMX_INT8__) || defined(__AMX_BF16__)
 #include <amxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMX_COMPLEX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
 #include <amxcomplexintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
 #include <avx512vlvp2intersectintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__ENQCMD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
 #include <enqcmdintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SERIALIZE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SERIALIZE__)
 #include <serializeintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__TSXLDTRK__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__TSXLDTRK__)
 #include <tsxldtrkintrin.h>
 #endif
 
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index a6395143db54c..fd27955fbe002 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -15,6 +15,8 @@
 #ifndef __INTRIN_H
 #define __INTRIN_H
 
+#include <intrin0.h>
+
 /* First include the standard intrinsics. */
 #if defined(__i386__) || defined(__x86_64__)
 #include <x86intrin.h>
@@ -131,8 +133,6 @@ void __writefsqword(unsigned long, unsigned __int64);
 void __writefsword(unsigned long, unsigned short);
 void __writemsr(unsigned long, unsigned __int64);
 void *_AddressOfReturnAddress(void);
-unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
-unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
 unsigned char _bittest(long const *, long);
 unsigned char _bittestandcomplement(long *, long);
 unsigned char _bittestandreset(long *, long);
@@ -151,7 +151,6 @@ long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
 __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
 __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
 void _ReadBarrier(void);
-void _ReadWriteBarrier(void);
 unsigned int _rorx_u32(unsigned int, const unsigned int);
 int _sarx_i32(int, unsigned int);
 #if __STDC_HOSTED__
@@ -182,12 +181,6 @@ unsigned char __readgsbyte(unsigned long);
 unsigned long __readgsdword(unsigned long);
 unsigned __int64 __readgsqword(unsigned long);
 unsigned short __readgsword(unsigned long);
-unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
-                                unsigned __int64 _HighPart,
-                                unsigned char _Shift);
-unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
-                                 unsigned __int64 _HighPart,
-                                 unsigned char _Shift);
 void __stosq(unsigned __int64 *, unsigned __int64, size_t);
 unsigned char __vmx_on(unsigned __int64 *);
 unsigned char __vmx_vmclear(unsigned __int64 *);
@@ -236,212 +229,10 @@ unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
 unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
 __int64 __mulh(__int64, __int64);
 unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
-__int64 _mul128(__int64, __int64, __int64*);
-unsigned __int64 _umul128(unsigned __int64,
-                          unsigned __int64,
-                          unsigned __int64*);
+__int64 _mul128(__int64, __int64, __int64 *);
 
 #endif /* __x86_64__ */
 
-#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
-
-unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
-unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
-
-#endif
-
-#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
-__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
-__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
-__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
-__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
-__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
-__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
-
-#endif
-
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange Add
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value);
-char _InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value);
-char _InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value);
-short _InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value);
-short _InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value);
-short _InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value);
-long _InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value);
-long _InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value);
-long _InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value);
-__int64 _InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value);
-__int64 _InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value);
-__int64 _InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Increment
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-short _InterlockedIncrement16_acq(short volatile *_Value);
-short _InterlockedIncrement16_nf(short volatile *_Value);
-short _InterlockedIncrement16_rel(short volatile *_Value);
-long _InterlockedIncrement_acq(long volatile *_Value);
-long _InterlockedIncrement_nf(long volatile *_Value);
-long _InterlockedIncrement_rel(long volatile *_Value);
-__int64 _InterlockedIncrement64_acq(__int64 volatile *_Value);
-__int64 _InterlockedIncrement64_nf(__int64 volatile *_Value);
-__int64 _InterlockedIncrement64_rel(__int64 volatile *_Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Decrement
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-short _InterlockedDecrement16_acq(short volatile *_Value);
-short _InterlockedDecrement16_nf(short volatile *_Value);
-short _InterlockedDecrement16_rel(short volatile *_Value);
-long _InterlockedDecrement_acq(long volatile *_Value);
-long _InterlockedDecrement_nf(long volatile *_Value);
-long _InterlockedDecrement_rel(long volatile *_Value);
-__int64 _InterlockedDecrement64_acq(__int64 volatile *_Value);
-__int64 _InterlockedDecrement64_nf(__int64 volatile *_Value);
-__int64 _InterlockedDecrement64_rel(__int64 volatile *_Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked And
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedAnd8_acq(char volatile *_Value, char _Mask);
-char _InterlockedAnd8_nf(char volatile *_Value, char _Mask);
-char _InterlockedAnd8_rel(char volatile *_Value, char _Mask);
-short _InterlockedAnd16_acq(short volatile *_Value, short _Mask);
-short _InterlockedAnd16_nf(short volatile *_Value, short _Mask);
-short _InterlockedAnd16_rel(short volatile *_Value, short _Mask);
-long _InterlockedAnd_acq(long volatile *_Value, long _Mask);
-long _InterlockedAnd_nf(long volatile *_Value, long _Mask);
-long _InterlockedAnd_rel(long volatile *_Value, long _Mask);
-__int64 _InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Bit Counting and Testing
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-unsigned char _interlockedbittestandset_acq(long volatile *_BitBase,
-                                            long _BitPos);
-unsigned char _interlockedbittestandset_nf(long volatile *_BitBase,
-                                           long _BitPos);
-unsigned char _interlockedbittestandset_rel(long volatile *_BitBase,
-                                            long _BitPos);
-unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase,
-                                              long _BitPos);
-unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase,
-                                             long _BitPos);
-unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase,
-                                              long _BitPos);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Or
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedOr8_acq(char volatile *_Value, char _Mask);
-char _InterlockedOr8_nf(char volatile *_Value, char _Mask);
-char _InterlockedOr8_rel(char volatile *_Value, char _Mask);
-short _InterlockedOr16_acq(short volatile *_Value, short _Mask);
-short _InterlockedOr16_nf(short volatile *_Value, short _Mask);
-short _InterlockedOr16_rel(short volatile *_Value, short _Mask);
-long _InterlockedOr_acq(long volatile *_Value, long _Mask);
-long _InterlockedOr_nf(long volatile *_Value, long _Mask);
-long _InterlockedOr_rel(long volatile *_Value, long _Mask);
-__int64 _InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Xor
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedXor8_acq(char volatile *_Value, char _Mask);
-char _InterlockedXor8_nf(char volatile *_Value, char _Mask);
-char _InterlockedXor8_rel(char volatile *_Value, char _Mask);
-short _InterlockedXor16_acq(short volatile *_Value, short _Mask);
-short _InterlockedXor16_nf(short volatile *_Value, short _Mask);
-short _InterlockedXor16_rel(short volatile *_Value, short _Mask);
-long _InterlockedXor_acq(long volatile *_Value, long _Mask);
-long _InterlockedXor_nf(long volatile *_Value, long _Mask);
-long _InterlockedXor_rel(long volatile *_Value, long _Mask);
-__int64 _InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedExchange8_acq(char volatile *_Target, char _Value);
-char _InterlockedExchange8_nf(char volatile *_Target, char _Value);
-char _InterlockedExchange8_rel(char volatile *_Target, char _Value);
-short _InterlockedExchange16_acq(short volatile *_Target, short _Value);
-short _InterlockedExchange16_nf(short volatile *_Target, short _Value);
-short _InterlockedExchange16_rel(short volatile *_Target, short _Value);
-long _InterlockedExchange_acq(long volatile *_Target, long _Value);
-long _InterlockedExchange_nf(long volatile *_Target, long _Value);
-long _InterlockedExchange_rel(long volatile *_Target, long _Value);
-__int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value);
-__int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value);
-__int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Compare Exchange
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedCompareExchange8_acq(char volatile *_Destination,
-                             char _Exchange, char _Comparand);
-char _InterlockedCompareExchange8_nf(char volatile *_Destination,
-                             char _Exchange, char _Comparand);
-char _InterlockedCompareExchange8_rel(char volatile *_Destination,
-                             char _Exchange, char _Comparand);
-short _InterlockedCompareExchange16_acq(short volatile *_Destination,
-                              short _Exchange, short _Comparand);
-short _InterlockedCompareExchange16_nf(short volatile *_Destination,
-                              short _Exchange, short _Comparand);
-short _InterlockedCompareExchange16_rel(short volatile *_Destination,
-                              short _Exchange, short _Comparand);
-long _InterlockedCompareExchange_acq(long volatile *_Destination,
-                              long _Exchange, long _Comparand);
-long _InterlockedCompareExchange_nf(long volatile *_Destination,
-                              long _Exchange, long _Comparand);
-long _InterlockedCompareExchange_rel(long volatile *_Destination,
-                              long _Exchange, long _Comparand);
-__int64 _InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand);
-__int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand);
-__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand);
-#endif
-#if defined(__x86_64__) || defined(__aarch64__)
-unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
-                                             __int64 _ExchangeHigh,
-                                             __int64 _ExchangeLow,
-                                             __int64 *_ComparandResult);
-#endif
-#if defined(__aarch64__)
-unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
-                                                 __int64 _ExchangeHigh,
-                                                 __int64 _ExchangeLow,
-                                                 __int64 *_ComparandResult);
-unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
-                                                __int64 _ExchangeHigh,
-                                                __int64 _ExchangeLow,
-                                                __int64 *_ComparandResult);
-unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
-                                                 __int64 _ExchangeHigh,
-                                                 __int64 _ExchangeLow,
-                                                 __int64 *_ComparandResult);
-#endif
-
 /*----------------------------------------------------------------------------*\
 |* movs, stos
 \*----------------------------------------------------------------------------*/
@@ -583,8 +374,6 @@ unsigned int _CountLeadingOnes(unsigned long);
 unsigned int _CountLeadingOnes64(unsigned __int64);
 unsigned int _CountLeadingSigns(long);
 unsigned int _CountLeadingSigns64(__int64);
-unsigned int _CountLeadingZeros(unsigned long);
-unsigned int _CountLeadingZeros64(unsigned _int64);
 unsigned int _CountOneBits(unsigned long);
 unsigned int _CountOneBits64(unsigned __int64);
 
diff --git a/clang/lib/Headers/intrin0.h b/clang/lib/Headers/intrin0.h
new file mode 100644
index 0000000000000..31f362ec84d5c
--- /dev/null
+++ b/clang/lib/Headers/intrin0.h
@@ -0,0 +1,247 @@
+/* ===-------- intrin.h ---------------------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <intrin0.h>
+#else
+
+#ifndef __INTRIN0_H
+#define __INTRIN0_H
+
+#ifdef __x86_64__
+#include <adcintrin.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+void _ReadWriteBarrier(void);
+
+#if defined(__aarch64__)
+unsigned int _CountLeadingZeros(unsigned long);
+unsigned int _CountLeadingZeros64(unsigned _int64);
+unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
+                                                 __int64 _ExchangeHigh,
+                                                 __int64 _ExchangeLow,
+                                                 __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
+                                                 __int64 _ExchangeHigh,
+                                                 __int64 _ExchangeLow,
+                                                 __int64 *_ComparandResult);
+#endif
+
+#ifdef __x86_64__
+unsigned __int64 _umul128(unsigned __int64, unsigned __int64,
+                          unsigned __int64 *);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+#endif
+
+#if defined(__x86_64__) || defined(__i386__)
+void _mm_pause(void);
+#endif
+
+#if defined(__x86_64__) || defined(__aarch64__)
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_ComparandResult);
+#endif
+
+#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) ||            \
+    defined(__aarch64__)
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
+#endif
+
+#if defined(__arm__) || defined(__aarch64__)
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+char _InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value);
+char _InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value);
+char _InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value);
+short _InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value);
+short _InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value);
+short _InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value);
+long _InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value);
+__int64 _InterlockedExchangeAdd64_acq(__int64 volatile *_Addend,
+                                      __int64 _Value);
+__int64 _InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value);
+__int64 _InterlockedExchangeAdd64_rel(__int64 volatile *_Addend,
+                                      __int64 _Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+short _InterlockedIncrement16_acq(short volatile *_Value);
+short _InterlockedIncrement16_nf(short volatile *_Value);
+short _InterlockedIncrement16_rel(short volatile *_Value);
+long _InterlockedIncrement_acq(long volatile *_Value);
+long _InterlockedIncrement_nf(long volatile *_Value);
+long _InterlockedIncrement_rel(long volatile *_Value);
+__int64 _InterlockedIncrement64_acq(__int64 volatile *_Value);
+__int64 _InterlockedIncrement64_nf(__int64 volatile *_Value);
+__int64 _InterlockedIncrement64_rel(__int64 volatile *_Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+short _InterlockedDecrement16_acq(short volatile *_Value);
+short _InterlockedDecrement16_nf(short volatile *_Value);
+short _InterlockedDecrement16_rel(short volatile *_Value);
+long _InterlockedDecrement_acq(long volatile *_Value);
+long _InterlockedDecrement_nf(long volatile *_Value);
+long _InterlockedDecrement_rel(long volatile *_Value);
+__int64 _InterlockedDecrement64_acq(__int64 volatile *_Value);
+__int64 _InterlockedDecrement64_nf(__int64 volatile *_Value);
+__int64 _InterlockedDecrement64_rel(__int64 volatile *_Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+char _InterlockedAnd8_acq(char volatile *_Value, char _Mask);
+char _InterlockedAnd8_nf(char volatile *_Value, char _Mask);
+char _InterlockedAnd8_rel(char volatile *_Value, char _Mask);
+short _InterlockedAnd16_acq(short volatile *_Value, short _Mask);
+short _InterlockedAnd16_nf(short volatile *_Value, short _Mask);
+short _InterlockedAnd16_rel(short volatile *_Value, short _Mask);
+long _InterlockedAnd_acq(long volatile *_Value, long _Mask);
+long _InterlockedAnd_nf(long volatile *_Value, long _Mask);
+long _InterlockedAnd_rel(long volatile *_Value, long _Mask);
+__int64 _InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask);
+
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+unsigned char _interlockedbittestandset_acq(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandset_nf(long volatile *_BitBase,
+                                           long _BitPos);
+unsigned char _interlockedbittestandset_rel(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase,
+                                              long _BitPos);
+unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase,
+                                             long _BitPos);
+unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase,
+                                              long _BitPos);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+char _InterlockedOr8_acq(char volatile *_Value, char _Mask);
+char _InterlockedOr8_nf(char volatile *_Value, char _Mask);
+char _InterlockedOr8_rel(char volatile *_Value, char _Mask);
+short _InterlockedOr16_acq(short volatile *_Value, short _Mask);
+short _InterlockedOr16_nf(short volatile *_Value, short _Mask);
+short _InterlockedOr16_rel(short volatile *_Value, short _Mask);
+long _InterlockedOr_acq(long volatile *_Value, long _Mask);
+long _InterlockedOr_nf(long volatile *_Value, long _Mask);
+long _InterlockedOr_rel(long volatile *_Value, long _Mask);
+__int64 _InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+char _InterlockedXor8_acq(char volatile *_Value, char _Mask);
+char _InterlockedXor8_nf(char volatile *_Value, char _Mask);
+char _InterlockedXor8_rel(char volatile *_Value, char _Mask);
+short _InterlockedXor16_acq(short volatile *_Value, short _Mask);
+short _InterlockedXor16_nf(short volatile *_Value, short _Mask);
+short _InterlockedXor16_rel(short volatile *_Value, short _Mask);
+long _InterlockedXor_acq(long volatile *_Value, long _Mask);
+long _InterlockedXor_nf(long volatile *_Value, long _Mask);
+long _InterlockedXor_rel(long volatile *_Value, long _Mask);
+__int64 _InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+char _InterlockedExchange8_acq(char volatile *_Target, char _Value);
+char _InterlockedExchange8_nf(char volatile *_Target, char _Value);
+char _InterlockedExchange8_rel(char volatile *_Target, char _Value);
+short _InterlockedExchange16_acq(short volatile *_Target, short _Value);
+short _InterlockedExchange16_nf(short volatile *_Target, short _Value);
+short _InterlockedExchange16_rel(short volatile *_Target, short _Value);
+long _InterlockedExchange_acq(long volatile *_Target, long _Value);
+long _InterlockedExchange_nf(long volatile *_Target, long _Value);
+long _InterlockedExchange_rel(long volatile *_Target, long _Value);
+__int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value);
+__int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value);
+__int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+char _InterlockedCompareExchange8_acq(char volatile *_Destination,
+                                      char _Exchange, char _Comparand);
+char _InterlockedCompareExchange8_nf(char volatile *_Destination,
+                                     char _Exchange, char _Comparand);
+char _InterlockedCompareExchange8_rel(char volatile *_Destination,
+                                      char _Exchange, char _Comparand);
+short _InterlockedCompareExchange16_acq(short volatile *_Destination,
+                                        short _Exchange, short _Comparand);
+short _InterlockedCompareExchange16_nf(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+short _InterlockedCompareExchange16_rel(short volatile *_Destination,
+                                        short _Exchange, short _Comparand);
+long _InterlockedCompareExchange_acq(long volatile *_Destination,
+                                     long _Exchange, long _Comparand);
+long _InterlockedCompareExchange_nf(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+long _InterlockedCompareExchange_rel(long volatile *_Destination,
+                                     long _Exchange, long _Comparand);
+__int64 _InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
+                                          __int64 _Exchange,
+                                          __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+                                          __int64 _Exchange,
+                                          __int64 _Comparand);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __INTRIN0_H */
+#endif /* _MSC_VER */
diff --git a/clang/lib/Headers/keylockerintrin.h b/clang/lib/Headers/keylockerintrin.h
index 1994ac42070ad..f76e91b4d4b30 100644
--- a/clang/lib/Headers/keylockerintrin.h
+++ b/clang/lib/Headers/keylockerintrin.h
@@ -28,8 +28,7 @@
 #ifndef _KEYLOCKERINTRIN_H
 #define _KEYLOCKERINTRIN_H
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__KL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__)
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
@@ -327,11 +326,9 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
-          || defined(__KL__) */
+#endif /* !defined(__SCE__ || __has_feature(modules) || defined(__KL__) */
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__WIDEKL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
@@ -524,7 +521,7 @@ _mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
-          || defined(__WIDEKL__) */
+#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)   \
+        */
 
 #endif /* _KEYLOCKERINTRIN_H */
diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
index a786689d39177..8741968fa7f36 100644
--- a/clang/lib/Headers/module.modulemap
+++ b/clang/lib/Headers/module.modulemap
@@ -155,9 +155,9 @@ module _Builtin_intrinsics [system] [extern_c] {
 
 // Start -fbuiltin-headers-in-system-modules affected modules
 
-// The following modules all ignore their top level headers
-// when -fbuiltin-headers-in-system-modules is passed, and
-// most of those headers join system modules when present.
+// The following modules all ignore their headers when
+// -fbuiltin-headers-in-system-modules is passed, and many of
+// those headers join system modules when present.
 
 // e.g. if -fbuiltin-headers-in-system-modules is passed, then
 // float.h will not be in the _Builtin_float module (that module
@@ -190,11 +190,6 @@ module _Builtin_stdalign [system] {
   export *
 }
 
-// When -fbuiltin-headers-in-system-modules is passed, only
-// the top level headers are removed, the implementation headers
-// will always be in their submodules. That means when stdarg.h
-// is included, it will still import this module and make the
-// appropriate submodules visible.
 module _Builtin_stdarg [system] {
   textual header "stdarg.h"
 
@@ -237,6 +232,8 @@ module _Builtin_stdbool [system] {
 module _Builtin_stddef [system] {
   textual header "stddef.h"
 
+  // __stddef_max_align_t.h is always in this module, even if
+  // -fbuiltin-headers-in-system-modules is passed.
   explicit module max_align_t {
     header "__stddef_max_align_t.h"
     export *
@@ -283,9 +280,10 @@ module _Builtin_stddef [system] {
   }
 }
 
-/* wint_t is provided by <wchar.h> and not <stddef.h>. It's here
- * for compatibility, but must be explicitly requested. Therefore
- * __stddef_wint_t.h is not part of _Builtin_stddef. */
+// wint_t is provided by <wchar.h> and not <stddef.h>. It's here
+// for compatibility, but must be explicitly requested. Therefore
+// __stddef_wint_t.h is not part of _Builtin_stddef. It is always in
+// this module even if -fbuiltin-headers-in-system-modules is passed.
 module _Builtin_stddef_wint_t [system] {
   header "__stddef_wint_t.h"
   export *
@@ -317,3 +315,8 @@ module opencl_c {
   header "opencl-c.h"
   header "opencl-c-base.h"
 }
+
+module ptrauth {
+  header "ptrauth.h"
+  export *
+}
diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h
new file mode 100644
index 0000000000000..56c3c3636c9bc
--- /dev/null
+++ b/clang/lib/Headers/ptrauth.h
@@ -0,0 +1,185 @@
+/*===---- ptrauth.h - Pointer authentication -------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PTRAUTH_H
+#define __PTRAUTH_H
+
+typedef enum {
+  ptrauth_key_asia = 0,
+  ptrauth_key_asib = 1,
+  ptrauth_key_asda = 2,
+  ptrauth_key_asdb = 3,
+} ptrauth_key;
+
+/* An integer type of the appropriate size for a discriminator argument. */
+typedef __UINTPTR_TYPE__ ptrauth_extra_data_t;
+
+/* An integer type of the appropriate size for a generic signature. */
+typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
+
+/* A signed pointer value embeds the original pointer together with
+   a signature that attests to the validity of that pointer.  Because
+   this signature must use only "spare" bits of the pointer, a
+   signature's validity is probabilistic in practice: it is unlikely
+   but still plausible that an invalidly-derived signature will
+   somehow equal the correct signature and therefore successfully
+   authenticate.  Nonetheless, this scheme provides a strong degree
+   of protection against certain kinds of attacks. */
+
+/* Authenticating a pointer that was not signed with the given key
+   and extra-data value will (likely) fail by trapping. */
+
+#if __has_feature(ptrauth_intrinsics)
+
+/* Strip the signature from a value without authenticating it.
+
+   If the value is a function pointer, the result will not be a
+   legal function pointer because of the missing signature, and
+   attempting to call it will result in an authentication failure.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The result will have the same type as the original value. */
+#define ptrauth_strip(__value, __key) __builtin_ptrauth_strip(__value, __key)
+
+/* Blend a constant discriminator into the given pointer-like value
+   to form a new discriminator.  Not all bits of the inputs are
+   guaranteed to contribute to the result.
+
+   On arm64e, the integer must fall within the range of a uint16_t;
+   other bits may be ignored.
+
+   The first argument must be an expression of pointer type.
+   The second argument must be an expression of integer type.
+   The result will have type uintptr_t. */
+#define ptrauth_blend_discriminator(__pointer, __integer)                      \
+  __builtin_ptrauth_blend_discriminator(__pointer, __integer)
+
+/* Add a signature to the given pointer value using a specific key,
+   using the given extra data as a salt to the signing process.
+
+   This operation does not authenticate the original value and is
+   therefore potentially insecure if an attacker could possibly
+   control that value.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value. */
+#define ptrauth_sign_unauthenticated(__value, __key, __data)                   \
+  __builtin_ptrauth_sign_unauthenticated(__value, __key, __data)
+
+/* Authenticate a pointer using one scheme and resign it using another.
+
+   If the result is subsequently authenticated using the new scheme, that
+   authentication is guaranteed to fail if and only if the initial
+   authentication failed.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This operation is guaranteed to not leave the intermediate value
+   available for attack before it is re-signed.
+
+   Do not pass a null pointer to this function. A null pointer
+   will not successfully authenticate.
+
+   This operation traps if the authentication fails. */
+#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key,     \
+                                __new_data)                                    \
+  __builtin_ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, \
+                                    __new_data)
+
+/* Authenticate a data pointer.
+
+   The value must be an expression of non-function pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This operation traps if the authentication fails. */
+#define ptrauth_auth_data(__value, __old_key, __old_data)                      \
+  __builtin_ptrauth_auth(__value, __old_key, __old_data)
+
+/* Compute a signature for the given pair of pointer-sized values.
+   The order of the arguments is significant.
+
+   Like a pointer signature, the resulting signature depends on
+   private key data and therefore should not be reliably reproducible
+   by attackers.  That means that this can be used to validate the
+   integrity of arbitrary data by storing a signature for that data
+   alongside it, then checking that the signature is still valid later.
+   Data which exceeds two pointers in size can be signed by either
+   computing a tree of generic signatures or just signing an ordinary
+   cryptographic hash of the data.
+
+   The result has type ptrauth_generic_signature_t.  However, it may
+   not have as many bits of entropy as that type's width would suggest;
+   some implementations are known to compute a compressed signature as
+   if the arguments were a pointer and a discriminator.
+
+   The arguments must be either pointers or integers; if integers, they
+   will be coerce to uintptr_t. */
+#define ptrauth_sign_generic_data(__value, __data)                             \
+  __builtin_ptrauth_sign_generic_data(__value, __data)
+
+#else
+
+#define ptrauth_strip(__value, __key)                                          \
+  ({                                                                           \
+    (void)__key;                                                               \
+    __value;                                                                   \
+  })
+
+#define ptrauth_blend_discriminator(__pointer, __integer)                      \
+  ({                                                                           \
+    (void)__pointer;                                                           \
+    (void)__integer;                                                           \
+    ((ptrauth_extra_data_t)0);                                                 \
+  })
+
+#define ptrauth_sign_unauthenticated(__value, __key, __data)                   \
+  ({                                                                           \
+    (void)__key;                                                               \
+    (void)__data;                                                              \
+    __value;                                                                   \
+  })
+
+#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key,     \
+                                __new_data)                                    \
+  ({                                                                           \
+    (void)__old_key;                                                           \
+    (void)__old_data;                                                          \
+    (void)__new_key;                                                           \
+    (void)__new_data;                                                          \
+    __value;                                                                   \
+  })
+
+#define ptrauth_auth_data(__value, __old_key, __old_data)                      \
+  ({                                                                           \
+    (void)__old_key;                                                           \
+    (void)__old_data;                                                          \
+    __value;                                                                   \
+  })
+
+#define ptrauth_sign_generic_data(__value, __data)                             \
+  ({                                                                           \
+    (void)__value;                                                             \
+    (void)__data;                                                              \
+    ((ptrauth_generic_signature_t)0);                                          \
+  })
+
+#endif /* __has_feature(ptrauth_intrinsics) */
+
+#endif /* __PTRAUTH_H */
diff --git a/clang/lib/Headers/x86gprintrin.h b/clang/lib/Headers/x86gprintrin.h
index ed141879fbc74..3d5cc606d7e63 100644
--- a/clang/lib/Headers/x86gprintrin.h
+++ b/clang/lib/Headers/x86gprintrin.h
@@ -10,38 +10,31 @@
 #ifndef __X86GPRINTRIN_H
 #define __X86GPRINTRIN_H
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__HRESET__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__HRESET__)
 #include <hresetintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__UINTR__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__UINTR__)
 #include <uintrintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__USERMSR__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__USERMSR__)
 #include <usermsrintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CRC32__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CRC32__)
 #include <crc32intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PRFCHI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHI__)
 #include <prfchiintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RAOINT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RAOINT__)
 #include <raointintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CMPCCXADD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CMPCCXADD__)
 #include <cmpccxaddintrin.h>
 #endif
 
diff --git a/clang/lib/Headers/x86intrin.h b/clang/lib/Headers/x86intrin.h
index 450fd008dab95..c20bfbb8fe46e 100644
--- a/clang/lib/Headers/x86intrin.h
+++ b/clang/lib/Headers/x86intrin.h
@@ -14,53 +14,43 @@
 
 #include <immintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__3dNOW__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__3dNOW__)
 #include <mm3dnow.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PRFCHW__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHW__)
 #include <prfchwintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE4A__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE4A__)
 #include <ammintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FMA4__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA4__)
 #include <fma4intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XOP__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XOP__)
 #include <xopintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__TBM__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__TBM__)
 #include <tbmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__LWP__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__LWP__)
 #include <lwpintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MWAITX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MWAITX__)
 #include <mwaitxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLZERO__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLZERO__)
 #include <clzerointrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDPRU__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPRU__)
 #include <rdpruintrin.h>
 #endif
 
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 8e386a72cde78..b2c68c3b7be97 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -1333,6 +1333,10 @@ _mm_ucomineq_ss(__m128 __a, __m128 __b)
 /// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
@@ -1351,6 +1355,10 @@ _mm_cvtss_si32(__m128 __a)
 /// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
@@ -1371,6 +1379,10 @@ _mm_cvt_ss2si(__m128 __a)
 /// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 64-bit integer.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
@@ -1391,6 +1403,10 @@ _mm_cvtss_si64(__m128 __a)
 /// Converts two low-order float values in a 128-bit vector of
 ///    [4 x float] into a 64-bit vector of [2 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
@@ -1407,6 +1423,10 @@ _mm_cvtps_pi32(__m128 __a)
 /// Converts two low-order float values in a 128-bit vector of
 ///    [4 x float] into a 64-bit vector of [2 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
@@ -1420,9 +1440,12 @@ _mm_cvt_ps2pi(__m128 __a)
   return _mm_cvtps_pi32(__a);
 }
 
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 32-bit integer, truncating the result when it is
-///    inexact.
+/// Converts the lower (first) element of a vector of [4 x float] into a signed
+///    truncated (rounded toward zero) 32-bit integer.
+///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1439,9 +1462,12 @@ _mm_cvttss_si32(__m128 __a)
   return __builtin_ia32_cvttss2si((__v4sf)__a);
 }
 
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 32-bit integer, truncating the result when it is
-///    inexact.
+/// Converts the lower (first) element of a vector of [4 x float] into a signed
+///    truncated (rounded toward zero) 32-bit integer.
+///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1459,9 +1485,12 @@ _mm_cvtt_ss2si(__m128 __a)
 }
 
 #ifdef __x86_64__
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 64-bit integer, truncating the result when it is
-///    inexact.
+/// Converts the lower (first) element of a vector of [4 x float] into a signed
+///    truncated (rounded toward zero) 64-bit integer.
+///
+///    If the converted value does not fit in a 64-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1479,9 +1508,13 @@ _mm_cvttss_si64(__m128 __a)
 }
 #endif
 
-/// Converts two low-order float values in a 128-bit vector of
-///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
-///    when it is inexact.
+/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
+///    into two signed truncated (rounded toward zero) 32-bit integers,
+///    returned in a 64-bit vector of [2 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1497,9 +1530,13 @@ _mm_cvttps_pi32(__m128 __a)
   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
 }
 
-/// Converts two low-order float values in a 128-bit vector of [4 x
-///    float] into a 64-bit vector of [2 x i32], truncating the result when it
-///    is inexact.
+/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
+///    into two signed truncated (rounded toward zero) 64-bit integers,
+///    returned in a 64-bit vector of [2 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/Headers/yvals_core.h b/clang/lib/Headers/yvals_core.h
new file mode 100644
index 0000000000000..5ee194a3e5f5f
--- /dev/null
+++ b/clang/lib/Headers/yvals_core.h
@@ -0,0 +1,25 @@
+//===----- yvals_core.h - Internal MSVC STL core header -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Only include this if we are aiming for MSVC compatibility.
+#ifndef _MSC_VER
+#include_next <yvals_core.h>
+#else
+
+#ifndef __clang_yvals_core_h
+#define __clang_yvals_core_h
+
+#include_next <yvals_core.h>
+
+#ifdef _STL_INTRIN_HEADER
+#undef _STL_INTRIN_HEADER
+#define _STL_INTRIN_HEADER <intrin0.h>
+#endif
+
+#endif
+#endif
diff --git a/clang/lib/InstallAPI/CMakeLists.txt b/clang/lib/InstallAPI/CMakeLists.txt
index dc90d6370de41..894db699578f2 100644
--- a/clang/lib/InstallAPI/CMakeLists.txt
+++ b/clang/lib/InstallAPI/CMakeLists.txt
@@ -1,10 +1,12 @@
 set(LLVM_LINK_COMPONENTS
   Support
   TextAPI
+  Demangle
   Core
   )
 
 add_clang_library(clangInstallAPI
+  DylibVerifier.cpp
   FileList.cpp
   Frontend.cpp
   HeaderFile.cpp
diff --git a/clang/lib/InstallAPI/DylibVerifier.cpp b/clang/lib/InstallAPI/DylibVerifier.cpp
new file mode 100644
index 0000000000000..700763b3fee0d
--- /dev/null
+++ b/clang/lib/InstallAPI/DylibVerifier.cpp
@@ -0,0 +1,528 @@
+#include "clang/InstallAPI/DylibVerifier.h"
+#include "clang/InstallAPI/FrontendRecords.h"
+#include "clang/InstallAPI/InstallAPIDiagnostic.h"
+#include "llvm/Demangle/Demangle.h"
+
+using namespace llvm::MachO;
+
+namespace clang {
+namespace installapi {
+
+/// Metadata stored about a mapping of a declaration to a symbol.
+struct DylibVerifier::SymbolContext {
+  // Name to use for printing in diagnostics.
+  std::string PrettyPrintName{""};
+
+  // Name to use for all querying and verification
+  // purposes.
+  std::string SymbolName{""};
+
+  // Kind to map symbol type against record.
+  EncodeKind Kind = EncodeKind::GlobalSymbol;
+
+  // Frontend Attributes tied to the AST.
+  const FrontendAttrs *FA = nullptr;
+
+  // The ObjCInterface symbol type, if applicable.
+  ObjCIFSymbolKind ObjCIFKind = ObjCIFSymbolKind::None;
+
+  // Whether Decl is inlined.
+  bool Inlined = false;
+};
+
+static std::string
+getAnnotatedName(const Record *R, EncodeKind Kind, StringRef Name,
+                 bool ValidSourceLoc = true,
+                 ObjCIFSymbolKind ObjCIF = ObjCIFSymbolKind::None) {
+  assert(!Name.empty() && "Need symbol name for printing");
+
+  std::string Annotation;
+  if (R->isWeakDefined())
+    Annotation += "(weak-def) ";
+  if (R->isWeakReferenced())
+    Annotation += "(weak-ref) ";
+  if (R->isThreadLocalValue())
+    Annotation += "(tlv) ";
+
+  // Check if symbol represents only part of a @interface declaration.
+  const bool IsAnnotatedObjCClass = ((ObjCIF != ObjCIFSymbolKind::None) &&
+                                     (ObjCIF <= ObjCIFSymbolKind::EHType));
+
+  if (IsAnnotatedObjCClass) {
+    if (ObjCIF == ObjCIFSymbolKind::EHType)
+      Annotation += "Exception Type of ";
+    if (ObjCIF == ObjCIFSymbolKind::MetaClass)
+      Annotation += "Metaclass of ";
+    if (ObjCIF == ObjCIFSymbolKind::Class)
+      Annotation += "Class of ";
+  }
+
+  // Only print symbol type prefix or leading "_" if there is no source location
+  // tied to it. This can only ever happen when the location has to come from
+  // debug info.
+  if (ValidSourceLoc) {
+    if ((Kind == EncodeKind::GlobalSymbol) && Name.starts_with("_"))
+      return Annotation + Name.drop_front(1).str();
+    return Annotation + Name.str();
+  }
+
+  if (IsAnnotatedObjCClass)
+    return Annotation + Name.str();
+
+  switch (Kind) {
+  case EncodeKind::GlobalSymbol:
+    return Annotation + Name.str();
+  case EncodeKind::ObjectiveCInstanceVariable:
+    return Annotation + "(ObjC IVar) " + Name.str();
+  case EncodeKind::ObjectiveCClass:
+    return Annotation + "(ObjC Class) " + Name.str();
+  case EncodeKind::ObjectiveCClassEHType:
+    return Annotation + "(ObjC Class EH) " + Name.str();
+  }
+
+  llvm_unreachable("unexpected case for EncodeKind");
+}
+
+static std::string demangle(StringRef Name) {
+  // InstallAPI currently only supports itanium manglings.
+  if (!(Name.starts_with("_Z") || Name.starts_with("__Z") ||
+        Name.starts_with("___Z")))
+    return Name.str();
+  char *Result = llvm::itaniumDemangle(Name);
+  if (!Result)
+    return Name.str();
+
+  std::string Demangled(Result);
+  free(Result);
+  return Demangled;
+}
+
+static DylibVerifier::Result updateResult(const DylibVerifier::Result Prev,
+                                          const DylibVerifier::Result Curr) {
+  if (Prev == Curr)
+    return Prev;
+
+  // Never update from invalid or noverify state.
+  if ((Prev == DylibVerifier::Result::Invalid) ||
+      (Prev == DylibVerifier::Result::NoVerify))
+    return Prev;
+
+  // Don't let an ignored verification remove a valid one.
+  if (Prev == DylibVerifier::Result::Valid &&
+      Curr == DylibVerifier::Result::Ignore)
+    return Prev;
+
+  return Curr;
+}
+// __private_extern__ is a deprecated specifier that clang does not
+// respect in all contexts, it should just be considered hidden for InstallAPI.
+static bool shouldIgnorePrivateExternAttr(const Decl *D) {
+  if (const FunctionDecl *FD = cast<FunctionDecl>(D))
+    return FD->getStorageClass() == StorageClass::SC_PrivateExtern;
+  if (const VarDecl *VD = cast<VarDecl>(D))
+    return VD->getStorageClass() == StorageClass::SC_PrivateExtern;
+
+  return false;
+}
+
+Record *findRecordFromSlice(const RecordsSlice *Slice, StringRef Name,
+                            EncodeKind Kind) {
+  switch (Kind) {
+  case EncodeKind::GlobalSymbol:
+    return Slice->findGlobal(Name);
+  case EncodeKind::ObjectiveCInstanceVariable:
+    return Slice->findObjCIVar(Name.contains('.'), Name);
+  case EncodeKind::ObjectiveCClass:
+  case EncodeKind::ObjectiveCClassEHType:
+    return Slice->findObjCInterface(Name);
+  }
+  llvm_unreachable("unexpected end when finding record");
+}
+
+void DylibVerifier::updateState(Result State) {
+  Ctx.FrontendState = updateResult(Ctx.FrontendState, State);
+}
+
+void DylibVerifier::addSymbol(const Record *R, SymbolContext &SymCtx,
+                              TargetList &&Targets) {
+  if (Targets.empty())
+    Targets = {Ctx.Target};
+
+  Exports->addGlobal(SymCtx.Kind, SymCtx.SymbolName, R->getFlags(), Targets);
+}
+
+bool DylibVerifier::shouldIgnoreObsolete(const Record *R, SymbolContext &SymCtx,
+                                         const Record *DR) {
+  return SymCtx.FA->Avail.isObsoleted();
+}
+
+bool DylibVerifier::compareObjCInterfaceSymbols(const Record *R,
+                                                SymbolContext &SymCtx,
+                                                const ObjCInterfaceRecord *DR) {
+  const bool IsDeclVersionComplete =
+      ((SymCtx.ObjCIFKind & ObjCIFSymbolKind::Class) ==
+       ObjCIFSymbolKind::Class) &&
+      ((SymCtx.ObjCIFKind & ObjCIFSymbolKind::MetaClass) ==
+       ObjCIFSymbolKind::MetaClass);
+
+  const bool IsDylibVersionComplete = DR->isCompleteInterface();
+
+  // The common case, a complete ObjCInterface.
+  if (IsDeclVersionComplete && IsDylibVersionComplete)
+    return true;
+
+  auto PrintDiagnostic = [&](auto SymLinkage, const Record *Record,
+                             StringRef SymName, bool PrintAsWarning = false) {
+    if (SymLinkage == RecordLinkage::Unknown)
+      Ctx.emitDiag([&]() {
+        Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                         PrintAsWarning ? diag::warn_library_missing_symbol
+                                        : diag::err_library_missing_symbol)
+            << SymName;
+      });
+    else
+      Ctx.emitDiag([&]() {
+        Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                         PrintAsWarning ? diag::warn_library_hidden_symbol
+                                        : diag::err_library_hidden_symbol)
+            << SymName;
+      });
+  };
+
+  if (IsDeclVersionComplete) {
+    // The decl represents a complete ObjCInterface, but the symbols in the
+    // dylib do not. Determine which symbol is missing. To keep older projects
+    // building, treat this as a warning.
+    if (!DR->isExportedSymbol(ObjCIFSymbolKind::Class))
+      PrintDiagnostic(DR->getLinkageForSymbol(ObjCIFSymbolKind::Class), R,
+                      getAnnotatedName(R, SymCtx.Kind, SymCtx.PrettyPrintName,
+                                       /*ValidSourceLoc=*/true,
+                                       ObjCIFSymbolKind::Class),
+                      /*PrintAsWarning=*/true);
+
+    if (!DR->isExportedSymbol(ObjCIFSymbolKind::MetaClass))
+      PrintDiagnostic(DR->getLinkageForSymbol(ObjCIFSymbolKind::MetaClass), R,
+                      getAnnotatedName(R, SymCtx.Kind, SymCtx.PrettyPrintName,
+                                       /*ValidSourceLoc=*/true,
+                                       ObjCIFSymbolKind::MetaClass),
+                      /*PrintAsWarning=*/true);
+    return true;
+  }
+
+  if (DR->isExportedSymbol(SymCtx.ObjCIFKind)) {
+    if (!IsDylibVersionComplete) {
+      // Both the declaration and dylib have a non-complete interface.
+      SymCtx.Kind = EncodeKind::GlobalSymbol;
+      SymCtx.SymbolName = R->getName();
+    }
+    return true;
+  }
+
+  // At this point that means there was not a matching class symbol
+  // to represent the one discovered as a declaration.
+  PrintDiagnostic(DR->getLinkageForSymbol(SymCtx.ObjCIFKind), R,
+                  SymCtx.PrettyPrintName);
+  return false;
+}
+
+DylibVerifier::Result DylibVerifier::compareVisibility(const Record *R,
+                                                       SymbolContext &SymCtx,
+                                                       const Record *DR) {
+
+  if (R->isExported()) {
+    if (!DR) {
+      Ctx.emitDiag([&]() {
+        Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                         diag::err_library_missing_symbol)
+            << SymCtx.PrettyPrintName;
+      });
+      return Result::Invalid;
+    }
+    if (DR->isInternal()) {
+      Ctx.emitDiag([&]() {
+        Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                         diag::err_library_hidden_symbol)
+            << SymCtx.PrettyPrintName;
+      });
+      return Result::Invalid;
+    }
+  }
+
+  // Emit a diagnostic for hidden declarations with external symbols, except
+  // when theres an inlined attribute.
+  if ((R->isInternal() && !SymCtx.Inlined) && DR && DR->isExported()) {
+
+    if (Mode == VerificationMode::ErrorsOnly)
+      return Result::Ignore;
+
+    if (shouldIgnorePrivateExternAttr(SymCtx.FA->D))
+      return Result::Ignore;
+
+    unsigned ID;
+    Result Outcome;
+    if (Mode == VerificationMode::ErrorsAndWarnings) {
+      ID = diag::warn_header_hidden_symbol;
+      Outcome = Result::Ignore;
+    } else {
+      ID = diag::err_header_hidden_symbol;
+      Outcome = Result::Invalid;
+    }
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(), ID)
+          << SymCtx.PrettyPrintName;
+    });
+    return Outcome;
+  }
+
+  if (R->isInternal())
+    return Result::Ignore;
+
+  return Result::Valid;
+}
+
+DylibVerifier::Result DylibVerifier::compareAvailability(const Record *R,
+                                                         SymbolContext &SymCtx,
+                                                         const Record *DR) {
+  if (!SymCtx.FA->Avail.isUnavailable())
+    return Result::Valid;
+
+  const bool IsDeclAvailable = SymCtx.FA->Avail.isUnavailable();
+
+  switch (Mode) {
+  case VerificationMode::ErrorsAndWarnings:
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::warn_header_availability_mismatch)
+          << SymCtx.PrettyPrintName << IsDeclAvailable << IsDeclAvailable;
+    });
+    return Result::Ignore;
+  case VerificationMode::Pedantic:
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::err_header_availability_mismatch)
+          << SymCtx.PrettyPrintName << IsDeclAvailable << IsDeclAvailable;
+    });
+    return Result::Invalid;
+  case VerificationMode::ErrorsOnly:
+    return Result::Ignore;
+  case VerificationMode::Invalid:
+    llvm_unreachable("Unexpected verification mode symbol verification");
+  }
+  llvm_unreachable("Unexpected verification mode symbol verification");
+}
+
+bool DylibVerifier::compareSymbolFlags(const Record *R, SymbolContext &SymCtx,
+                                       const Record *DR) {
+  std::string DisplayName =
+      Demangle ? demangle(DR->getName()) : DR->getName().str();
+
+  if (DR->isThreadLocalValue() && !R->isThreadLocalValue()) {
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::err_dylib_symbol_flags_mismatch)
+          << getAnnotatedName(DR, SymCtx.Kind, DisplayName)
+          << DR->isThreadLocalValue();
+    });
+    return false;
+  }
+  if (!DR->isThreadLocalValue() && R->isThreadLocalValue()) {
+    Ctx.emitDiag([&]() {
+      SymCtx.FA->D->getLocation(),
+          Ctx.Diag->Report(diag::err_header_symbol_flags_mismatch)
+              << SymCtx.PrettyPrintName << R->isThreadLocalValue();
+    });
+    return false;
+  }
+
+  if (DR->isWeakDefined() && !R->isWeakDefined()) {
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::err_dylib_symbol_flags_mismatch)
+          << getAnnotatedName(DR, SymCtx.Kind, DisplayName)
+          << R->isWeakDefined();
+    });
+    return false;
+  }
+  if (!DR->isWeakDefined() && R->isWeakDefined()) {
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::err_header_symbol_flags_mismatch)
+          << SymCtx.PrettyPrintName << R->isWeakDefined();
+    });
+    return false;
+  }
+
+  return true;
+}
+
+DylibVerifier::Result DylibVerifier::verifyImpl(Record *R,
+                                                SymbolContext &SymCtx) {
+  R->setVerify();
+  if (!canVerify()) {
+    // Accumulate symbols when not in verifying against dylib.
+    if (R->isExported() && !SymCtx.FA->Avail.isUnavailable() &&
+        !SymCtx.FA->Avail.isObsoleted()) {
+      addSymbol(R, SymCtx);
+    }
+    return Ctx.FrontendState;
+  }
+
+  Record *DR =
+      findRecordFromSlice(Ctx.DylibSlice, SymCtx.SymbolName, SymCtx.Kind);
+  if (DR)
+    DR->setVerify();
+
+  if (shouldIgnoreObsolete(R, SymCtx, DR)) {
+    updateState(Result::Ignore);
+    return Ctx.FrontendState;
+  }
+
+  // Unavailable declarations don't need matching symbols.
+  if (SymCtx.FA->Avail.isUnavailable() && (!DR || DR->isInternal())) {
+    updateState(Result::Valid);
+    return Ctx.FrontendState;
+  }
+
+  Result VisibilityCheck = compareVisibility(R, SymCtx, DR);
+  if (VisibilityCheck != Result::Valid) {
+    updateState(VisibilityCheck);
+    return Ctx.FrontendState;
+  }
+
+  // All missing symbol cases to diagnose have been handled now.
+  if (!DR) {
+    updateState(Result::Ignore);
+    return Ctx.FrontendState;
+  }
+
+  // Check for mismatching ObjC interfaces.
+  if (SymCtx.ObjCIFKind != ObjCIFSymbolKind::None) {
+    if (!compareObjCInterfaceSymbols(
+            R, SymCtx, Ctx.DylibSlice->findObjCInterface(DR->getName()))) {
+      updateState(Result::Invalid);
+      return Ctx.FrontendState;
+    }
+  }
+
+  Result AvailabilityCheck = compareAvailability(R, SymCtx, DR);
+  if (AvailabilityCheck != Result::Valid) {
+    updateState(AvailabilityCheck);
+    return Ctx.FrontendState;
+  }
+
+  if (!compareSymbolFlags(R, SymCtx, DR)) {
+    updateState(Result::Invalid);
+    return Ctx.FrontendState;
+  }
+
+  addSymbol(R, SymCtx);
+  updateState(Result::Valid);
+  return Ctx.FrontendState;
+}
+
+bool DylibVerifier::canVerify() {
+  return Ctx.FrontendState != Result::NoVerify;
+}
+
+void DylibVerifier::assignSlice(const Target &T) {
+  assert(T == Ctx.Target && "Active targets should match.");
+  if (Dylib.empty())
+    return;
+
+  // Note: there are no reexport slices with binaries, as opposed to TBD files,
+  // so it can be assumed that the target match is the active top-level library.
+  auto It = find_if(
+      Dylib, [&T](const auto &Slice) { return T == Slice->getTarget(); });
+
+  assert(It != Dylib.end() && "Target slice should always exist.");
+  Ctx.DylibSlice = It->get();
+}
+
+void DylibVerifier::setTarget(const Target &T) {
+  Ctx.Target = T;
+  Ctx.DiscoveredFirstError = false;
+  if (Dylib.empty()) {
+    updateState(Result::NoVerify);
+    return;
+  }
+  updateState(Result::Ignore);
+  assignSlice(T);
+}
+
+DylibVerifier::Result DylibVerifier::verify(ObjCIVarRecord *R,
+                                            const FrontendAttrs *FA,
+                                            const StringRef SuperClass) {
+  if (R->isVerified())
+    return getState();
+
+  std::string FullName =
+      ObjCIVarRecord::createScopedName(SuperClass, R->getName());
+  SymbolContext SymCtx{
+      getAnnotatedName(R, EncodeKind::ObjectiveCInstanceVariable,
+                       Demangle ? demangle(FullName) : FullName),
+      FullName, EncodeKind::ObjectiveCInstanceVariable, FA};
+  return verifyImpl(R, SymCtx);
+}
+
+static ObjCIFSymbolKind assignObjCIFSymbolKind(const ObjCInterfaceRecord *R) {
+  ObjCIFSymbolKind Result = ObjCIFSymbolKind::None;
+  if (R->getLinkageForSymbol(ObjCIFSymbolKind::Class) != RecordLinkage::Unknown)
+    Result |= ObjCIFSymbolKind::Class;
+  if (R->getLinkageForSymbol(ObjCIFSymbolKind::MetaClass) !=
+      RecordLinkage::Unknown)
+    Result |= ObjCIFSymbolKind::MetaClass;
+  if (R->getLinkageForSymbol(ObjCIFSymbolKind::EHType) !=
+      RecordLinkage::Unknown)
+    Result |= ObjCIFSymbolKind::EHType;
+  return Result;
+}
+
+DylibVerifier::Result DylibVerifier::verify(ObjCInterfaceRecord *R,
+                                            const FrontendAttrs *FA) {
+  if (R->isVerified())
+    return getState();
+  SymbolContext SymCtx;
+  SymCtx.SymbolName = R->getName();
+  SymCtx.ObjCIFKind = assignObjCIFSymbolKind(R);
+
+  std::string DisplayName =
+      Demangle ? demangle(SymCtx.SymbolName) : SymCtx.SymbolName;
+  SymCtx.Kind = R->hasExceptionAttribute() ? EncodeKind::ObjectiveCClassEHType
+                                           : EncodeKind::ObjectiveCClass;
+  SymCtx.PrettyPrintName = getAnnotatedName(R, SymCtx.Kind, DisplayName);
+  SymCtx.FA = FA;
+
+  return verifyImpl(R, SymCtx);
+}
+
+DylibVerifier::Result DylibVerifier::verify(GlobalRecord *R,
+                                            const FrontendAttrs *FA) {
+  if (R->isVerified())
+    return getState();
+
+  // Global classifications could be obfusciated with `asm`.
+  SimpleSymbol Sym = parseSymbol(R->getName());
+  SymbolContext SymCtx;
+  SymCtx.SymbolName = Sym.Name;
+  SymCtx.PrettyPrintName =
+      getAnnotatedName(R, Sym.Kind, Demangle ? demangle(Sym.Name) : Sym.Name);
+  SymCtx.Kind = Sym.Kind;
+  SymCtx.FA = FA;
+  SymCtx.Inlined = R->isInlined();
+  return verifyImpl(R, SymCtx);
+}
+
+void DylibVerifier::VerifierContext::emitDiag(
+    llvm::function_ref<void()> Report) {
+  if (!DiscoveredFirstError) {
+    Diag->Report(diag::warn_target)
+        << (PrintArch ? getArchitectureName(Target.Arch)
+                      : getTargetTripleName(Target));
+    DiscoveredFirstError = true;
+  }
+
+  Report();
+}
+
+} // namespace installapi
+} // namespace clang
diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp
index 707aeb17dc890..12cd5fcbc22bf 100644
--- a/clang/lib/InstallAPI/Frontend.cpp
+++ b/clang/lib/InstallAPI/Frontend.cpp
@@ -16,41 +16,47 @@ using namespace llvm;
 using namespace llvm::MachO;
 
 namespace clang::installapi {
-
-GlobalRecord *FrontendRecordsSlice::addGlobal(
+std::pair<GlobalRecord *, FrontendAttrs *> FrontendRecordsSlice::addGlobal(
     StringRef Name, RecordLinkage Linkage, GlobalRecord::Kind GV,
     const clang::AvailabilityInfo Avail, const Decl *D, const HeaderType Access,
     SymbolFlags Flags, bool Inlined) {
 
-  auto *GR =
+  GlobalRecord *GR =
       llvm::MachO::RecordsSlice::addGlobal(Name, Linkage, GV, Flags, Inlined);
-  FrontendRecords.insert({GR, FrontendAttrs{Avail, D, Access}});
-  return GR;
+  auto Result = FrontendRecords.insert({GR, FrontendAttrs{Avail, D, Access}});
+  return {GR, &(Result.first->second)};
 }
 
-ObjCInterfaceRecord *FrontendRecordsSlice::addObjCInterface(
-    StringRef Name, RecordLinkage Linkage, const clang::AvailabilityInfo Avail,
-    const Decl *D, HeaderType Access, bool IsEHType) {
+std::pair<ObjCInterfaceRecord *, FrontendAttrs *>
+FrontendRecordsSlice::addObjCInterface(StringRef Name, RecordLinkage Linkage,
+                                       const clang::AvailabilityInfo Avail,
+                                       const Decl *D, HeaderType Access,
+                                       bool IsEHType) {
   ObjCIFSymbolKind SymType =
       ObjCIFSymbolKind::Class | ObjCIFSymbolKind::MetaClass;
   if (IsEHType)
     SymType |= ObjCIFSymbolKind::EHType;
-  auto *ObjCR =
+
+  ObjCInterfaceRecord *ObjCR =
       llvm::MachO::RecordsSlice::addObjCInterface(Name, Linkage, SymType);
-  FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
-  return ObjCR;
+  auto Result =
+      FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
+  return {ObjCR, &(Result.first->second)};
 }
 
-ObjCCategoryRecord *FrontendRecordsSlice::addObjCCategory(
-    StringRef ClassToExtend, StringRef CategoryName,
-    const clang::AvailabilityInfo Avail, const Decl *D, HeaderType Access) {
-  auto *ObjCR =
+std::pair<ObjCCategoryRecord *, FrontendAttrs *>
+FrontendRecordsSlice::addObjCCategory(StringRef ClassToExtend,
+                                      StringRef CategoryName,
+                                      const clang::AvailabilityInfo Avail,
+                                      const Decl *D, HeaderType Access) {
+  ObjCCategoryRecord *ObjCR =
       llvm::MachO::RecordsSlice::addObjCCategory(ClassToExtend, CategoryName);
-  FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
-  return ObjCR;
+  auto Result =
+      FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
+  return {ObjCR, &(Result.first->second)};
 }
 
-ObjCIVarRecord *FrontendRecordsSlice::addObjCIVar(
+std::pair<ObjCIVarRecord *, FrontendAttrs *> FrontendRecordsSlice::addObjCIVar(
     ObjCContainerRecord *Container, StringRef IvarName, RecordLinkage Linkage,
     const clang::AvailabilityInfo Avail, const Decl *D, HeaderType Access,
     const clang::ObjCIvarDecl::AccessControl AC) {
@@ -59,11 +65,12 @@ ObjCIVarRecord *FrontendRecordsSlice::addObjCIVar(
   if ((Linkage == RecordLinkage::Exported) &&
       ((AC == ObjCIvarDecl::Private) || (AC == ObjCIvarDecl::Package)))
     Linkage = RecordLinkage::Internal;
-  auto *ObjCR =
+  ObjCIVarRecord *ObjCR =
       llvm::MachO::RecordsSlice::addObjCIVar(Container, IvarName, Linkage);
-  FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
+  auto Result =
+      FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
 
-  return nullptr;
+  return {ObjCR, &(Result.first->second)};
 }
 
 std::optional<HeaderType>
diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp
index b4ed5974a0570..452c8f2fb1e48 100644
--- a/clang/lib/InstallAPI/Visitor.cpp
+++ b/clang/lib/InstallAPI/Visitor.cpp
@@ -11,6 +11,7 @@
 #include "clang/AST/ParentMapContext.h"
 #include "clang/AST/VTableBuilder.h"
 #include "clang/Basic/Linkage.h"
+#include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/FrontendRecords.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
@@ -156,7 +157,9 @@ void InstallAPIVisitor::recordObjCInstanceVariables(
     StringRef Name = IV->getName();
     const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(IV);
     auto AC = IV->getCanonicalAccessControl();
-    Ctx.Slice->addObjCIVar(Record, Name, Linkage, Avail, IV, *Access, AC);
+    auto [ObjCIVR, FA] =
+        Ctx.Slice->addObjCIVar(Record, Name, Linkage, Avail, IV, *Access, AC);
+    Ctx.Verifier->verify(ObjCIVR, FA, SuperClass);
   }
 }
 
@@ -178,15 +181,16 @@ bool InstallAPIVisitor::VisitObjCInterfaceDecl(const ObjCInterfaceDecl *D) {
       (!D->getASTContext().getLangOpts().ObjCRuntime.isFragile() &&
        hasObjCExceptionAttribute(D));
 
-  ObjCInterfaceRecord *Class =
+  auto [Class, FA] =
       Ctx.Slice->addObjCInterface(Name, Linkage, Avail, D, *Access, IsEHType);
+  Ctx.Verifier->verify(Class, FA);
 
   // Get base class.
   StringRef SuperClassName;
   if (const auto *SuperClass = D->getSuperClass())
     SuperClassName = SuperClass->getObjCRuntimeNameAsString();
 
-  recordObjCInstanceVariables(D->getASTContext(), Class, SuperClassName,
+  recordObjCInstanceVariables(D->getASTContext(), Class, Class->getName(),
                               D->ivars());
   return true;
 }
@@ -201,8 +205,8 @@ bool InstallAPIVisitor::VisitObjCCategoryDecl(const ObjCCategoryDecl *D) {
   const ObjCInterfaceDecl *InterfaceD = D->getClassInterface();
   const StringRef InterfaceName = InterfaceD->getName();
 
-  ObjCCategoryRecord *Category = Ctx.Slice->addObjCCategory(
-      InterfaceName, CategoryName, Avail, D, *Access);
+  auto [Category, FA] = Ctx.Slice->addObjCCategory(InterfaceName, CategoryName,
+                                                   Avail, D, *Access);
   recordObjCInstanceVariables(D->getASTContext(), Category, InterfaceName,
                               D->ivars());
   return true;
@@ -236,8 +240,10 @@ bool InstallAPIVisitor::VisitVarDecl(const VarDecl *D) {
   const bool WeakDef = D->hasAttr<WeakAttr>();
   const bool ThreadLocal = D->getTLSKind() != VarDecl::TLS_None;
   const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(D);
-  Ctx.Slice->addGlobal(getMangledName(D), Linkage, GlobalRecord::Kind::Variable,
-                       Avail, D, *Access, getFlags(WeakDef, ThreadLocal));
+  auto [GR, FA] = Ctx.Slice->addGlobal(getMangledName(D), Linkage,
+                                       GlobalRecord::Kind::Variable, Avail, D,
+                                       *Access, getFlags(WeakDef, ThreadLocal));
+  Ctx.Verifier->verify(GR, FA);
   return true;
 }
 
@@ -287,8 +293,10 @@ bool InstallAPIVisitor::VisitFunctionDecl(const FunctionDecl *D) {
   const RecordLinkage Linkage = (Inlined || !isExported(D))
                                     ? RecordLinkage::Internal
                                     : RecordLinkage::Exported;
-  Ctx.Slice->addGlobal(Name, Linkage, GlobalRecord::Kind::Function, Avail, D,
-                       *Access, getFlags(WeakDef), Inlined);
+  auto [GR, FA] =
+      Ctx.Slice->addGlobal(Name, Linkage, GlobalRecord::Kind::Function, Avail,
+                           D, *Access, getFlags(WeakDef), Inlined);
+  Ctx.Verifier->verify(GR, FA);
   return true;
 }
 
@@ -478,9 +486,10 @@ void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D,
         VTableLinkage == CXXLinkage::WeakODRLinkage) {
       const std::string Name = getMangledCXXVTableName(D);
       const bool WeakDef = VTableLinkage == CXXLinkage::WeakODRLinkage;
-      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                           GlobalRecord::Kind::Variable, Avail, D, Access,
-                           getFlags(WeakDef));
+      auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                           GlobalRecord::Kind::Variable, Avail,
+                                           D, Access, getFlags(WeakDef));
+      Ctx.Verifier->verify(GR, FA);
       if (!D->getDescribedClassTemplate() && !D->isInvalidDecl()) {
         VTableContextBase *VTable = D->getASTContext().getVTableContext();
         auto AddThunk = [&](GlobalDecl GD) {
@@ -491,9 +500,10 @@ void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D,
 
           for (const auto &Thunk : *Thunks) {
             const std::string Name = getMangledCXXThunk(GD, Thunk);
-            Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                                 GlobalRecord::Kind::Function, Avail,
-                                 GD.getDecl(), Access);
+            auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                                 GlobalRecord::Kind::Function,
+                                                 Avail, GD.getDecl(), Access);
+            Ctx.Verifier->verify(GR, FA);
           }
         };
 
@@ -519,12 +529,16 @@ void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D,
 
   if (hasRTTI(D)) {
     std::string Name = getMangledCXXRTTI(D);
-    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                         GlobalRecord::Kind::Variable, Avail, D, Access);
+    auto [GR, FA] =
+        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                             GlobalRecord::Kind::Variable, Avail, D, Access);
+    Ctx.Verifier->verify(GR, FA);
 
     Name = getMangledCXXRTTIName(D);
-    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                         GlobalRecord::Kind::Variable, Avail, D, Access);
+    auto [NamedGR, NamedFA] =
+        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                             GlobalRecord::Kind::Variable, Avail, D, Access);
+    Ctx.Verifier->verify(NamedGR, NamedFA);
   }
 
   for (const auto &It : D->bases()) {
@@ -615,15 +629,17 @@ bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
         continue;
 
       std::string Name = getMangledCtorDtor(M, Ctor_Base);
-      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                           GlobalRecord::Kind::Function, Avail, D, *Access,
-                           getFlags(WeakDef));
+      auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                           GlobalRecord::Kind::Function, Avail,
+                                           D, *Access, getFlags(WeakDef));
+      Ctx.Verifier->verify(GR, FA);
 
       if (!D->isAbstract()) {
         std::string Name = getMangledCtorDtor(M, Ctor_Complete);
-        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                             GlobalRecord::Kind::Function, Avail, D, *Access,
-                             getFlags(WeakDef));
+        auto [GR, FA] = Ctx.Slice->addGlobal(
+            Name, RecordLinkage::Exported, GlobalRecord::Kind::Function, Avail,
+            D, *Access, getFlags(WeakDef));
+        Ctx.Verifier->verify(GR, FA);
       }
 
       continue;
@@ -635,20 +651,23 @@ bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
         continue;
 
       std::string Name = getMangledCtorDtor(M, Dtor_Base);
-      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                           GlobalRecord::Kind::Function, Avail, D, *Access,
-                           getFlags(WeakDef));
+      auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                           GlobalRecord::Kind::Function, Avail,
+                                           D, *Access, getFlags(WeakDef));
+      Ctx.Verifier->verify(GR, FA);
 
       Name = getMangledCtorDtor(M, Dtor_Complete);
-      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                           GlobalRecord::Kind::Function, Avail, D, *Access,
-                           getFlags(WeakDef));
+      auto [CompleteGR, CompleteFA] = Ctx.Slice->addGlobal(
+          Name, RecordLinkage::Exported, GlobalRecord::Kind::Function, Avail, D,
+          *Access, getFlags(WeakDef));
+      Ctx.Verifier->verify(CompleteGR, CompleteFA);
 
       if (Dtor->isVirtual()) {
         Name = getMangledCtorDtor(M, Dtor_Deleting);
-        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                             GlobalRecord::Kind::Function, Avail, D, *Access,
-                             getFlags(WeakDef));
+        auto [VirtualGR, VirtualFA] = Ctx.Slice->addGlobal(
+            Name, RecordLinkage::Exported, GlobalRecord::Kind::Function, Avail,
+            D, *Access, getFlags(WeakDef));
+        Ctx.Verifier->verify(VirtualGR, VirtualFA);
       }
 
       continue;
@@ -661,9 +680,10 @@ bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
       continue;
 
     std::string Name = getMangledName(M);
-    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                         GlobalRecord::Kind::Function, Avail, D, *Access,
-                         getFlags(WeakDef));
+    auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                         GlobalRecord::Kind::Function, Avail, M,
+                                         *Access, getFlags(WeakDef));
+    Ctx.Verifier->verify(GR, FA);
   }
 
   if (auto *Templ = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
@@ -694,9 +714,10 @@ bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
     const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(Var);
     const bool WeakDef = Var->hasAttr<WeakAttr>() || KeepInlineAsWeak;
 
-    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                         GlobalRecord::Kind::Variable, Avail, D, *Access,
-                         getFlags(WeakDef));
+    auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                         GlobalRecord::Kind::Variable, Avail, D,
+                                         *Access, getFlags(WeakDef));
+    Ctx.Verifier->verify(GR, FA);
   }
 
   return true;
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index afb2948f05ae5..10c475f617d48 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -2498,9 +2498,12 @@ void ModuleMapParser::parseHeaderDecl(MMToken::TokenKind LeadingToken,
   }
 
   bool NeedsFramework = false;
-  // Don't add the top level headers to the builtin modules if the builtin headers
-  // belong to the system modules.
-  if (!Map.LangOpts.BuiltinHeadersInSystemModules || ActiveModule->isSubModule() || !isBuiltInModuleName(ActiveModule->Name))
+  // Don't add headers to the builtin modules if the builtin headers belong to
+  // the system modules, with the exception of __stddef_max_align_t.h which
+  // always had its own module.
+  if (!Map.LangOpts.BuiltinHeadersInSystemModules ||
+      !isBuiltInModuleName(ActiveModule->getTopLevelModuleName()) ||
+      ActiveModule->fullModuleNameIs({"_Builtin_stddef", "max_align_t"}))
     Map.addUnresolvedHeader(ActiveModule, std::move(Header), NeedsFramework);
 
   if (NeedsFramework)
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 3576ebe37810b..1fe6ddf16d62b 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -629,6 +629,10 @@ void Parser::ParseGNUAttributeArgs(
     ParseAttributeWithTypeArg(*AttrName, AttrNameLoc, Attrs, ScopeName,
                               ScopeLoc, Form);
     return;
+  } else if (AttrKind == ParsedAttr::AT_CountedBy) {
+    ParseBoundsAttribute(*AttrName, AttrNameLoc, Attrs, ScopeName, ScopeLoc,
+                         Form);
+    return;
   }
 
   // These may refer to the function arguments, but need to be parsed early to
@@ -3251,6 +3255,54 @@ void Parser::ParseAlignmentSpecifier(ParsedAttributes &Attrs,
   }
 }
 
+/// Bounds attributes (e.g., counted_by):
+///   AttrName '(' expression ')'
+void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
+                                  SourceLocation AttrNameLoc,
+                                  ParsedAttributes &Attrs,
+                                  IdentifierInfo *ScopeName,
+                                  SourceLocation ScopeLoc,
+                                  ParsedAttr::Form Form) {
+  assert(Tok.is(tok::l_paren) && "Attribute arg list not starting with '('");
+
+  BalancedDelimiterTracker Parens(*this, tok::l_paren);
+  Parens.consumeOpen();
+
+  if (Tok.is(tok::r_paren)) {
+    Diag(Tok.getLocation(), diag::err_argument_required_after_attribute);
+    Parens.consumeClose();
+    return;
+  }
+
+  ArgsVector ArgExprs;
+  // Don't evaluate argument when the attribute is ignored.
+  using ExpressionKind =
+      Sema::ExpressionEvaluationContextRecord::ExpressionKind;
+  EnterExpressionEvaluationContext EC(
+      Actions, Sema::ExpressionEvaluationContext::PotentiallyEvaluated, nullptr,
+      ExpressionKind::EK_BoundsAttrArgument);
+
+  ExprResult ArgExpr(
+      Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression()));
+
+  if (ArgExpr.isInvalid()) {
+    Parens.skipToEnd();
+    return;
+  }
+
+  ArgExprs.push_back(ArgExpr.get());
+  Parens.consumeClose();
+
+  ASTContext &Ctx = Actions.getASTContext();
+
+  ArgExprs.push_back(IntegerLiteral::Create(
+      Ctx, llvm::APInt(Ctx.getTypeSize(Ctx.getSizeType()), 0),
+      Ctx.getSizeType(), SourceLocation()));
+
+  Attrs.addNew(&AttrName, SourceRange(AttrNameLoc, Parens.getCloseLocation()),
+               ScopeName, ScopeLoc, ArgExprs.data(), ArgExprs.size(), Form);
+}
+
 ExprResult Parser::ParseExtIntegerArgument() {
   assert(Tok.isOneOf(tok::kw__ExtInt, tok::kw__BitInt) &&
          "Not an extended int type");
@@ -4680,6 +4732,39 @@ void Parser::ParseDeclarationSpecifiers(
   }
 }
 
+static void DiagnoseCountAttributedTypeInUnnamedAnon(ParsingDeclSpec &DS,
+                                                     Parser &P) {
+
+  if (DS.getTypeSpecType() != DeclSpec::TST_struct)
+    return;
+
+  auto *RD = dyn_cast<RecordDecl>(DS.getRepAsDecl());
+  // We're only interested in unnamed, non-anonymous struct
+  if (!RD || !RD->getName().empty() || RD->isAnonymousStructOrUnion())
+    return;
+
+  for (auto *I : RD->decls()) {
+    auto *VD = dyn_cast<ValueDecl>(I);
+    if (!VD)
+      continue;
+
+    auto *CAT = VD->getType()->getAs<CountAttributedType>();
+    if (!CAT)
+      continue;
+
+    for (const auto &DD : CAT->dependent_decls()) {
+      if (!RD->containsDecl(DD.getDecl())) {
+        P.Diag(VD->getBeginLoc(),
+               diag::err_flexible_array_count_not_in_same_struct)
+            << DD.getDecl();
+        P.Diag(DD.getDecl()->getBeginLoc(),
+               diag::note_flexible_array_counted_by_attr_field)
+            << DD.getDecl();
+      }
+    }
+  }
+}
+
 /// ParseStructDeclaration - Parse a struct declaration without the terminating
 /// semicolon.
 ///
@@ -4760,6 +4845,11 @@ void Parser::ParseStructDeclaration(
     } else
       DeclaratorInfo.D.SetIdentifier(nullptr, Tok.getLocation());
 
+    // Here, we now know that the unnamed struct is not an anonymous struct.
+    // Report an error if a counted_by attribute refers to a field in a
+    // different named struct.
+    DiagnoseCountAttributedTypeInUnnamedAnon(DS, *this);
+
     if (TryConsumeToken(tok::colon)) {
       ExprResult Res(ParseConstantExpression());
       if (Res.isInvalid())
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 9fbe214f2bd19..806a0fe3b745e 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1974,6 +1974,191 @@ static bool SemaOpenCLBuiltinToAddr(Sema &S, unsigned BuiltinID,
   return false;
 }
 
+namespace {
+enum PointerAuthOpKind {
+  PAO_Strip,
+  PAO_Sign,
+  PAO_Auth,
+  PAO_SignGeneric,
+  PAO_Discriminator,
+  PAO_BlendPointer,
+  PAO_BlendInteger
+};
+}
+
+static bool checkPointerAuthEnabled(Sema &S, Expr *E) {
+  if (S.getLangOpts().PointerAuthIntrinsics)
+    return false;
+
+  S.Diag(E->getExprLoc(), diag::err_ptrauth_disabled) << E->getSourceRange();
+  return true;
+}
+
+static bool checkPointerAuthKey(Sema &S, Expr *&Arg) {
+  // Convert it to type 'int'.
+  if (convertArgumentToType(S, Arg, S.Context.IntTy))
+    return true;
+
+  // Value-dependent expressions are okay; wait for template instantiation.
+  if (Arg->isValueDependent())
+    return false;
+
+  unsigned KeyValue;
+  return S.checkConstantPointerAuthKey(Arg, KeyValue);
+}
+
+bool Sema::checkConstantPointerAuthKey(Expr *Arg, unsigned &Result) {
+  // Attempt to constant-evaluate the expression.
+  std::optional<llvm::APSInt> KeyValue = Arg->getIntegerConstantExpr(Context);
+  if (!KeyValue) {
+    Diag(Arg->getExprLoc(), diag::err_expr_not_ice)
+        << 0 << Arg->getSourceRange();
+    return true;
+  }
+
+  // Ask the target to validate the key parameter.
+  if (!Context.getTargetInfo().validatePointerAuthKey(*KeyValue)) {
+    llvm::SmallString<32> Value;
+    {
+      llvm::raw_svector_ostream Str(Value);
+      Str << *KeyValue;
+    }
+
+    Diag(Arg->getExprLoc(), diag::err_ptrauth_invalid_key)
+        << Value << Arg->getSourceRange();
+    return true;
+  }
+
+  Result = KeyValue->getZExtValue();
+  return false;
+}
+
+static bool checkPointerAuthValue(Sema &S, Expr *&Arg,
+                                  PointerAuthOpKind OpKind) {
+  if (Arg->hasPlaceholderType()) {
+    ExprResult R = S.CheckPlaceholderExpr(Arg);
+    if (R.isInvalid())
+      return true;
+    Arg = R.get();
+  }
+
+  auto AllowsPointer = [](PointerAuthOpKind OpKind) {
+    return OpKind != PAO_BlendInteger;
+  };
+  auto AllowsInteger = [](PointerAuthOpKind OpKind) {
+    return OpKind == PAO_Discriminator || OpKind == PAO_BlendInteger ||
+           OpKind == PAO_SignGeneric;
+  };
+
+  // Require the value to have the right range of type.
+  QualType ExpectedTy;
+  if (AllowsPointer(OpKind) && Arg->getType()->isPointerType()) {
+    ExpectedTy = Arg->getType().getUnqualifiedType();
+  } else if (AllowsPointer(OpKind) && Arg->getType()->isNullPtrType()) {
+    ExpectedTy = S.Context.VoidPtrTy;
+  } else if (AllowsInteger(OpKind) &&
+             Arg->getType()->isIntegralOrUnscopedEnumerationType()) {
+    ExpectedTy = S.Context.getUIntPtrType();
+
+  } else {
+    // Diagnose the failures.
+    S.Diag(Arg->getExprLoc(), diag::err_ptrauth_value_bad_type)
+        << unsigned(OpKind == PAO_Discriminator  ? 1
+                    : OpKind == PAO_BlendPointer ? 2
+                    : OpKind == PAO_BlendInteger ? 3
+                                                 : 0)
+        << unsigned(AllowsInteger(OpKind) ? (AllowsPointer(OpKind) ? 2 : 1) : 0)
+        << Arg->getType() << Arg->getSourceRange();
+    return true;
+  }
+
+  // Convert to that type.  This should just be an lvalue-to-rvalue
+  // conversion.
+  if (convertArgumentToType(S, Arg, ExpectedTy))
+    return true;
+
+  // Warn about null pointers for non-generic sign and auth operations.
+  if ((OpKind == PAO_Sign || OpKind == PAO_Auth) &&
+      Arg->isNullPointerConstant(S.Context, Expr::NPC_ValueDependentIsNull)) {
+    S.Diag(Arg->getExprLoc(), OpKind == PAO_Sign
+                                  ? diag::warn_ptrauth_sign_null_pointer
+                                  : diag::warn_ptrauth_auth_null_pointer)
+        << Arg->getSourceRange();
+  }
+
+  return false;
+}
+
+static ExprResult SemaPointerAuthStrip(Sema &S, CallExpr *Call) {
+  if (checkArgCount(S, Call, 2))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], PAO_Strip) ||
+      checkPointerAuthKey(S, Call->getArgs()[1]))
+    return ExprError();
+
+  Call->setType(Call->getArgs()[0]->getType());
+  return Call;
+}
+
+static ExprResult SemaPointerAuthBlendDiscriminator(Sema &S, CallExpr *Call) {
+  if (checkArgCount(S, Call, 2))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], PAO_BlendPointer) ||
+      checkPointerAuthValue(S, Call->getArgs()[1], PAO_BlendInteger))
+    return ExprError();
+
+  Call->setType(S.Context.getUIntPtrType());
+  return Call;
+}
+
+static ExprResult SemaPointerAuthSignGenericData(Sema &S, CallExpr *Call) {
+  if (checkArgCount(S, Call, 2))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], PAO_SignGeneric) ||
+      checkPointerAuthValue(S, Call->getArgs()[1], PAO_Discriminator))
+    return ExprError();
+
+  Call->setType(S.Context.getUIntPtrType());
+  return Call;
+}
+
+static ExprResult SemaPointerAuthSignOrAuth(Sema &S, CallExpr *Call,
+                                            PointerAuthOpKind OpKind) {
+  if (checkArgCount(S, Call, 3))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], OpKind) ||
+      checkPointerAuthKey(S, Call->getArgs()[1]) ||
+      checkPointerAuthValue(S, Call->getArgs()[2], PAO_Discriminator))
+    return ExprError();
+
+  Call->setType(Call->getArgs()[0]->getType());
+  return Call;
+}
+
+static ExprResult SemaPointerAuthAuthAndResign(Sema &S, CallExpr *Call) {
+  if (checkArgCount(S, Call, 5))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], PAO_Auth) ||
+      checkPointerAuthKey(S, Call->getArgs()[1]) ||
+      checkPointerAuthValue(S, Call->getArgs()[2], PAO_Discriminator) ||
+      checkPointerAuthKey(S, Call->getArgs()[3]) ||
+      checkPointerAuthValue(S, Call->getArgs()[4], PAO_Discriminator))
+    return ExprError();
+
+  Call->setType(Call->getArgs()[0]->getType());
+  return Call;
+}
+
 static ExprResult SemaBuiltinLaunder(Sema &S, CallExpr *TheCall) {
   if (checkArgCount(S, TheCall, 1))
     return ExprError();
@@ -2686,6 +2871,18 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     }
     break;
   }
+  case Builtin::BI__builtin_ptrauth_strip:
+    return SemaPointerAuthStrip(*this, TheCall);
+  case Builtin::BI__builtin_ptrauth_blend_discriminator:
+    return SemaPointerAuthBlendDiscriminator(*this, TheCall);
+  case Builtin::BI__builtin_ptrauth_sign_unauthenticated:
+    return SemaPointerAuthSignOrAuth(*this, TheCall, PAO_Sign);
+  case Builtin::BI__builtin_ptrauth_auth:
+    return SemaPointerAuthSignOrAuth(*this, TheCall, PAO_Auth);
+  case Builtin::BI__builtin_ptrauth_sign_generic_data:
+    return SemaPointerAuthSignGenericData(*this, TheCall);
+  case Builtin::BI__builtin_ptrauth_auth_and_resign:
+    return SemaPointerAuthAuthAndResign(*this, TheCall);
   // OpenCL v2.0, s6.13.16 - Pipe functions
   case Builtin::BIread_pipe:
   case Builtin::BIwrite_pipe:
@@ -5040,6 +5237,7 @@ static bool isPPC_64Builtin(unsigned BuiltinID) {
   case PPC::BI__builtin_ppc_fetch_and_andlp:
   case PPC::BI__builtin_ppc_fetch_and_orlp:
   case PPC::BI__builtin_ppc_fetch_and_swaplp:
+  case PPC::BI__builtin_ppc_rldimi:
     return true;
   }
   return false;
@@ -5141,8 +5339,10 @@ bool Sema::CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case PPC::BI__builtin_ppc_rlwnm:
     return SemaValueIsRunOfOnes(TheCall, 2);
   case PPC::BI__builtin_ppc_rlwimi:
+    return SemaBuiltinConstantArgRange(TheCall, 2, 0, 31) ||
+           SemaValueIsRunOfOnes(TheCall, 3);
   case PPC::BI__builtin_ppc_rldimi:
-    return SemaBuiltinConstantArg(TheCall, 2, Result) ||
+    return SemaBuiltinConstantArgRange(TheCall, 2, 0, 63) ||
            SemaValueIsRunOfOnes(TheCall, 3);
   case PPC::BI__builtin_ppc_addex: {
     if (SemaBuiltinConstantArgRange(TheCall, 2, 0, 3))
@@ -5281,10 +5481,6 @@ bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) {
                            TheCall->getArg(1)->getEndLoc());
         retValue = true;
       }
-
-      if (!retValue)
-        TheCall->setType(VecTyA->getElementType());
-
       return retValue;
     }
   }
@@ -5298,11 +5494,12 @@ bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) {
   return true;
 }
 
-bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) {
-  QualType ExpectedType = S->Context.FloatTy;
+bool CheckArgsTypesAreCorrect(
+    Sema *S, CallExpr *TheCall, QualType ExpectedType,
+    llvm::function_ref<bool(clang::QualType PassedType)> Check) {
   for (unsigned i = 0; i < TheCall->getNumArgs(); ++i) {
     QualType PassedType = TheCall->getArg(i)->getType();
-    if (!PassedType->hasFloatingRepresentation()) {
+    if (Check(PassedType)) {
       if (auto *VecTyA = PassedType->getAs<VectorType>())
         ExpectedType = S->Context.getVectorType(
             ExpectedType, VecTyA->getNumElements(), VecTyA->getVectorKind());
@@ -5315,6 +5512,45 @@ bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) {
   return false;
 }
 
+bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) {
+  auto checkAllFloatTypes = [](clang::QualType PassedType) -> bool {
+    return !PassedType->hasFloatingRepresentation();
+  };
+  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy,
+                                  checkAllFloatTypes);
+}
+
+bool CheckFloatOrHalfRepresentations(Sema *S, CallExpr *TheCall) {
+  auto checkFloatorHalf = [](clang::QualType PassedType) -> bool {
+    clang::QualType BaseType =
+        PassedType->isVectorType()
+            ? PassedType->getAs<clang::VectorType>()->getElementType()
+            : PassedType;
+    return !BaseType->isHalfType() && !BaseType->isFloat32Type();
+  };
+  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy,
+                                  checkFloatorHalf);
+}
+
+bool CheckNoDoubleVectors(Sema *S, CallExpr *TheCall) {
+  auto checkDoubleVector = [](clang::QualType PassedType) -> bool {
+    if (const auto *VecTy = PassedType->getAs<VectorType>())
+      return VecTy->getElementType()->isDoubleType();
+    return false;
+  };
+  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy,
+                                  checkDoubleVector);
+}
+
+void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
+                                QualType ReturnType) {
+  auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>();
+  if (VecTyA)
+    ReturnType = S->Context.getVectorType(ReturnType, VecTyA->getNumElements(),
+                                          VectorKind::Generic);
+  TheCall->setType(ReturnType);
+}
+
 // Note: returning true in this case results in CheckBuiltinFunctionCall
 // returning an ExprError
 bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
@@ -5324,6 +5560,17 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     break;
   }
+  case Builtin::BI__builtin_hlsl_elementwise_clamp: {
+    if (checkArgCount(*this, TheCall, 3))
+      return true;
+    if (CheckVectorElementCallArgs(this, TheCall))
+      return true;
+    if (SemaBuiltinElementwiseTernaryMath(
+            TheCall, /*CheckForFloatArgs*/
+            TheCall->getArg(0)->getType()->hasFloatingRepresentation()))
+      return true;
+    break;
+  }
   case Builtin::BI__builtin_hlsl_dot: {
     if (checkArgCount(*this, TheCall, 2))
       return true;
@@ -5331,14 +5578,31 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     if (SemaBuiltinVectorToScalarMath(TheCall))
       return true;
+    if (CheckNoDoubleVectors(this, TheCall))
+      return true;
     break;
   }
-  case Builtin::BI__builtin_hlsl_elementwise_rcp:
+  case Builtin::BI__builtin_hlsl_elementwise_rcp: {
+    if (CheckAllArgsHaveFloatRepresentation(this, TheCall))
+      return true;
+    if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
+      return true;
+    break;
+  }
+  case Builtin::BI__builtin_hlsl_elementwise_rsqrt:
   case Builtin::BI__builtin_hlsl_elementwise_frac: {
+    if (CheckFloatOrHalfRepresentations(this, TheCall))
+      return true;
     if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
       return true;
-    if (CheckAllArgsHaveFloatRepresentation(this, TheCall))
+    break;
+  }
+  case Builtin::BI__builtin_hlsl_elementwise_isinf: {
+    if (CheckFloatOrHalfRepresentations(this, TheCall))
       return true;
+    if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
+      return true;
+    SetElementTypeAsReturnType(this, TheCall, this->Context.BoolTy);
     break;
   }
   case Builtin::BI__builtin_hlsl_lerp: {
@@ -5348,7 +5612,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     if (SemaBuiltinElementwiseTernaryMath(TheCall))
       return true;
-    if (CheckAllArgsHaveFloatRepresentation(this, TheCall))
+    if (CheckFloatOrHalfRepresentations(this, TheCall))
       return true;
     break;
   }
@@ -5357,7 +5621,9 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     if (CheckVectorElementCallArgs(this, TheCall))
       return true;
-    if (SemaBuiltinElementwiseTernaryMath(TheCall, /*CheckForFloatArgs*/ false))
+    if (SemaBuiltinElementwiseTernaryMath(
+            TheCall, /*CheckForFloatArgs*/
+            TheCall->getArg(0)->getType()->hasFloatingRepresentation()))
       return true;
   }
   }
@@ -16674,12 +16940,26 @@ static void AnalyzeImplicitConversions(
         BO->getRHS()->isKnownToHaveBooleanValue() &&
         BO->getLHS()->HasSideEffects(S.Context) &&
         BO->getRHS()->HasSideEffects(S.Context)) {
-      S.Diag(BO->getBeginLoc(), diag::warn_bitwise_instead_of_logical)
-          << (BO->getOpcode() == BO_And ? "&" : "|") << OrigE->getSourceRange()
-          << FixItHint::CreateReplacement(
-                 BO->getOperatorLoc(),
-                 (BO->getOpcode() == BO_And ? "&&" : "||"));
-      S.Diag(BO->getBeginLoc(), diag::note_cast_operand_to_int);
+      SourceManager &SM = S.getSourceManager();
+      const LangOptions &LO = S.getLangOpts();
+      SourceLocation BLoc = BO->getOperatorLoc();
+      SourceLocation ELoc = Lexer::getLocForEndOfToken(BLoc, 0, SM, LO);
+      StringRef SR = clang::Lexer::getSourceText(
+          clang::CharSourceRange::getTokenRange(BLoc, ELoc), SM, LO);
+      // To reduce false positives, only issue the diagnostic if the operator
+      // is explicitly spelled as a punctuator. This suppresses the diagnostic
+      // when using 'bitand' or 'bitor' either as keywords in C++ or as macros
+      // in C, along with other macro spellings the user might invent.
+      if (SR.str() == "&" || SR.str() == "|") {
+
+        S.Diag(BO->getBeginLoc(), diag::warn_bitwise_instead_of_logical)
+            << (BO->getOpcode() == BO_And ? "&" : "|")
+            << OrigE->getSourceRange()
+            << FixItHint::CreateReplacement(
+                   BO->getOperatorLoc(),
+                   (BO->getOpcode() == BO_And ? "&&" : "||"));
+        S.Diag(BO->getBeginLoc(), diag::note_cast_operand_to_int);
+      }
     }
 
   // For conditional operators, we analyze the arguments as if they
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index a8e387e35fb4c..1c546e9f5894f 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -692,11 +692,15 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD,
   // A lambda conversion operator has the same constraints as the call operator
   // and constraints checking relies on whether we are in a lambda call operator
   // (and may refer to its parameters), so check the call operator instead.
+  // Note that the declarations outside of the lambda should also be
+  // considered. Turning on the 'ForOverloadResolution' flag results in the
+  // LocalInstantiationScope not looking into its parents, but we can still
+  // access Decls from the parents while building a lambda RAII scope later.
   if (const auto *MD = dyn_cast<CXXConversionDecl>(FD);
       MD && isLambdaConversionOperator(const_cast<CXXConversionDecl *>(MD)))
     return CheckFunctionConstraints(MD->getParent()->getLambdaCallOperator(),
                                     Satisfaction, UsageLoc,
-                                    ForOverloadResolution);
+                                    /*ShouldAddDeclsFromParentScope=*/true);
 
   DeclContext *CtxToSave = const_cast<FunctionDecl *>(FD);
 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 69e9a28dabdc8..59d3670649ae3 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -1192,9 +1192,13 @@ Sema::NameClassification Sema::ClassifyName(Scope *S, CXXScopeSpec &SS,
     return ParsedType::make(T);
   }
 
-  if (isa<ConceptDecl>(FirstDecl))
+  if (isa<ConceptDecl>(FirstDecl)) {
+    // We want to preserve the UsingShadowDecl for concepts.
+    if (auto *USD = dyn_cast<UsingShadowDecl>(Result.getRepresentativeDecl()))
+      return NameClassification::Concept(TemplateName(USD));
     return NameClassification::Concept(
         TemplateName(cast<TemplateDecl>(FirstDecl)));
+  }
 
   if (auto *EmptyD = dyn_cast<UnresolvedUsingIfExistsDecl>(FirstDecl)) {
     (void)DiagnoseUseOfDecl(EmptyD, NameLoc);
@@ -2267,12 +2271,6 @@ void Sema::ActOnPopScope(SourceLocation Loc, Scope *S) {
       }
       ShadowingDecls.erase(ShadowI);
     }
-
-    if (!getLangOpts().CPlusPlus && S->isClassScope()) {
-      if (auto *FD = dyn_cast<FieldDecl>(TmpD);
-          FD && FD->hasAttr<CountedByAttr>())
-        CheckCountedByAttr(S, FD);
-    }
   }
 
   llvm::sort(DeclDiags,
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index c76b280e29406..49af476dc529e 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -11501,133 +11501,114 @@ static void handleZeroCallUsedRegsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(ZeroCallUsedRegsAttr::Create(S.Context, Kind, AL));
 }
 
-static void handleCountedByAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  if (!AL.isArgIdent(0)) {
-    S.Diag(AL.getLoc(), diag::err_attribute_argument_type)
-        << AL << AANT_ArgumentIdentifier;
-    return;
+static const RecordDecl *GetEnclosingNamedOrTopAnonRecord(const FieldDecl *FD) {
+  const auto *RD = FD->getParent();
+  // An unnamed struct is anonymous struct only if it's not instantiated.
+  // However, the struct may not be fully processed yet to determine
+  // whether it's anonymous or not. In that case, this function treats it as
+  // an anonymous struct and tries to find a named parent.
+  while (RD && (RD->isAnonymousStructOrUnion() ||
+                (!RD->isCompleteDefinition() && RD->getName().empty()))) {
+    const auto *Parent = dyn_cast<RecordDecl>(RD->getParent());
+    if (!Parent)
+      break;
+    RD = Parent;
   }
-
-  IdentifierLoc *IL = AL.getArgAsIdent(0);
-  CountedByAttr *CBA =
-      ::new (S.Context) CountedByAttr(S.Context, AL, IL->Ident);
-  CBA->setCountedByFieldLoc(IL->Loc);
-  D->addAttr(CBA);
+  return RD;
 }
 
-static const FieldDecl *
-FindFieldInTopLevelOrAnonymousStruct(const RecordDecl *RD,
-                                     const IdentifierInfo *FieldName) {
-  for (const Decl *D : RD->decls()) {
-    if (const auto *FD = dyn_cast<FieldDecl>(D))
-      if (FD->getName() == FieldName->getName())
-        return FD;
-
-    if (const auto *R = dyn_cast<RecordDecl>(D))
-      if (const FieldDecl *FD =
-              FindFieldInTopLevelOrAnonymousStruct(R, FieldName))
-        return FD;
+static bool
+CheckCountExpr(Sema &S, FieldDecl *FD, Expr *E,
+               llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
+  if (FD->getParent()->isUnion()) {
+    S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_in_union)
+        << FD->getSourceRange();
+    return true;
   }
 
-  return nullptr;
-}
+  if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
+    S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
+        << E->getSourceRange();
+    return true;
+  }
 
-bool Sema::CheckCountedByAttr(Scope *S, const FieldDecl *FD) {
   LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
       LangOptions::StrictFlexArraysLevelKind::IncompleteOnly;
-  if (!Decl::isFlexibleArrayMemberLike(Context, FD, FD->getType(),
+
+  if (!Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FD->getType(),
                                        StrictFlexArraysLevel, true)) {
     // The "counted_by" attribute must be on a flexible array member.
     SourceRange SR = FD->getLocation();
-    Diag(SR.getBegin(), diag::err_counted_by_attr_not_on_flexible_array_member)
+    S.Diag(SR.getBegin(),
+           diag::err_counted_by_attr_not_on_flexible_array_member)
         << SR;
     return true;
   }
 
-  const auto *CBA = FD->getAttr<CountedByAttr>();
-  const IdentifierInfo *FieldName = CBA->getCountedByField();
-
-  auto GetNonAnonStructOrUnion = [](const RecordDecl *RD) {
-    while (RD && !RD->getDeclName())
-      if (const auto *R = dyn_cast<RecordDecl>(RD->getDeclContext()))
-        RD = R;
-      else
-        break;
-
-    return RD;
-  };
-
-  const RecordDecl *EnclosingRD = GetNonAnonStructOrUnion(FD->getParent());
-  const FieldDecl *CountFD =
-      FindFieldInTopLevelOrAnonymousStruct(EnclosingRD, FieldName);
+  auto *DRE = dyn_cast<DeclRefExpr>(E);
+  if (!DRE) {
+    S.Diag(E->getBeginLoc(),
+           diag::err_counted_by_attr_only_support_simple_decl_reference)
+        << E->getSourceRange();
+    return true;
+  }
 
+  auto *CountDecl = DRE->getDecl();
+  FieldDecl *CountFD = dyn_cast<FieldDecl>(CountDecl);
+  if (auto *IFD = dyn_cast<IndirectFieldDecl>(CountDecl)) {
+    CountFD = IFD->getAnonField();
+  }
   if (!CountFD) {
-    DeclarationNameInfo NameInfo(FieldName,
-                                 CBA->getCountedByFieldLoc().getBegin());
-    LookupResult MemResult(*this, NameInfo, Sema::LookupMemberName);
-    LookupName(MemResult, S);
-
-    if (!MemResult.empty()) {
-      SourceRange SR = CBA->getCountedByFieldLoc();
-      Diag(SR.getBegin(), diag::err_flexible_array_count_not_in_same_struct)
-          << CBA->getCountedByField() << SR;
-
-      if (auto *ND = MemResult.getAsSingle<NamedDecl>()) {
-        SR = ND->getLocation();
-        Diag(SR.getBegin(), diag::note_flexible_array_counted_by_attr_field)
-            << ND << SR;
-      }
+    S.Diag(E->getBeginLoc(), diag::err_counted_by_must_be_in_structure)
+        << CountDecl << E->getSourceRange();
 
-      return true;
-    } else {
-      // The "counted_by" field needs to exist in the struct.
-      LookupResult OrdResult(*this, NameInfo, Sema::LookupOrdinaryName);
-      LookupName(OrdResult, S);
-
-      if (!OrdResult.empty()) {
-        SourceRange SR = FD->getLocation();
-        Diag(SR.getBegin(), diag::err_counted_by_must_be_in_structure)
-            << FieldName << SR;
-
-        if (auto *ND = OrdResult.getAsSingle<NamedDecl>()) {
-          SR = ND->getLocation();
-          Diag(SR.getBegin(), diag::note_flexible_array_counted_by_attr_field)
-              << ND << SR;
-        }
+    S.Diag(CountDecl->getBeginLoc(),
+           diag::note_flexible_array_counted_by_attr_field)
+        << CountDecl << CountDecl->getSourceRange();
+    return true;
+  }
 
-        return true;
-      }
+  if (FD->getParent() != CountFD->getParent()) {
+    if (CountFD->getParent()->isUnion()) {
+      S.Diag(CountFD->getBeginLoc(), diag::err_counted_by_attr_refer_to_union)
+          << CountFD->getSourceRange();
+      return true;
+    }
+    // Whether CountRD is an anonymous struct is not determined at this
+    // point. Thus, an additional diagnostic in case it's not anonymous struct
+    // is done later in `Parser::ParseStructDeclaration`.
+    auto *RD = GetEnclosingNamedOrTopAnonRecord(FD);
+    auto *CountRD = GetEnclosingNamedOrTopAnonRecord(CountFD);
+
+    if (RD != CountRD) {
+      S.Diag(E->getBeginLoc(),
+             diag::err_flexible_array_count_not_in_same_struct)
+          << CountFD << E->getSourceRange();
+      S.Diag(CountFD->getBeginLoc(),
+             diag::note_flexible_array_counted_by_attr_field)
+          << CountFD << CountFD->getSourceRange();
+      return true;
     }
-
-    CXXScopeSpec SS;
-    DeclFilterCCC<FieldDecl> Filter(FieldName);
-    return DiagnoseEmptyLookup(S, SS, MemResult, Filter, nullptr, std::nullopt,
-                               const_cast<DeclContext *>(FD->getDeclContext()));
   }
 
-  if (CountFD->hasAttr<CountedByAttr>()) {
-    // The "counted_by" field can't point to the flexible array member.
-    SourceRange SR = CBA->getCountedByFieldLoc();
-    Diag(SR.getBegin(), diag::err_counted_by_attr_refers_to_flexible_array)
-        << CBA->getCountedByField() << SR;
-    return true;
-  }
+  Decls.push_back(TypeCoupledDeclRefInfo(CountFD, /*IsDref*/ false));
+  return false;
+}
 
-  if (!CountFD->getType()->isIntegerType() ||
-      CountFD->getType()->isBooleanType()) {
-    // The "counted_by" field must have an integer type.
-    SourceRange SR = CBA->getCountedByFieldLoc();
-    Diag(SR.getBegin(),
-         diag::err_flexible_array_counted_by_attr_field_not_integer)
-        << CBA->getCountedByField() << SR;
+static void handleCountedByAttrField(Sema &S, Decl *D, const ParsedAttr &AL) {
+  auto *FD = dyn_cast<FieldDecl>(D);
+  assert(FD);
 
-    SR = CountFD->getLocation();
-    Diag(SR.getBegin(), diag::note_flexible_array_counted_by_attr_field)
-        << CountFD << SR;
-    return true;
-  }
+  auto *CountExpr = AL.getArgAsExpr(0);
+  if (!CountExpr)
+    return;
 
-  return false;
+  llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
+  if (CheckCountExpr(S, FD, CountExpr, Decls))
+    return;
+
+  QualType CAT = S.BuildCountAttributedArrayType(FD->getType(), CountExpr);
+  FD->setType(CAT);
 }
 
 static void handleFunctionReturnThunksAttr(Sema &S, Decl *D,
@@ -12857,7 +12838,7 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
     break;
 
   case ParsedAttr::AT_CountedBy:
-    handleCountedByAttr(S, D, AL);
+    handleCountedByAttrField(S, D, AL);
     break;
 
   // Microsoft attributes:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index df7066d876552..6b6d607852fdb 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -1167,12 +1167,13 @@ ExprResult Sema::DefaultVariadicArgumentPromotion(Expr *E, VariadicCallType CT,
   return E;
 }
 
-/// Converts an integer to complex float type.  Helper function of
+/// Convert complex integers to complex floats and real integers to
+/// real floats as required for complex arithmetic. Helper function of
 /// UsualArithmeticConversions()
 ///
 /// \return false if the integer expression is an integer type and is
-/// successfully converted to the complex type.
-static bool handleIntegerToComplexFloatConversion(Sema &S, ExprResult &IntExpr,
+/// successfully converted to the (complex) float type.
+static bool handleComplexIntegerToFloatConversion(Sema &S, ExprResult &IntExpr,
                                                   ExprResult &ComplexExpr,
                                                   QualType IntTy,
                                                   QualType ComplexTy,
@@ -1182,8 +1183,6 @@ static bool handleIntegerToComplexFloatConversion(Sema &S, ExprResult &IntExpr,
   if (IntTy->isIntegerType()) {
     QualType fpTy = ComplexTy->castAs<ComplexType>()->getElementType();
     IntExpr = S.ImpCastExprToType(IntExpr.get(), fpTy, CK_IntegralToFloating);
-    IntExpr = S.ImpCastExprToType(IntExpr.get(), ComplexTy,
-                                  CK_FloatingRealToComplex);
   } else {
     assert(IntTy->isComplexIntegerType());
     IntExpr = S.ImpCastExprToType(IntExpr.get(), ComplexTy,
@@ -1228,11 +1227,11 @@ static QualType handleComplexFloatConversion(Sema &S, ExprResult &Shorter,
 static QualType handleComplexConversion(Sema &S, ExprResult &LHS,
                                         ExprResult &RHS, QualType LHSType,
                                         QualType RHSType, bool IsCompAssign) {
-  // if we have an integer operand, the result is the complex type.
-  if (!handleIntegerToComplexFloatConversion(S, RHS, LHS, RHSType, LHSType,
+  // Handle (complex) integer types.
+  if (!handleComplexIntegerToFloatConversion(S, RHS, LHS, RHSType, LHSType,
                                              /*SkipCast=*/false))
     return LHSType;
-  if (!handleIntegerToComplexFloatConversion(S, LHS, RHS, LHSType, RHSType,
+  if (!handleComplexIntegerToFloatConversion(S, LHS, RHS, LHSType, RHSType,
                                              /*SkipCast=*/IsCompAssign))
     return RHSType;
 
@@ -2806,6 +2805,24 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
     return ActOnDependentIdExpression(SS, TemplateKWLoc, NameInfo,
                                       IsAddressOfOperand, TemplateArgs);
 
+  // BoundsSafety: This specially handles arguments of bounds attributes
+  // appertains to a type of C struct field such that the name lookup
+  // within a struct finds the member name, which is not the case for other
+  // contexts in C.
+  if (isBoundsAttrContext() && !getLangOpts().CPlusPlus && S->isClassScope()) {
+    // See if this is reference to a field of struct.
+    LookupResult R(*this, NameInfo, LookupMemberName);
+    // LookupParsedName handles a name lookup from within anonymous struct.
+    if (LookupParsedName(R, S, &SS)) {
+      if (auto *VD = dyn_cast<ValueDecl>(R.getFoundDecl())) {
+        QualType type = VD->getType().getNonReferenceType();
+        // This will eventually be translated into MemberExpr upon
+        // the use of instantiated struct fields.
+        return BuildDeclRefExpr(VD, type, VK_PRValue, NameLoc);
+      }
+    }
+  }
+
   // Perform the required lookup.
   LookupResult R(*this, NameInfo,
                  (Id.getKind() == UnqualifiedIdKind::IK_ImplicitSelfParam)
@@ -2962,7 +2979,8 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
   // to get this right here so that we don't end up making a
   // spuriously dependent expression if we're inside a dependent
   // instance method.
-  if (!R.empty() && (*R.begin())->isCXXClassMember()) {
+  if (getLangOpts().CPlusPlus && !R.empty() &&
+      (*R.begin())->isCXXClassMember()) {
     bool MightBeImplicitMember;
     if (!IsAddressOfOperand)
       MightBeImplicitMember = true;
@@ -3618,7 +3636,8 @@ ExprResult Sema::BuildDeclarationNameExpr(
   case Decl::Field:
   case Decl::IndirectField:
   case Decl::ObjCIvar:
-    assert(getLangOpts().CPlusPlus && "building reference to field in C?");
+    assert((getLangOpts().CPlusPlus || isBoundsAttrContext()) &&
+           "building reference to field in C?");
 
     // These can't have reference type in well-formed programs, but
     // for internal consistency we do this anyway.
@@ -3809,7 +3828,10 @@ ExprResult Sema::BuildPredefinedExpr(SourceLocation Loc,
   else {
     // Pre-defined identifiers are of type char[x], where x is the length of
     // the string.
-    auto Str = PredefinedExpr::ComputeName(IK, currentDecl);
+    bool ForceElaboratedPrinting =
+        IK == PredefinedIdentKind::Function && getLangOpts().MSVCCompat;
+    auto Str =
+        PredefinedExpr::ComputeName(IK, currentDecl, ForceElaboratedPrinting);
     unsigned Length = Str.length();
 
     llvm::APInt LengthI(32, Length + 1);
@@ -4812,6 +4834,7 @@ static void captureVariablyModifiedType(ASTContext &Context, QualType T,
     case Type::BTFTagAttributed:
     case Type::SubstTemplateTypeParm:
     case Type::MacroQualified:
+    case Type::CountAttributed:
       // Keep walking after single level desugaring.
       T = T.getSingleStepDesugaredType(Context);
       break;
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 859e2d53d68b5..763c4c9c8e654 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -14368,7 +14368,8 @@ StmtResult Sema::ActOnOpenMPTargetParallelForSimdDirective(
   // The point of exit cannot be a branch out of the structured block.
   // longjmp() and throw() must not violate the entry/exit criteria.
   CS->getCapturedDecl()->setNothrow();
-  for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel_for);
+  for (int ThisCaptureLevel =
+           getOpenMPCaptureLevels(OMPD_target_parallel_for_simd);
        ThisCaptureLevel > 1; --ThisCaptureLevel) {
     CS = cast<CapturedStmt>(CS->getCapturedStmt());
     // 1.2.2 OpenMP Language Terminology
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 51e8db2dfbaac..005529a53270c 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1156,6 +1156,7 @@ bool Sema::BuildTypeConstraint(const CXXScopeSpec &SS,
 
   TemplateName TN = TypeConstr->Template.get();
   ConceptDecl *CD = cast<ConceptDecl>(TN.getAsTemplateDecl());
+  UsingShadowDecl *USD = TN.getAsUsingShadowDecl();
 
   DeclarationNameInfo ConceptName(DeclarationName(TypeConstr->Name),
                                   TypeConstr->TemplateNameLoc);
@@ -1174,15 +1175,15 @@ bool Sema::BuildTypeConstraint(const CXXScopeSpec &SS,
   }
   return AttachTypeConstraint(
       SS.isSet() ? SS.getWithLocInContext(Context) : NestedNameSpecifierLoc(),
-      ConceptName, CD,
+      ConceptName, CD, /*FoundDecl=*/USD ? cast<NamedDecl>(USD) : CD,
       TypeConstr->LAngleLoc.isValid() ? &TemplateArgs : nullptr,
       ConstrainedParameter, EllipsisLoc);
 }
 
-template<typename ArgumentLocAppender>
+template <typename ArgumentLocAppender>
 static ExprResult formImmediatelyDeclaredConstraint(
     Sema &S, NestedNameSpecifierLoc NS, DeclarationNameInfo NameInfo,
-    ConceptDecl *NamedConcept, SourceLocation LAngleLoc,
+    ConceptDecl *NamedConcept, NamedDecl *FoundDecl, SourceLocation LAngleLoc,
     SourceLocation RAngleLoc, QualType ConstrainedType,
     SourceLocation ParamNameLoc, ArgumentLocAppender Appender,
     SourceLocation EllipsisLoc) {
@@ -1203,7 +1204,8 @@ static ExprResult formImmediatelyDeclaredConstraint(
   SS.Adopt(NS);
   ExprResult ImmediatelyDeclaredConstraint = S.CheckConceptTemplateId(
       SS, /*TemplateKWLoc=*/SourceLocation(), NameInfo,
-      /*FoundDecl=*/NamedConcept, NamedConcept, &ConstraintArgs);
+      /*FoundDecl=*/FoundDecl ? FoundDecl : NamedConcept, NamedConcept,
+      &ConstraintArgs);
   if (ImmediatelyDeclaredConstraint.isInvalid() || !EllipsisLoc.isValid())
     return ImmediatelyDeclaredConstraint;
 
@@ -1233,7 +1235,7 @@ static ExprResult formImmediatelyDeclaredConstraint(
 /// of arguments for the named concept).
 bool Sema::AttachTypeConstraint(NestedNameSpecifierLoc NS,
                                 DeclarationNameInfo NameInfo,
-                                ConceptDecl *NamedConcept,
+                                ConceptDecl *NamedConcept, NamedDecl *FoundDecl,
                                 const TemplateArgumentListInfo *TemplateArgs,
                                 TemplateTypeParmDecl *ConstrainedParameter,
                                 SourceLocation EllipsisLoc) {
@@ -1246,24 +1248,24 @@ bool Sema::AttachTypeConstraint(NestedNameSpecifierLoc NS,
 
   QualType ParamAsArgument(ConstrainedParameter->getTypeForDecl(), 0);
 
-  ExprResult ImmediatelyDeclaredConstraint =
-      formImmediatelyDeclaredConstraint(
-          *this, NS, NameInfo, NamedConcept,
-          TemplateArgs ? TemplateArgs->getLAngleLoc() : SourceLocation(),
-          TemplateArgs ? TemplateArgs->getRAngleLoc() : SourceLocation(),
-          ParamAsArgument, ConstrainedParameter->getLocation(),
-          [&] (TemplateArgumentListInfo &ConstraintArgs) {
-            if (TemplateArgs)
-              for (const auto &ArgLoc : TemplateArgs->arguments())
-                ConstraintArgs.addArgument(ArgLoc);
-          }, EllipsisLoc);
+  ExprResult ImmediatelyDeclaredConstraint = formImmediatelyDeclaredConstraint(
+      *this, NS, NameInfo, NamedConcept, FoundDecl,
+      TemplateArgs ? TemplateArgs->getLAngleLoc() : SourceLocation(),
+      TemplateArgs ? TemplateArgs->getRAngleLoc() : SourceLocation(),
+      ParamAsArgument, ConstrainedParameter->getLocation(),
+      [&](TemplateArgumentListInfo &ConstraintArgs) {
+        if (TemplateArgs)
+          for (const auto &ArgLoc : TemplateArgs->arguments())
+            ConstraintArgs.addArgument(ArgLoc);
+      },
+      EllipsisLoc);
   if (ImmediatelyDeclaredConstraint.isInvalid())
     return true;
 
   auto *CL = ConceptReference::Create(Context, /*NNS=*/NS,
                                       /*TemplateKWLoc=*/SourceLocation{},
                                       /*ConceptNameInfo=*/NameInfo,
-                                      /*FoundDecl=*/NamedConcept,
+                                      /*FoundDecl=*/FoundDecl,
                                       /*NamedConcept=*/NamedConcept,
                                       /*ArgsWritten=*/ArgsAsWritten);
   ConstrainedParameter->setTypeConstraint(CL,
@@ -1293,8 +1295,9 @@ bool Sema::AttachTypeConstraint(AutoTypeLoc TL,
     return true;
   ExprResult ImmediatelyDeclaredConstraint = formImmediatelyDeclaredConstraint(
       *this, TL.getNestedNameSpecifierLoc(), TL.getConceptNameInfo(),
-      TL.getNamedConcept(), TL.getLAngleLoc(), TL.getRAngleLoc(),
-      BuildDecltypeType(Ref), OrigConstrainedParm->getLocation(),
+      TL.getNamedConcept(), /*FoundDecl=*/TL.getFoundDecl(), TL.getLAngleLoc(),
+      TL.getRAngleLoc(), BuildDecltypeType(Ref),
+      OrigConstrainedParm->getLocation(),
       [&](TemplateArgumentListInfo &ConstraintArgs) {
         for (unsigned I = 0, C = TL.getNumArgs(); I != C; ++I)
           ConstraintArgs.addArgument(TL.getArgLoc(I));
@@ -5415,7 +5418,7 @@ ExprResult Sema::BuildTemplateIdExpr(const CXXScopeSpec &SS,
 
   if (R.getAsSingle<ConceptDecl>()) {
     return CheckConceptTemplateId(SS, TemplateKWLoc, R.getLookupNameInfo(),
-                                  R.getFoundDecl(),
+                                  R.getRepresentativeDecl(),
                                   R.getAsSingle<ConceptDecl>(), TemplateArgs);
   }
 
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 451fab3816446..de39df731f6bc 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -3069,7 +3069,8 @@ bool Sema::SubstTypeConstraint(
   }
   return AttachTypeConstraint(
       TC->getNestedNameSpecifierLoc(), TC->getConceptNameInfo(),
-      TC->getNamedConcept(), &InstArgs, Inst,
+      TC->getNamedConcept(),
+      /*FoundDecl=*/TC->getConceptReference()->getFoundDecl(), &InstArgs, Inst,
       Inst->isParameterPack()
           ? cast<CXXFoldExpr>(TC->getImmediatelyDeclaredConstraint())
                 ->getEllipsisLoc()
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 9c48e43d85357..5d742e3292873 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -2245,6 +2245,7 @@ Decl *TemplateDeclInstantiator::VisitClassTemplateDecl(ClassTemplateDecl *D) {
     assert(!Owner->isDependentContext());
     Inst->setLexicalDeclContext(Owner);
     RecordInst->setLexicalDeclContext(Owner);
+    Inst->setObjectOfFriendDecl();
 
     if (PrevClassTemplate) {
       Inst->setCommonPtr(PrevClassTemplate->getCommonPtr());
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 8d63661688d3c..5c6f6ab94c3a9 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -3514,7 +3514,7 @@ InventTemplateParameter(TypeProcessingState &state, QualType T,
       if (!Invalid) {
         S.AttachTypeConstraint(
             AutoLoc.getNestedNameSpecifierLoc(), AutoLoc.getConceptNameInfo(),
-            AutoLoc.getNamedConcept(),
+            AutoLoc.getNamedConcept(), /*FoundDecl=*/AutoLoc.getFoundDecl(),
             AutoLoc.hasExplicitTemplateArgs() ? &TAL : nullptr,
             InventedTemplateParam, D.getEllipsisLoc());
       }
@@ -3540,11 +3540,17 @@ InventTemplateParameter(TypeProcessingState &state, QualType T,
         }
       }
       if (!Invalid) {
+        UsingShadowDecl *USD =
+            TemplateId->Template.get().getAsUsingShadowDecl();
+        auto *CD =
+            cast<ConceptDecl>(TemplateId->Template.get().getAsTemplateDecl());
         S.AttachTypeConstraint(
             D.getDeclSpec().getTypeSpecScope().getWithLocInContext(S.Context),
             DeclarationNameInfo(DeclarationName(TemplateId->Name),
                                 TemplateId->TemplateNameLoc),
-            cast<ConceptDecl>(TemplateId->Template.get().getAsTemplateDecl()),
+            CD,
+            /*FoundDecl=*/
+            USD ? cast<NamedDecl>(USD) : CD,
             TemplateId->LAngleLoc.isValid() ? &TemplateArgsInfo : nullptr,
             InventedTemplateParam, D.getEllipsisLoc());
       }
@@ -6449,9 +6455,12 @@ namespace {
       DeclarationNameInfo DNI = DeclarationNameInfo(
           TL.getTypePtr()->getTypeConstraintConcept()->getDeclName(),
           TemplateId->TemplateNameLoc);
+      auto TN = TemplateId->Template.get();
       auto *CR = ConceptReference::Create(
           Context, NNS, TemplateId->TemplateKWLoc, DNI,
-          /*FoundDecl=*/nullptr,
+          /*FoundDecl=*/TN.getKind() == TemplateName::NameKind::UsingTemplate
+              ? cast<NamedDecl>(TN.getAsUsingShadowDecl())
+              : cast_if_present<NamedDecl>(TN.getAsTemplateDecl()),
           /*NamedDecl=*/TL.getTypePtr()->getTypeConstraintConcept(),
           ASTTemplateArgumentListInfo::Create(Context, TemplateArgsInfo));
       TL.setConceptReference(CR);
@@ -6520,6 +6529,9 @@ namespace {
     void VisitAttributedTypeLoc(AttributedTypeLoc TL) {
       fillAttributedTypeLoc(TL, State);
     }
+    void VisitCountAttributedTypeLoc(CountAttributedTypeLoc TL) {
+      // nothing
+    }
     void VisitBTFTagAttributedTypeLoc(BTFTagAttributedTypeLoc TL) {
       // nothing
     }
@@ -9837,6 +9849,26 @@ QualType Sema::BuildTypeofExprType(Expr *E, TypeOfKind Kind) {
   return Context.getTypeOfExprType(E, Kind);
 }
 
+static void
+BuildTypeCoupledDecls(Expr *E,
+                      llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
+  // Currently, 'counted_by' only allows direct DeclRefExpr to FieldDecl.
+  auto *CountDecl = cast<DeclRefExpr>(E)->getDecl();
+  Decls.push_back(TypeCoupledDeclRefInfo(CountDecl, /*IsDref*/ false));
+}
+
+QualType Sema::BuildCountAttributedArrayType(QualType WrappedTy,
+                                             Expr *CountExpr) {
+  assert(WrappedTy->isIncompleteArrayType());
+
+  llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
+  BuildTypeCoupledDecls(CountExpr, Decls);
+  /// When the resulting expression is invalid, we still create the AST using
+  /// the original count expression for the sake of AST dump.
+  return Context.getCountAttributedType(
+      WrappedTy, CountExpr, /*CountInBytes*/ false, /*OrNull*/ false, Decls);
+}
+
 /// getDecltypeForExpr - Given an expr, will return the decltype for
 /// that expression, according to the rules in C++11
 /// [dcl.type.simple]p4 and C++11 [expr.lambda.prim]p18.
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 1bc5f82f6058f..b665f6d6b8be2 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7297,6 +7297,34 @@ QualType TreeTransform<Derived>::TransformAttributedType(TypeLocBuilder &TLB,
       });
 }
 
+template <typename Derived>
+QualType TreeTransform<Derived>::TransformCountAttributedType(
+    TypeLocBuilder &TLB, CountAttributedTypeLoc TL) {
+  const CountAttributedType *OldTy = TL.getTypePtr();
+  QualType InnerTy = getDerived().TransformType(TLB, TL.getInnerLoc());
+  if (InnerTy.isNull())
+    return QualType();
+
+  Expr *OldCount = TL.getCountExpr();
+  Expr *NewCount = nullptr;
+  if (OldCount) {
+    ExprResult CountResult = getDerived().TransformExpr(OldCount);
+    if (CountResult.isInvalid())
+      return QualType();
+    NewCount = CountResult.get();
+  }
+
+  QualType Result = TL.getType();
+  if (getDerived().AlwaysRebuild() || InnerTy != OldTy->desugar() ||
+      OldCount != NewCount) {
+    // Currently, CountAttributedType can only wrap incomplete array types.
+    Result = SemaRef.BuildCountAttributedArrayType(InnerTy, NewCount);
+  }
+
+  TLB.push<CountAttributedTypeLoc>(Result);
+  return Result;
+}
+
 template <typename Derived>
 QualType TreeTransform<Derived>::TransformBTFTagAttributedType(
     TypeLocBuilder &TLB, BTFTagAttributedTypeLoc TL) {
@@ -13824,6 +13852,16 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) {
 
     // Capturing 'this' is trivial.
     if (C->capturesThis()) {
+      // If this is a lambda that is part of a default member initialiser
+      // and which we're instantiating outside the class that 'this' is
+      // supposed to refer to, adjust the type of 'this' accordingly.
+      //
+      // Otherwise, leave the type of 'this' as-is.
+      Sema::CXXThisScopeRAII ThisScope(
+          getSema(),
+          dyn_cast_if_present<CXXRecordDecl>(
+              getSema().getFunctionLevelDeclContext()),
+          Qualifiers());
       getSema().CheckCXXThisCapture(C->getLocation(), C->isExplicit(),
                                     /*BuildAndDiagnose*/ true, nullptr,
                                     C->getCaptureKind() == LCK_StarThis);
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 3d59a0de315a8..cb5420555545b 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -6995,6 +6995,10 @@ void TypeLocReader::VisitAttributedTypeLoc(AttributedTypeLoc TL) {
   TL.setAttr(ReadAttr());
 }
 
+void TypeLocReader::VisitCountAttributedTypeLoc(CountAttributedTypeLoc TL) {
+  // Nothing to do
+}
+
 void TypeLocReader::VisitBTFTagAttributedTypeLoc(BTFTagAttributedTypeLoc TL) {
   // Nothing to do.
 }
@@ -9123,6 +9127,10 @@ DeclarationNameInfo ASTRecordReader::readDeclarationNameInfo() {
   return NameInfo;
 }
 
+TypeCoupledDeclRefInfo ASTRecordReader::readTypeCoupledDeclRefInfo() {
+  return TypeCoupledDeclRefInfo(readDeclAs<ValueDecl>(), readBool());
+}
+
 void ASTRecordReader::readQualifierInfo(QualifierInfo &Info) {
   Info.QualifierLoc = readNestedNameSpecifierLoc();
   unsigned NumTPLists = readInt();
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 3653d94c6e073..221409d011a38 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -514,6 +514,10 @@ void TypeLocWriter::VisitAttributedTypeLoc(AttributedTypeLoc TL) {
   Record.AddAttr(TL.getAttr());
 }
 
+void TypeLocWriter::VisitCountAttributedTypeLoc(CountAttributedTypeLoc TL) {
+  // Nothing to do
+}
+
 void TypeLocWriter::VisitBTFTagAttributedTypeLoc(BTFTagAttributedTypeLoc TL) {
   // Nothing to do.
 }
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index d04e1c781b4e2..86f64bf2a2425 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -281,11 +281,18 @@ bool clang::CanElideDeclDef(const Decl *D) {
 
     if (FD->isDependentContext())
       return false;
+
+    if (FD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
+      return false;
   }
 
   if (auto *VD = dyn_cast<VarDecl>(D)) {
     if (!VD->getDeclContext()->getRedeclContext()->isFileContext() ||
-        VD->isInline() || VD->isConstexpr() || isa<ParmVarDecl>(VD))
+        VD->isInline() || VD->isConstexpr() || isa<ParmVarDecl>(VD) ||
+        // Constant initialized variable may not affect the ABI, but they
+        // may be used in constant evaluation in the frontend, so we have
+        // to remain them.
+        VD->hasConstantInitialization())
       return false;
 
     if (VD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 29eb932584027..f82288f1099e8 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -83,6 +83,8 @@ class StateUpdateReporter {
     AssumedUpperBound = UpperBoundVal;
   }
 
+  bool assumedNonNegative() { return AssumedNonNegative; }
+
   const NoteTag *createNoteTag(CheckerContext &C) const;
 
 private:
@@ -402,7 +404,8 @@ static bool tryDividePair(std::optional<int64_t> &Val1,
 }
 
 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
-                               NonLoc Offset, NonLoc Extent, SVal Location) {
+                               NonLoc Offset, NonLoc Extent, SVal Location,
+                               bool AlsoMentionUnderflow) {
   std::string RegName = getRegionName(Region);
   const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
   assert(EReg && "this checker only handles element access");
@@ -414,6 +417,7 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
   int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
 
   bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
+  const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
 
   SmallString<256> Buf;
   llvm::raw_svector_ostream Out(Buf);
@@ -421,10 +425,12 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
   if (!ExtentN && !UseByteOffsets)
     Out << "'" << ElemType.getAsString() << "' element in ";
   Out << RegName << " at ";
-  if (OffsetN) {
-    Out << (UseByteOffsets ? "byte offset " : "index ") << *OffsetN;
+  if (AlsoMentionUnderflow) {
+    Out << "a negative or overflowing " << OffsetOrIndex;
+  } else if (OffsetN) {
+    Out << OffsetOrIndex << " " << *OffsetN;
   } else {
-    Out << "an overflowing " << (UseByteOffsets ? "byte offset" : "index");
+    Out << "an overflowing " << OffsetOrIndex;
   }
   if (ExtentN) {
     Out << ", while it holds only ";
@@ -441,17 +447,20 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
       Out << "s";
   }
 
-  return {
-      formatv("Out of bound access to memory after the end of {0}", RegName),
-      std::string(Buf)};
+  return {formatv("Out of bound access to memory {0} {1}",
+                  AlsoMentionUnderflow ? "around" : "after the end of",
+                  RegName),
+          std::string(Buf)};
 }
 
-static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
+static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName,
+                             bool AlsoMentionUnderflow) {
   std::string RegName = getRegionName(Region);
   return {formatv("Potential out of bound access to {0} with tainted {1}",
                   RegName, OffsetName),
-          formatv("Access of {0} with a tainted {1} that may be too large",
-                  RegName, OffsetName)};
+          formatv("Access of {0} with a tainted {1} that may be {2}too large",
+                  RegName, OffsetName,
+                  AlsoMentionUnderflow ? "negative or " : "")};
 }
 
 const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
@@ -600,6 +609,13 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   // CHECK UPPER BOUND
   DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
   if (auto KnownSize = Size.getAs<NonLoc>()) {
+    // In a situation where both overflow and overflow are possible (but the
+    // index is either tainted or known to be invalid), the logic of this
+    // checker will first assume that the offset is non-negative, and then
+    // (with this additional assumption) it will detect an overflow error.
+    // In this situation the warning message should mention both possibilities.
+    bool AlsoMentionUnderflow = SUR.assumedNonNegative();
+
     auto [WithinUpperBound, ExceedsUpperBound] =
         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
 
@@ -615,8 +631,9 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
           return;
         }
 
-        Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
-                                       *KnownSize, Location);
+        Messages Msgs =
+            getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, *KnownSize,
+                           Location, AlsoMentionUnderflow);
         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
         return;
       }
@@ -632,7 +649,7 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
           if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
             OffsetName = "index";
 
-        Messages Msgs = getTaintMsgs(Reg, OffsetName);
+        Messages Msgs = getTaintMsgs(Reg, OffsetName, AlsoMentionUnderflow);
         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
                   /*IsTaintBug=*/true);
         return;
diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
index 66e080adb1382..e4373915410fb 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
@@ -20,48 +20,178 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState_Fwd.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+
+#include <iterator>
+#include <utility>
+#include <variant>
 
 using namespace clang;
 using namespace ento;
 
 namespace {
+
+struct CritSectionMarker {
+  const Expr *LockExpr{};
+  const MemRegion *LockReg{};
+
+  void Profile(llvm::FoldingSetNodeID &ID) const {
+    ID.Add(LockExpr);
+    ID.Add(LockReg);
+  }
+
+  [[nodiscard]] constexpr bool
+  operator==(const CritSectionMarker &Other) const noexcept {
+    return LockExpr == Other.LockExpr && LockReg == Other.LockReg;
+  }
+  [[nodiscard]] constexpr bool
+  operator!=(const CritSectionMarker &Other) const noexcept {
+    return !(*this == Other);
+  }
+};
+
+class CallDescriptionBasedMatcher {
+  CallDescription LockFn;
+  CallDescription UnlockFn;
+
+public:
+  CallDescriptionBasedMatcher(CallDescription &&LockFn,
+                              CallDescription &&UnlockFn)
+      : LockFn(std::move(LockFn)), UnlockFn(std::move(UnlockFn)) {}
+  [[nodiscard]] bool matches(const CallEvent &Call, bool IsLock) const {
+    if (IsLock) {
+      return LockFn.matches(Call);
+    }
+    return UnlockFn.matches(Call);
+  }
+};
+
+class FirstArgMutexDescriptor : public CallDescriptionBasedMatcher {
+public:
+  FirstArgMutexDescriptor(CallDescription &&LockFn, CallDescription &&UnlockFn)
+      : CallDescriptionBasedMatcher(std::move(LockFn), std::move(UnlockFn)) {}
+
+  [[nodiscard]] const MemRegion *getRegion(const CallEvent &Call, bool) const {
+    return Call.getArgSVal(0).getAsRegion();
+  }
+};
+
+class MemberMutexDescriptor : public CallDescriptionBasedMatcher {
+public:
+  MemberMutexDescriptor(CallDescription &&LockFn, CallDescription &&UnlockFn)
+      : CallDescriptionBasedMatcher(std::move(LockFn), std::move(UnlockFn)) {}
+
+  [[nodiscard]] const MemRegion *getRegion(const CallEvent &Call, bool) const {
+    return cast<CXXMemberCall>(Call).getCXXThisVal().getAsRegion();
+  }
+};
+
+class RAIIMutexDescriptor {
+  mutable const IdentifierInfo *Guard{};
+  mutable bool IdentifierInfoInitialized{};
+  mutable llvm::SmallString<32> GuardName{};
+
+  void initIdentifierInfo(const CallEvent &Call) const {
+    if (!IdentifierInfoInitialized) {
+      // In case of checking C code, or when the corresponding headers are not
+      // included, we might end up query the identifier table every time when
+      // this function is called instead of early returning it. To avoid this, a
+      // bool variable (IdentifierInfoInitialized) is used and the function will
+      // be run only once.
+      Guard = &Call.getCalleeAnalysisDeclContext()->getASTContext().Idents.get(
+          GuardName);
+      IdentifierInfoInitialized = true;
+    }
+  }
+
+  template <typename T> bool matchesImpl(const CallEvent &Call) const {
+    const T *C = dyn_cast<T>(&Call);
+    if (!C)
+      return false;
+    const IdentifierInfo *II =
+        cast<CXXRecordDecl>(C->getDecl()->getParent())->getIdentifier();
+    return II == Guard;
+  }
+
+public:
+  RAIIMutexDescriptor(StringRef GuardName) : GuardName(GuardName) {}
+  [[nodiscard]] bool matches(const CallEvent &Call, bool IsLock) const {
+    initIdentifierInfo(Call);
+    if (IsLock) {
+      return matchesImpl<CXXConstructorCall>(Call);
+    }
+    return matchesImpl<CXXDestructorCall>(Call);
+  }
+  [[nodiscard]] const MemRegion *getRegion(const CallEvent &Call,
+                                           bool IsLock) const {
+    const MemRegion *LockRegion = nullptr;
+    if (IsLock) {
+      if (std::optional<SVal> Object = Call.getReturnValueUnderConstruction()) {
+        LockRegion = Object->getAsRegion();
+      }
+    } else {
+      LockRegion = cast<CXXDestructorCall>(Call).getCXXThisVal().getAsRegion();
+    }
+    return LockRegion;
+  }
+};
+
+using MutexDescriptor =
+    std::variant<FirstArgMutexDescriptor, MemberMutexDescriptor,
+                 RAIIMutexDescriptor>;
+
 class BlockInCriticalSectionChecker : public Checker<check::PostCall> {
-  mutable IdentifierInfo *IILockGuard = nullptr;
-  mutable IdentifierInfo *IIUniqueLock = nullptr;
-  mutable bool IdentifierInfoInitialized = false;
-
-  const CallDescription LockFn{{"lock"}};
-  const CallDescription UnlockFn{{"unlock"}};
-  const CallDescription SleepFn{{"sleep"}};
-  const CallDescription GetcFn{{"getc"}};
-  const CallDescription FgetsFn{{"fgets"}};
-  const CallDescription ReadFn{{"read"}};
-  const CallDescription RecvFn{{"recv"}};
-  const CallDescription PthreadLockFn{{"pthread_mutex_lock"}};
-  const CallDescription PthreadTryLockFn{{"pthread_mutex_trylock"}};
-  const CallDescription PthreadUnlockFn{{"pthread_mutex_unlock"}};
-  const CallDescription MtxLock{{"mtx_lock"}};
-  const CallDescription MtxTimedLock{{"mtx_timedlock"}};
-  const CallDescription MtxTryLock{{"mtx_trylock"}};
-  const CallDescription MtxUnlock{{"mtx_unlock"}};
-
-  const llvm::StringLiteral ClassLockGuard{"lock_guard"};
-  const llvm::StringLiteral ClassUniqueLock{"unique_lock"};
+private:
+  const std::array<MutexDescriptor, 8> MutexDescriptors{
+      MemberMutexDescriptor(
+          CallDescription(/*QualifiedName=*/{"std", "mutex", "lock"},
+                          /*RequiredArgs=*/0),
+          CallDescription({"std", "mutex", "unlock"}, 0)),
+      FirstArgMutexDescriptor(CallDescription({"pthread_mutex_lock"}, 1),
+                              CallDescription({"pthread_mutex_unlock"}, 1)),
+      FirstArgMutexDescriptor(CallDescription({"mtx_lock"}, 1),
+                              CallDescription({"mtx_unlock"}, 1)),
+      FirstArgMutexDescriptor(CallDescription({"pthread_mutex_trylock"}, 1),
+                              CallDescription({"pthread_mutex_unlock"}, 1)),
+      FirstArgMutexDescriptor(CallDescription({"mtx_trylock"}, 1),
+                              CallDescription({"mtx_unlock"}, 1)),
+      FirstArgMutexDescriptor(CallDescription({"mtx_timedlock"}, 1),
+                              CallDescription({"mtx_unlock"}, 1)),
+      RAIIMutexDescriptor("lock_guard"),
+      RAIIMutexDescriptor("unique_lock")};
+
+  const std::array<CallDescription, 5> BlockingFunctions{
+      ArrayRef{StringRef{"sleep"}}, ArrayRef{StringRef{"getc"}},
+      ArrayRef{StringRef{"fgets"}}, ArrayRef{StringRef{"read"}},
+      ArrayRef{StringRef{"recv"}}};
 
   const BugType BlockInCritSectionBugType{
       this, "Call to blocking function in critical section", "Blocking Error"};
 
-  void initIdentifierInfo(ASTContext &Ctx) const;
+  void reportBlockInCritSection(const CallEvent &call, CheckerContext &C) const;
 
-  void reportBlockInCritSection(SymbolRef FileDescSym,
-                                const CallEvent &call,
-                                CheckerContext &C) const;
+  [[nodiscard]] const NoteTag *createCritSectionNote(CritSectionMarker M,
+                                                     CheckerContext &C) const;
 
-public:
-  bool isBlockingFunction(const CallEvent &Call) const;
-  bool isLockFunction(const CallEvent &Call) const;
-  bool isUnlockFunction(const CallEvent &Call) const;
+  [[nodiscard]] std::optional<MutexDescriptor>
+  checkDescriptorMatch(const CallEvent &Call, CheckerContext &C,
+                       bool IsLock) const;
+
+  void handleLock(const MutexDescriptor &Mutex, const CallEvent &Call,
+                  CheckerContext &C) const;
 
+  void handleUnlock(const MutexDescriptor &Mutex, const CallEvent &Call,
+                    CheckerContext &C) const;
+
+  [[nodiscard]] bool isBlockingInCritSection(const CallEvent &Call,
+                                             CheckerContext &C) const;
+
+public:
   /// Process unlock.
   /// Process lock.
   /// Process blocking functions (sleep, getc, fgets, read, recv)
@@ -70,73 +200,118 @@ class BlockInCriticalSectionChecker : public Checker<check::PostCall> {
 
 } // end anonymous namespace
 
-REGISTER_TRAIT_WITH_PROGRAMSTATE(MutexCounter, unsigned)
-
-void BlockInCriticalSectionChecker::initIdentifierInfo(ASTContext &Ctx) const {
-  if (!IdentifierInfoInitialized) {
-    /* In case of checking C code, or when the corresponding headers are not
-     * included, we might end up query the identifier table every time when this
-     * function is called instead of early returning it. To avoid this, a bool
-     * variable (IdentifierInfoInitialized) is used and the function will be run
-     * only once. */
-    IILockGuard  = &Ctx.Idents.get(ClassLockGuard);
-    IIUniqueLock = &Ctx.Idents.get(ClassUniqueLock);
-    IdentifierInfoInitialized = true;
-  }
+REGISTER_LIST_WITH_PROGRAMSTATE(ActiveCritSections, CritSectionMarker)
+
+namespace std {
+// Iterator traits for ImmutableList data structure
+// that enable the use of STL algorithms.
+// TODO: Move these to llvm::ImmutableList when overhauling immutable data
+// structures for proper iterator concept support.
+template <>
+struct iterator_traits<
+    typename llvm::ImmutableList<CritSectionMarker>::iterator> {
+  using iterator_category = std::forward_iterator_tag;
+  using value_type = CritSectionMarker;
+  using difference_type = std::ptrdiff_t;
+  using reference = CritSectionMarker &;
+  using pointer = CritSectionMarker *;
+};
+} // namespace std
+
+std::optional<MutexDescriptor>
+BlockInCriticalSectionChecker::checkDescriptorMatch(const CallEvent &Call,
+                                                    CheckerContext &C,
+                                                    bool IsLock) const {
+  const auto Descriptor =
+      llvm::find_if(MutexDescriptors, [&Call, IsLock](auto &&Descriptor) {
+        return std::visit(
+            [&Call, IsLock](auto &&DescriptorImpl) {
+              return DescriptorImpl.matches(Call, IsLock);
+            },
+            Descriptor);
+      });
+  if (Descriptor != MutexDescriptors.end())
+    return *Descriptor;
+  return std::nullopt;
 }
 
-bool BlockInCriticalSectionChecker::isBlockingFunction(const CallEvent &Call) const {
-  return matchesAny(Call, SleepFn, GetcFn, FgetsFn, ReadFn, RecvFn);
+static const MemRegion *getRegion(const CallEvent &Call,
+                                  const MutexDescriptor &Descriptor,
+                                  bool IsLock) {
+  return std::visit(
+      [&Call, IsLock](auto &&Descriptor) {
+        return Descriptor.getRegion(Call, IsLock);
+      },
+      Descriptor);
 }
 
-bool BlockInCriticalSectionChecker::isLockFunction(const CallEvent &Call) const {
-  if (const auto *Ctor = dyn_cast<CXXConstructorCall>(&Call)) {
-    auto IdentifierInfo = Ctor->getDecl()->getParent()->getIdentifier();
-    if (IdentifierInfo == IILockGuard || IdentifierInfo == IIUniqueLock)
-      return true;
-  }
+void BlockInCriticalSectionChecker::handleLock(
+    const MutexDescriptor &LockDescriptor, const CallEvent &Call,
+    CheckerContext &C) const {
+  const MemRegion *MutexRegion =
+      getRegion(Call, LockDescriptor, /*IsLock=*/true);
+  if (!MutexRegion)
+    return;
 
-  return matchesAny(Call, LockFn, PthreadLockFn, PthreadTryLockFn, MtxLock,
-                    MtxTimedLock, MtxTryLock);
+  const CritSectionMarker MarkToAdd{Call.getOriginExpr(), MutexRegion};
+  ProgramStateRef StateWithLockEvent =
+      C.getState()->add<ActiveCritSections>(MarkToAdd);
+  C.addTransition(StateWithLockEvent, createCritSectionNote(MarkToAdd, C));
 }
 
-bool BlockInCriticalSectionChecker::isUnlockFunction(const CallEvent &Call) const {
-  if (const auto *Dtor = dyn_cast<CXXDestructorCall>(&Call)) {
-    const auto *DRecordDecl = cast<CXXRecordDecl>(Dtor->getDecl()->getParent());
-    auto IdentifierInfo = DRecordDecl->getIdentifier();
-    if (IdentifierInfo == IILockGuard || IdentifierInfo == IIUniqueLock)
-      return true;
+void BlockInCriticalSectionChecker::handleUnlock(
+    const MutexDescriptor &UnlockDescriptor, const CallEvent &Call,
+    CheckerContext &C) const {
+  const MemRegion *MutexRegion =
+      getRegion(Call, UnlockDescriptor, /*IsLock=*/false);
+  if (!MutexRegion)
+    return;
+
+  ProgramStateRef State = C.getState();
+  const auto ActiveSections = State->get<ActiveCritSections>();
+  const auto MostRecentLock =
+      llvm::find_if(ActiveSections, [MutexRegion](auto &&Marker) {
+        return Marker.LockReg == MutexRegion;
+      });
+  if (MostRecentLock == ActiveSections.end())
+    return;
+
+  // Build a new ImmutableList without this element.
+  auto &Factory = State->get_context<ActiveCritSections>();
+  llvm::ImmutableList<CritSectionMarker> NewList = Factory.getEmptyList();
+  for (auto It = ActiveSections.begin(), End = ActiveSections.end(); It != End;
+       ++It) {
+    if (It != MostRecentLock)
+      NewList = Factory.add(*It, NewList);
   }
 
-  return matchesAny(Call, UnlockFn, PthreadUnlockFn, MtxUnlock);
+  State = State->set<ActiveCritSections>(NewList);
+  C.addTransition(State);
+}
+
+bool BlockInCriticalSectionChecker::isBlockingInCritSection(
+    const CallEvent &Call, CheckerContext &C) const {
+  return llvm::any_of(BlockingFunctions,
+                      [&Call](auto &&Fn) { return Fn.matches(Call); }) &&
+         !C.getState()->get<ActiveCritSections>().isEmpty();
 }
 
 void BlockInCriticalSectionChecker::checkPostCall(const CallEvent &Call,
                                                   CheckerContext &C) const {
-  initIdentifierInfo(C.getASTContext());
-
-  if (!isBlockingFunction(Call)
-      && !isLockFunction(Call)
-      && !isUnlockFunction(Call))
-    return;
-
-  ProgramStateRef State = C.getState();
-  unsigned mutexCount = State->get<MutexCounter>();
-  if (isUnlockFunction(Call) && mutexCount > 0) {
-    State = State->set<MutexCounter>(--mutexCount);
-    C.addTransition(State);
-  } else if (isLockFunction(Call)) {
-    State = State->set<MutexCounter>(++mutexCount);
-    C.addTransition(State);
-  } else if (mutexCount > 0) {
-    SymbolRef BlockDesc = Call.getReturnValue().getAsSymbol();
-    reportBlockInCritSection(BlockDesc, Call, C);
+  if (isBlockingInCritSection(Call, C)) {
+    reportBlockInCritSection(Call, C);
+  } else if (std::optional<MutexDescriptor> LockDesc =
+                 checkDescriptorMatch(Call, C, /*IsLock=*/true)) {
+    handleLock(*LockDesc, Call, C);
+  } else if (std::optional<MutexDescriptor> UnlockDesc =
+                 checkDescriptorMatch(Call, C, /*IsLock=*/false)) {
+    handleUnlock(*UnlockDesc, Call, C);
   }
 }
 
 void BlockInCriticalSectionChecker::reportBlockInCritSection(
-    SymbolRef BlockDescSym, const CallEvent &Call, CheckerContext &C) const {
-  ExplodedNode *ErrNode = C.generateNonFatalErrorNode();
+    const CallEvent &Call, CheckerContext &C) const {
+  ExplodedNode *ErrNode = C.generateNonFatalErrorNode(C.getState());
   if (!ErrNode)
     return;
 
@@ -147,14 +322,63 @@ void BlockInCriticalSectionChecker::reportBlockInCritSection(
   auto R = std::make_unique<PathSensitiveBugReport>(BlockInCritSectionBugType,
                                                     os.str(), ErrNode);
   R->addRange(Call.getSourceRange());
-  R->markInteresting(BlockDescSym);
+  R->markInteresting(Call.getReturnValue());
   C.emitReport(std::move(R));
 }
 
+const NoteTag *
+BlockInCriticalSectionChecker::createCritSectionNote(CritSectionMarker M,
+                                                     CheckerContext &C) const {
+  const BugType *BT = &this->BlockInCritSectionBugType;
+  return C.getNoteTag([M, BT](PathSensitiveBugReport &BR,
+                              llvm::raw_ostream &OS) {
+    if (&BR.getBugType() != BT)
+      return;
+
+    // Get the lock events for the mutex of the current line's lock event.
+    const auto CritSectionBegins =
+        BR.getErrorNode()->getState()->get<ActiveCritSections>();
+    llvm::SmallVector<CritSectionMarker, 4> LocksForMutex;
+    llvm::copy_if(
+        CritSectionBegins, std::back_inserter(LocksForMutex),
+        [M](const auto &Marker) { return Marker.LockReg == M.LockReg; });
+    if (LocksForMutex.empty())
+      return;
+
+    // As the ImmutableList builds the locks by prepending them, we
+    // reverse the list to get the correct order.
+    std::reverse(LocksForMutex.begin(), LocksForMutex.end());
+
+    // Find the index of the lock expression in the list of all locks for a
+    // given mutex (in acquisition order).
+    const auto Position =
+        llvm::find_if(std::as_const(LocksForMutex), [M](const auto &Marker) {
+          return Marker.LockExpr == M.LockExpr;
+        });
+    if (Position == LocksForMutex.end())
+      return;
+
+    // If there is only one lock event, we don't need to specify how many times
+    // the critical section was entered.
+    if (LocksForMutex.size() == 1) {
+      OS << "Entering critical section here";
+      return;
+    }
+
+    const auto IndexOfLock =
+        std::distance(std::as_const(LocksForMutex).begin(), Position);
+
+    const auto OrdinalOfLock = IndexOfLock + 1;
+    OS << "Entering critical section for the " << OrdinalOfLock
+       << llvm::getOrdinalSuffix(OrdinalOfLock) << " time here";
+  });
+}
+
 void ento::registerBlockInCriticalSectionChecker(CheckerManager &mgr) {
   mgr.registerChecker<BlockInCriticalSectionChecker>();
 }
 
-bool ento::shouldRegisterBlockInCriticalSectionChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterBlockInCriticalSectionChecker(
+    const CheckerManager &mgr) {
   return true;
 }
diff --git a/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp b/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
index a80352816be61..7042f1aeb803f 100644
--- a/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
+++ b/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
@@ -190,6 +190,17 @@ static bool isCapturedByReference(ExplodedNode *N, const DeclRefExpr *DR) {
   return FD->getType()->isReferenceType();
 }
 
+static bool isFoundInStmt(const Stmt *S, const VarDecl *VD) {
+  if (const DeclStmt *DS = dyn_cast<DeclStmt>(S)) {
+    for (const Decl *D : DS->decls()) {
+      // Once we reach the declaration of the VD we can return.
+      if (D->getCanonicalDecl() == VD)
+        return true;
+    }
+  }
+  return false;
+}
+
 // A loop counter is considered escaped if:
 // case 1: It is a global variable.
 // case 2: It is a reference parameter or a reference capture.
@@ -219,13 +230,19 @@ static bool isPossiblyEscaped(ExplodedNode *N, const DeclRefExpr *DR) {
       continue;
     }
 
-    if (const DeclStmt *DS = dyn_cast<DeclStmt>(S)) {
-      for (const Decl *D : DS->decls()) {
-        // Once we reach the declaration of the VD we can return.
-        if (D->getCanonicalDecl() == VD)
-          return false;
+    if (isFoundInStmt(S, VD)) {
+      return false;
+    }
+
+    if (const auto *SS = dyn_cast<SwitchStmt>(S)) {
+      if (const auto *CST = dyn_cast<CompoundStmt>(SS->getBody())) {
+        for (const Stmt *CB : CST->body()) {
+          if (isFoundInStmt(CB, VD))
+            return false;
+        }
       }
     }
+
     // Check the usage of the pass-by-ref function calls and adress-of operator
     // on VD and reference initialized by VD.
     ASTContext &ASTCtx =
diff --git a/clang/lib/Tooling/Inclusions/Stdlib/CSpecialSymbolMap.inc b/clang/lib/Tooling/Inclusions/Stdlib/CSpecialSymbolMap.inc
new file mode 100644
index 0000000000000..a515f69ea6a8c
--- /dev/null
+++ b/clang/lib/Tooling/Inclusions/Stdlib/CSpecialSymbolMap.inc
@@ -0,0 +1,14 @@
+//===-- StdSpecialSymbolMap.inc -----------------------------------*- C -*-===//
+//
+// This is a hand-curated list for C symbols that cannot be parsed/extracted
+// via the include-mapping tool (gen_std.py).
+//
+//===----------------------------------------------------------------------===//
+
+SYMBOL(size_t, None, <stddef.h>)
+SYMBOL(size_t, None, <stdio.h>)
+SYMBOL(size_t, None, <stdlib.h>)
+SYMBOL(size_t, None, <string.h>)
+SYMBOL(size_t, None, <time.h>)
+SYMBOL(size_t, None, <uchar.h>)
+SYMBOL(size_t, None, <wchar.h>)
diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp
index adf1b230ff031..0832bcf66145f 100644
--- a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp
+++ b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp
@@ -55,11 +55,12 @@ static const SymbolHeaderMapping *getMappingPerLang(Lang L) {
 }
 
 static int countSymbols(Lang Language) {
-  ArrayRef<const char*> Symbols;
+  ArrayRef<const char *> Symbols;
 #define SYMBOL(Name, NS, Header) #NS #Name,
   switch (Language) {
   case Lang::C: {
     static constexpr const char *CSymbols[] = {
+#include "CSpecialSymbolMap.inc"
 #include "CSymbolMap.inc"
     };
     Symbols = CSymbols;
@@ -147,6 +148,7 @@ static int initialize(Lang Language) {
   switch (Language) {
   case Lang::C: {
     static constexpr Symbol CSymbols[] = {
+#include "CSpecialSymbolMap.inc"
 #include "CSymbolMap.inc"
     };
     for (const Symbol &S : CSymbols)
diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp
index 4b112d7fdddfd..2443992f75fb7 100644
--- a/clang/test/AST/Interp/arrays.cpp
+++ b/clang/test/AST/Interp/arrays.cpp
@@ -429,18 +429,13 @@ namespace Incomplete {
                           // both-note {{array-to-pointer decay of array member without known bound}}
 
   /// These are from test/SemaCXX/constant-expression-cxx11.cpp
-  /// and are the only tests using the 'indexing of array without known bound' diagnostic.
-  /// We currently diagnose them differently.
-  extern int arr[]; // expected-note 3{{declared here}}
+  extern int arr[];
   constexpr int *c = &arr[1]; // both-error  {{must be initialized by a constant expression}} \
-                              // ref-note {{indexing of array without known bound}} \
-                              // expected-note {{read of non-constexpr variable 'arr'}}
+                              // both-note {{indexing of array without known bound}}
   constexpr int *d = &arr[1]; // both-error  {{must be initialized by a constant expression}} \
-                              // ref-note {{indexing of array without known bound}} \
-                              // expected-note {{read of non-constexpr variable 'arr'}}
+                              // both-note {{indexing of array without known bound}}
   constexpr int *e = arr + 1; // both-error  {{must be initialized by a constant expression}} \
-                              // ref-note {{indexing of array without known bound}} \
-                              // expected-note {{read of non-constexpr variable 'arr'}}
+                              // both-note {{indexing of array without known bound}}
 }
 
 namespace GH69115 {
diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp
index 08fca8428cf5e..2e9d1a831dcf6 100644
--- a/clang/test/AST/Interp/builtin-functions.cpp
+++ b/clang/test/AST/Interp/builtin-functions.cpp
@@ -510,3 +510,14 @@ namespace bswap {
   int h4 = __builtin_bswap32(0x1234) == 0x34120000 ? 1 : f();
   int h5 = __builtin_bswap64(0x1234) == 0x3412000000000000 ? 1 : f();
 }
+
+#define CFSTR __builtin___CFStringMakeConstantString
+void test7(void) {
+  const void *X;
+#if !defined(_AIX)
+  X = CFSTR("\242"); // both-warning {{input conversion stopped}}
+  X = CFSTR("\0"); // no-warning
+  X = CFSTR(242); // both-error {{cannot initialize a parameter of type 'const char *' with an rvalue of type 'int'}}
+  X = CFSTR("foo", "bar"); // both-error {{too many arguments to function call}}
+#endif
+}
diff --git a/clang/test/AST/Interp/builtins.cpp b/clang/test/AST/Interp/builtins.cpp
index 06a22b16b2dcb..9095d1bf8d6a3 100644
--- a/clang/test/AST/Interp/builtins.cpp
+++ b/clang/test/AST/Interp/builtins.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -verify -fms-extensions
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -fms-extensions -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -Wno-constant-evaluated -verify -fms-extensions
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -Wno-constant-evaluated -fms-extensions -S -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -verify=ref %s -Wno-constant-evaluated -fms-extensions
 // RUN: %clang_cc1 -verify=ref %s -Wno-constant-evaluated %s -fms-extensions -S -emit-llvm -o - | FileCheck %s
 
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index 8de6139efbea0..10e23839f2ba2 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -66,11 +66,11 @@ _Static_assert((&a - 100) != 0, ""); // pedantic-ref-warning {{is a GNU extensio
                                      // pedantic-ref-note {{-100 of non-array}} \
                                      // pedantic-expected-note {{-100 of non-array}}
 /// extern variable of a composite type.
-/// FIXME: The 'cast from void*' note is missing in the new interpreter.
+/// FIXME: The 'this conversion is not allowed' note is missing in the new interpreter.
 extern struct Test50S Test50;
 _Static_assert(&Test50 != (void*)0, ""); // all-warning {{always true}} \
                                          // pedantic-ref-warning {{is a GNU extension}} \
-                                         // pedantic-ref-note {{cast from 'void *' is not allowed}} \
+                                         // pedantic-ref-note {{this conversion is not allowed in a constant expression}} \
                                          // pedantic-expected-warning {{is a GNU extension}}
 
 struct y {int x,y;};
@@ -201,3 +201,11 @@ void localCompoundLiteral(void) {
        // pedantic-ref-warning {{use of an empty initializer}}
   };
 }
+
+/// struct copy
+struct StrA {int a; };
+const struct StrA sa = { 12 };
+const struct StrA * const sb = &sa;
+const struct StrA sc = *sb;
+_Static_assert(sc.a == 12, ""); // pedantic-ref-warning {{GNU extension}} \
+                                // pedantic-expected-warning {{GNU extension}}
diff --git a/clang/test/AST/Interp/c23.c b/clang/test/AST/Interp/c23.c
new file mode 100644
index 0000000000000..cf1bf4d4e7d90
--- /dev/null
+++ b/clang/test/AST/Interp/c23.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -std=c23 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -std=c23 -verify=ref,both %s
+
+
+
+const _Bool inf1 =  (1.0/0.0 == __builtin_inf());
+constexpr _Bool inf2 = (1.0/0.0 == __builtin_inf()); // both-error {{must be initialized by a constant expression}} \
+                                                     // both-note {{division by zero}}
+constexpr _Bool inf3 = __builtin_inf() == __builtin_inf();
+
+
diff --git a/clang/test/AST/Interp/complex.c b/clang/test/AST/Interp/complex.c
index b5f30b87baa79..a39f83160ef8c 100644
--- a/clang/test/AST/Interp/complex.c
+++ b/clang/test/AST/Interp/complex.c
@@ -19,3 +19,12 @@ const _Complex float FC = {0.0f, 0.0f};
 _Static_assert(!FC, "");
 const _Complex float FI = {0, 0};
 _Static_assert(!FI, "");
+
+
+/// Make sure we're stripping the _Atomic part from the
+/// complex type.
+void testComplexFloat(_Atomic(_Complex float) *fp) {
+  _Atomic(_Complex float) x = 2.0f;
+  _Complex float f = *fp;
+  *fp = f;
+}
diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp
index 6a42afc68d26c..09cb620d7b7c3 100644
--- a/clang/test/AST/Interp/complex.cpp
+++ b/clang/test/AST/Interp/complex.cpp
@@ -9,6 +9,9 @@ static_assert(&__imag z1 == &__real z1 + 1, "");
 static_assert((*(&__imag z1)) == __imag z1, "");
 static_assert((*(&__real z1)) == __real z1, "");
 
+constexpr _Complex int Comma1 = {1, 2};
+constexpr _Complex int Comma2 = (0, Comma1);
+static_assert(Comma1 == Comma1, "");
 
 constexpr double setter() {
   _Complex float d = {1.0, 2.0};
@@ -313,3 +316,53 @@ namespace Cmp {
   static_assert((0.0 + 0.0j) > (0.0 + 0.0j)); // both-error {{invalid operands to binary expression}}
   static_assert((0.0 + 0.0j) ^ (0.0 + 0.0j)); // both-error {{invalid operands to binary expression}}
 }
+
+/// From test/SemaCXX/constant-expression-cxx11.cpp
+///
+/// Some of the diagnostics we emit are different than the one of the
+/// current interpreter.
+///
+/// FIXME: For the '&test3 + 1' test, we are _not_ creating an explicit pointer variable
+/// anywhere and so the &test3+1 is the same as __imag(test3) for us.
+namespace ComplexConstexpr {
+  constexpr _Complex float test1 = {};
+  constexpr _Complex float test2 = {1};
+  constexpr _Complex double test3 = {1,2};
+  constexpr _Complex int test4 = {4};
+  constexpr _Complex int test5 = 4;
+  constexpr _Complex int test6 = {5,6};
+  typedef _Complex float fcomplex;
+  constexpr fcomplex test7 = fcomplex();
+
+  constexpr const double &t2r = __real test3;
+  constexpr const double &t2i = __imag test3;
+  static_assert(&t2r + 1 == &t2i, "");
+  static_assert(t2r == 1.0, "");
+  static_assert(t2i == 2.0, "");
+  constexpr const double *t2p = &t2r;
+  static_assert(t2p[-1] == 0.0, ""); // both-error {{constant expr}} \
+                                     // both-note {{cannot refer to element -1 of array of 2 elements}}
+  static_assert(t2p[0] == 1.0, "");
+  static_assert(t2p[1] == 2.0, "");
+  static_assert(t2p[2] == 0.0, ""); // both-error {{constant expr}} \
+                                    // both-note {{one-past-the-end pointer}}
+  static_assert(t2p[3] == 0.0, ""); // both-error {{constant expr}} \
+                                    // both-note {{cannot refer to element 3 of array of 2 elements}}
+  constexpr _Complex float *p = 0;
+  constexpr float pr = __real *p; // both-error {{constant expr}} \
+                                  // ref-note {{cannot access real component of null}} \
+                                  // expected-note {{read of dereferenced null pointer}}
+  constexpr float pi = __imag *p; // both-error {{constant expr}} \
+                                  // ref-note {{cannot access imaginary component of null}} \
+                                  // expected-note {{cannot perform pointer arithmetic on null pointer}}
+  constexpr const _Complex double *q = &test3 + 1;
+  constexpr double qr = __real *q; // ref-error {{constant expr}} \
+                                   // ref-note {{cannot access real component of pointer past the end}}
+  constexpr double qi = __imag *q; // both-error {{constant expr}} \
+                                   // ref-note {{cannot access imaginary component of pointer past the end}} \
+                                   // expected-note {{read of dereferenced one-past-the-end pointer}}
+
+  static_assert(__real test6 == 5, "");
+  static_assert(__imag test6 == 6, "");
+  static_assert(&__imag test6 == &__real test6 + 1, "");
+}
diff --git a/clang/test/AST/Interp/cxx23.cpp b/clang/test/AST/Interp/cxx23.cpp
index 127b58915127c..042e29613aa75 100644
--- a/clang/test/AST/Interp/cxx23.cpp
+++ b/clang/test/AST/Interp/cxx23.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref20,all,all-20 %s
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref20,all,all20 %s
 // RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=ref23,all %s
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=expected20,all,all-20 %s -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=expected20,all,all20 %s -fexperimental-new-constant-interpreter
 // RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=expected23,all %s -fexperimental-new-constant-interpreter
 
 /// FIXME: The new interpreter is missing all the 'control flows through...' diagnostics.
@@ -108,9 +108,9 @@ namespace StaticOperators {
   static_assert(f2() == 3);
 
   struct S1 {
-    constexpr S1() { // all-20-error {{never produces a constant expression}}
+    constexpr S1() { // all20-error {{never produces a constant expression}}
       throw; // all-note {{not valid in a constant expression}} \
-             // all-20-note {{not valid in a constant expression}}
+             // all20-note {{not valid in a constant expression}}
     }
     static constexpr int operator()() { return 3; } // ref20-warning {{C++23 extension}} \
                                                     // expected20-warning {{C++23 extension}}
@@ -121,3 +121,28 @@ namespace StaticOperators {
 
 
 }
+
+int test_in_lambdas() {
+  auto c = [](int n) constexpr {
+    if (n == 0)
+      return 0;
+    else
+      goto test; // all-note {{subexpression not valid in a constant expression}} \
+                 // all20-warning {{use of this statement in a constexpr function is a C++23 extension}}
+  test:
+    return 1;
+  };
+  c(0);
+  constexpr auto A = c(1); // all-error {{must be initialized by a constant expression}} \
+                           // all-note {{in call to}}
+  return 0;
+}
+
+/// PackIndexExpr.
+template <auto... p>
+struct check_ice {
+    enum e {
+        x = p...[0] // all-warning {{is a C++2c extension}}
+    };
+};
+static_assert(check_ice<42>::x == 42);
diff --git a/clang/test/AST/Interp/cxx98.cpp b/clang/test/AST/Interp/cxx98.cpp
index 73e4537206633..ba6bcd97d920d 100644
--- a/clang/test/AST/Interp/cxx98.cpp
+++ b/clang/test/AST/Interp/cxx98.cpp
@@ -18,13 +18,12 @@ template struct C<cval>;
 
 /// FIXME: This example does not get properly diagnosed in the new interpreter.
 extern const int recurse1;
-const int recurse2 = recurse1; // both-note {{declared here}}
+const int recurse2 = recurse1; // ref-note {{declared here}}
 const int recurse1 = 1;
 int array1[recurse1];
 int array2[recurse2]; // ref-warning 2{{variable length array}} \
                       // ref-note {{initializer of 'recurse2' is not a constant expression}} \
                       // expected-warning {{variable length array}} \
-                      // expected-note {{read of non-const variable 'recurse2'}} \
                       // expected-error {{variable length array}}
 
 int NCI; // both-note {{declared here}}
@@ -39,3 +38,10 @@ struct V {
                          // both-error {{constructor cannot have a return type}}
 };
 _Static_assert(V().c[0], ""); // both-error {{is not an integral constant expression}}
+
+struct C0 {
+  template<typename U> static U Data; // both-warning {{C++14 extension}}
+  template<typename U> static const U Data<U*> = U();
+};
+const int c0_test = C0::Data<int*>;
+_Static_assert(c0_test == 0, "");
diff --git a/clang/test/AST/Interp/if.cpp b/clang/test/AST/Interp/if.cpp
index 86ae8de6f73eb..37289d69d3255 100644
--- a/clang/test/AST/Interp/if.cpp
+++ b/clang/test/AST/Interp/if.cpp
@@ -1,8 +1,5 @@
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fexperimental-new-constant-interpreter %s -verify
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only %s -verify=ref
-
-// expected-no-diagnostics
-// ref-no-diagnostics
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fexperimental-new-constant-interpreter %s -verify=expected,both
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only %s -verify=ref,both
 
 namespace ConstEval {
   constexpr int f() {
@@ -51,3 +48,13 @@ namespace InitDecl {
   }
   static_assert(attrs() == 1, "");
 };
+
+/// The faulty if statement creates a RecoveryExpr with contains-errors,
+/// but the execution will never reach that.
+constexpr char g(char const (&x)[2]) {
+    return 'x';
+  if (auto [a, b] = x) // both-error {{an array type is not allowed here}} \
+                       // both-warning {{ISO C++17 does not permit structured binding declaration in a condition}}
+    ;
+}
+static_assert(g("x") == 'x');
diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp
index 7ae8499b1156d..277438d2e6311 100644
--- a/clang/test/AST/Interp/literals.cpp
+++ b/clang/test/AST/Interp/literals.cpp
@@ -910,6 +910,18 @@ namespace TypeTraits {
   struct U {};
   static_assert(S3<U>{}.foo(), "");
   static_assert(!S3<T>{}.foo(), "");
+
+  typedef int Int;
+  typedef Int IntAr[10];
+  typedef const IntAr ConstIntAr;
+  typedef ConstIntAr ConstIntArAr[4];
+
+  static_assert(__array_rank(IntAr) == 1, "");
+  static_assert(__array_rank(ConstIntArAr) == 2, "");
+
+  static_assert(__array_extent(IntAr, 0) == 10, "");
+  static_assert(__array_extent(ConstIntArAr, 0) == 4, "");
+  static_assert(__array_extent(ConstIntArAr, 1) == 10, "");
 }
 
 #if __cplusplus >= 201402L
@@ -1113,6 +1125,9 @@ namespace InvalidDeclRefs {
   int b03 = 3; // both-note {{declared here}}
   static_assert(b03, ""); // both-error {{not an integral constant expression}} \
                           // both-note {{read of non-const variable}}
+
+  extern int var;
+  constexpr int *varp = &var; // Ok.
 }
 
 namespace NonConstReads {
@@ -1182,8 +1197,16 @@ namespace incdecbool {
 }
 
 #if __cplusplus >= 201402L
+/// NOTE: The diagnostics of the two interpreters are a little
+/// different here, but they both make sense.
 constexpr int externvar1() { // both-error {{never produces a constant expression}}
-  extern char arr[]; // both-note {{declared here}}
-  return arr[0]; // both-note {{read of non-constexpr variable 'arr'}}
+  extern char arr[]; // ref-note {{declared here}}
+   return arr[0]; // ref-note {{read of non-constexpr variable 'arr'}} \
+                  // expected-note {{array-to-pointer decay of array member without known bound is not supported}}
 }
 #endif
+
+namespace Extern {
+  constexpr extern char Oops = 1;
+  static_assert(Oops == 1, "");
+}
diff --git a/clang/test/AST/Interp/ms.cpp b/clang/test/AST/Interp/ms.cpp
new file mode 100644
index 0000000000000..99716e90c7a1d
--- /dev/null
+++ b/clang/test/AST/Interp/ms.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -verify=ref,both %s -fms-extensions
+// RUN: %clang_cc1 -verify=expected,both %s -fexperimental-new-constant-interpreter -fms-extensions
+
+// ref-no-diagnostics
+// expected-no-diagnostics
+
+/// Used to assert because the two parameters to _rotl do not have the same type.
+static_assert(_rotl(0x01, 5) == 32);
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 7ddc56c9b5dfc..0f76e0cfe9927 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1238,3 +1238,50 @@ namespace InvalidCtorInitializer {
   // no crash on evaluating the constexpr ctor.
   constexpr int Z = X().Y; // both-error {{constexpr variable 'Z' must be initialized by a constant expression}}
 }
+
+extern int f(); // both-note {{here}}
+struct HasNonConstExprMemInit {
+  int x = f(); // both-note {{non-constexpr function}}
+  constexpr HasNonConstExprMemInit() {} // both-error {{never produces a constant expression}}
+};
+
+namespace {
+  template <class Tp, Tp v>
+  struct integral_constant {
+    static const Tp value = v;
+  };
+
+  template <class Tp, Tp v>
+  const Tp integral_constant<Tp, v>::value;
+
+  typedef integral_constant<bool, true> true_type;
+  typedef integral_constant<bool, false> false_type;
+
+  /// This might look innocent, but we get an evaluateAsInitializer call for the
+  /// static bool member before evaluating the first static_assert, but we do NOT
+  /// get such a call for the second one. So the second one needs to lazily visit
+  /// the data member itself.
+  static_assert(true_type::value, "");
+  static_assert(true_type::value, "");
+}
+
+#if __cplusplus >= 202002L
+namespace {
+  /// Used to crash because the CXXDefaultInitExpr is of compound type.
+  struct A {
+    int &x;
+    constexpr ~A() { --x; }
+  };
+  struct B {
+    int &x;
+    const A &a = A{x};
+  };
+  constexpr int a() {
+    int x = 1;
+    int f = B{x}.x;
+    B{x}; // both-warning {{expression result unused}}
+
+    return 1;
+  }
+}
+#endif
diff --git a/clang/test/AST/ast-dump-concepts.cpp b/clang/test/AST/ast-dump-concepts.cpp
index 06518a71987a2..5bb174e3548ed 100644
--- a/clang/test/AST/ast-dump-concepts.cpp
+++ b/clang/test/AST/ast-dump-concepts.cpp
@@ -57,3 +57,53 @@ struct Foo {
   template <variadic_concept<int>... Ts>
   Foo();
 };
+
+namespace GH82628 {
+namespace ns {
+
+template <typename T>
+concept C = true;
+
+} // namespace ns
+
+using ns::C;
+
+// CHECK:     ConceptDecl {{.*}} Foo
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 T
+// CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} UsingShadow {{.*}} 'C'
+template <typename T>
+concept Foo = C<T>;
+
+// CHECK: TemplateTypeParmDecl {{.*}} Concept {{.*}} 'C' (UsingShadow {{.*}} 'C')
+template <C T>
+constexpr bool FooVar = false;
+
+// CHECK: ConceptSpecializationExpr {{.*}} UsingShadow {{.*}} 'C'
+template <typename T> requires C<T>
+constexpr bool FooVar2 = true;
+
+// CHECK: SimpleRequirement
+// CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} UsingShadow {{.*}} 'C'
+template <typename T> requires requires (T) { C<T>; }
+constexpr bool FooVar3 = true;
+
+// CHECK: NonTypeTemplateParmDecl
+// CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} UsingShadow {{.*}} 'C'
+template <C auto T>
+constexpr bool FooVar4 = bool(T());
+
+// CHECK: FunctionTemplateDecl
+// CHECK-NEXT: |-TemplateTypeParmDecl {{.*}} Concept {{.*}} 'C' (UsingShadow {{.*}} 'C') depth 0 index 0 ... T
+// CHECK: NonTypeTemplateParmDecl {{.*}} depth 0 index 1 U
+// CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} UsingShadow {{.*}} 'C'
+// CHECK: |-TemplateTypeParmDecl {{.*}} Concept {{.*}} 'C' (UsingShadow {{.*}} 'C') depth 0 index 2 V:auto
+
+template <C... T, C auto U>
+auto FooFunc(C auto V) -> C decltype(auto) {
+  // FIXME: TypeLocs inside of the function body cannot be dumped via -ast-dump for now.
+  // See clang-tools-extra/clangd/unittests/SelectionTests.cpp:SelectionTest.UsingConcepts for their checkings.
+  C auto W = V;
+  return W;
+}
+
+}
diff --git a/clang/test/Analysis/block-in-critical-section.cpp b/clang/test/Analysis/block-in-critical-section.cpp
index fcf6188fc033e..87c26b9f1b520 100644
--- a/clang/test/Analysis/block-in-critical-section.cpp
+++ b/clang/test/Analysis/block-in-critical-section.cpp
@@ -1,4 +1,8 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.unix.BlockInCriticalSection -std=c++11 -verify %s
+// RUN: %clang_analyze_cc1 \
+// RUN:   -analyzer-checker=alpha.unix.BlockInCriticalSection \
+// RUN:   -std=c++11 \
+// RUN:   -analyzer-output text \
+// RUN:   -verify %s
 
 void sleep(int x) {}
 
@@ -21,108 +25,242 @@ template<typename T>
 struct not_real_lock {
   not_real_lock<T>(std::mutex) {}
 };
-}
+} // namespace std
+
+struct FILE;
+int getc(FILE *stream);
+char* fgets(char *str, FILE *stream);
+using ssize_t = long long;
+using size_t = unsigned long long;
+ssize_t read(int fd, void *buf, size_t count);
+ssize_t recv(int sockfd, void *buf, size_t len, int flags);
 
-void getc() {}
-void fgets() {}
-void read() {}
-void recv() {}
+struct pthread_mutex_t;
+void pthread_mutex_lock(pthread_mutex_t *mutex);
+void pthread_mutex_trylock(pthread_mutex_t *mutex);
+void pthread_mutex_unlock(pthread_mutex_t *mutex);
 
-void pthread_mutex_lock() {}
-void pthread_mutex_trylock() {}
-void pthread_mutex_unlock() {}
+struct mtx_t;
+void mtx_lock(mtx_t *mutex);
+void mtx_timedlock(mtx_t *mutex);
+void mtx_trylock(mtx_t *mutex);
+void mtx_unlock(mtx_t *mutex);
 
-void mtx_lock() {}
-void mtx_timedlock() {}
-void mtx_trylock() {}
-void mtx_unlock() {}
+// global params for dummy function calls
+FILE *stream;
+char *str;
+int fd;
+void *buf;
+size_t count;
+int sockfd;
+size_t len;
+int flags;
 
 void testBlockInCriticalSectionWithStdMutex() {
   std::mutex m;
-  m.lock();
+  m.lock(); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
   m.unlock();
 }
 
-void testBlockInCriticalSectionWithPthreadMutex() {
-  pthread_mutex_lock();
+void testBlockInCriticalSectionWithPthreadMutex(pthread_mutex_t *mutex) {
+  pthread_mutex_lock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  pthread_mutex_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  pthread_mutex_unlock(mutex);
 
-  pthread_mutex_trylock();
+  pthread_mutex_trylock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  pthread_mutex_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  pthread_mutex_unlock(mutex);
 }
 
-void testBlockInCriticalSectionC11Locks() {
-  mtx_lock();
+void testBlockInCriticalSectionC11Locks(mtx_t *mutex) {
+  mtx_lock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  mtx_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  mtx_unlock(mutex);
 
-  mtx_timedlock();
+  mtx_timedlock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  mtx_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  mtx_unlock(mutex);
 
-  mtx_trylock();
+  mtx_trylock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  mtx_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  mtx_unlock(mutex);
+}
+
+void testMultipleBlockingCalls() {
+  std::mutex m;
+  m.lock(); // expected-note 1{{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  sleep(2); // no-warning
 }
 
-void testBlockInCriticalSectionWithNestedMutexes() {
+void testMultipleMutexesMultipleBlockingCalls() {
   std::mutex m, n, k;
-  m.lock();
-  n.lock();
-  k.lock();
-  sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+  m.lock(); // expected-note 2{{Entering critical section here}}
+  n.lock(); // expected-note 2{{Entering critical section here}}
+  k.lock(); // expected-note 1{{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
   k.unlock();
-  sleep(5); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+}
+
+
+void testRecursiveAcquisition() {
+  std::mutex m;
+  m.lock(); // expected-note {{Entering critical section for the 1st time here}}
+  m.lock(); // expected-note {{Entering critical section for the 2nd time here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  m.unlock();
+}
+
+void testRecursiveAcquisitionWithMultipleBlockingCalls() {
+  std::mutex m;
+  m.lock(); // expected-note 1{{Entering critical section for the 1st time here}}
+            // expected-note@-1 {{Entering critical section here}}
+  m.lock(); // expected-note 1{{Entering critical section for the 2nd time here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  // this next 'sleep' call is only in the critical section of the first lock
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+}
+
+void testRecursiveAcquisitionWithMultipleMutexes() {
+  std::mutex m, n;
+  m.lock(); // expected-note 1{{Entering critical section here}}
+  n.lock(); // expected-note 2{{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  // this next 'sleep' call is only in the critical section of mutex 'n'
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  n.unlock();
+}
+
+
+void testNestedMutexes() {
+  std::mutex m, n, k;
+  m.lock(); // expected-note 3{{Entering critical section here}}
+  n.lock(); // expected-note 2{{Entering critical section here}}
+  k.lock(); // expected-note 1{{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  k.unlock();
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
   n.unlock();
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  sleep(4); // no-warning
+}
+
+void testNonOverlappingMutexes() {
+  std::mutex m;
+  m.lock(); // There should be no warning here
+  m.unlock();
+  m.lock(); // expected-note {{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+}
+
+void testMixedMutexLocksWithIntermittentUnlock() {
+  std::mutex m, n, k;
+  m.lock(); // expected-note {{Entering critical section here}}
+  n.lock(); // the problem is not is this lock's critical section
+  n.unlock();
+  k.lock(); // same as for n.lock()
+  k.unlock();
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
   m.unlock();
-  sleep(3); // no-warning
 }
 
 void f() {
   sleep(1000); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+               // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
 }
 
 void testBlockInCriticalSectionInterProcedural() {
   std::mutex m;
-  m.lock();
-  f();
+  m.lock(); // expected-note {{Entering critical section here}}
+  f(); // expected-note {{Calling 'f'}}
   m.unlock();
 }
 
+void unknown_function_that_may_lock(std::mutex &);
 void testBlockInCriticalSectionUnexpectedUnlock() {
   std::mutex m;
+  unknown_function_that_may_lock(m);
   m.unlock();
   sleep(1); // no-warning
-  m.lock();
-  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+  m.lock(); // expected-note {{Entering critical section here}}
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
 }
 
 void testBlockInCriticalSectionLockGuard() {
@@ -130,12 +268,13 @@ void testBlockInCriticalSectionLockGuard() {
   std::not_real_lock<std::mutex> not_real_lock(g_mutex);
   sleep(1); // no-warning
 
-  std::lock_guard<std::mutex> lock(g_mutex);
+  std::lock_guard<std::mutex> lock(g_mutex); // expected-note {{Entering critical section here}}
   sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
 }
 
 void testBlockInCriticalSectionLockGuardNested() {
-  testBlockInCriticalSectionLockGuard();
+  testBlockInCriticalSectionLockGuard(); // expected-note {{Calling 'testBlockInCriticalSectionLockGuard'}}
   sleep(1); // no-warning
 }
 
@@ -144,11 +283,12 @@ void testBlockInCriticalSectionUniqueLock() {
   std::not_real_lock<std::mutex> not_real_lock(g_mutex);
   sleep(1); // no-warning
 
-  std::unique_lock<std::mutex> lock(g_mutex);
+  std::unique_lock<std::mutex> lock(g_mutex); // expected-note {{Entering critical section here}}
   sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
 }
 
 void testBlockInCriticalSectionUniqueLockNested() {
-  testBlockInCriticalSectionUniqueLock();
+  testBlockInCriticalSectionUniqueLock(); // expected-note {{Calling 'testBlockInCriticalSectionUniqueLock'}}
   sleep(1); // no-warning
 }
diff --git a/clang/test/Analysis/eval-predefined-exprs.cpp b/clang/test/Analysis/eval-predefined-exprs.cpp
index 1eec4476a065f..b26091869cd03 100644
--- a/clang/test/Analysis/eval-predefined-exprs.cpp
+++ b/clang/test/Analysis/eval-predefined-exprs.cpp
@@ -52,12 +52,13 @@ void foo() {
 
 struct A {
   A() {
-    clang_analyzer_dump(__func__);
-    clang_analyzer_dump(__FUNCTION__);
-    clang_analyzer_dump(__PRETTY_FUNCTION__);
-    // expected-warning@-3 {{&Element{"A",0 S64b,char}}}
-    // expected-warning@-3 {{&Element{"A",0 S64b,char}}}
-    // expected-warning@-3 {{&Element{"A::A()",0 S64b,char}}}
+    clang_analyzer_dump(__func__);     // expected-warning {{&Element{"A",0 S64b,char}}}
+#ifdef ANALYZER_MS
+    clang_analyzer_dump(__FUNCTION__); // expected-warning {{&Element{"A::A",0 S64b,char}}}
+#else
+    clang_analyzer_dump(__FUNCTION__); // expected-warning {{&Element{"A",0 S64b,char}}}
+#endif
+    clang_analyzer_dump(__PRETTY_FUNCTION__);  // expected-warning {{&Element{"A::A()",0 S64b,char}}}
 
 #ifdef ANALYZER_MS
     clang_analyzer_dump(__FUNCDNAME__);
@@ -71,12 +72,13 @@ struct A {
 #endif
   }
   ~A() {
-    clang_analyzer_dump(__func__);
-    clang_analyzer_dump(__FUNCTION__);
-    clang_analyzer_dump(__PRETTY_FUNCTION__);
-    // expected-warning@-3 {{&Element{"~A",0 S64b,char}}}
-    // expected-warning@-3 {{&Element{"~A",0 S64b,char}}}
-    // expected-warning@-3 {{&Element{"A::~A()",0 S64b,char}}}
+    clang_analyzer_dump(__func__);          // expected-warning {{&Element{"~A",0 S64b,char}}}
+#ifdef ANALYZER_MS
+    clang_analyzer_dump(__FUNCTION__);      // expected-warning {{&Element{"A::~A",0 S64b,char}}}
+#else
+    clang_analyzer_dump(__FUNCTION__);      // expected-warning {{&Element{"~A",0 S64b,char}}}
+#endif
+    clang_analyzer_dump(__PRETTY_FUNCTION__); // expected-warning {{&Element{"A::~A()",0 S64b,char}}}
 
 #ifdef ANALYZER_MS
     clang_analyzer_dump(__FUNCDNAME__);
diff --git a/clang/test/Analysis/loop-unrolling.cpp b/clang/test/Analysis/loop-unrolling.cpp
index fc1fb06cdc014..66a828abfb513 100644
--- a/clang/test/Analysis/loop-unrolling.cpp
+++ b/clang/test/Analysis/loop-unrolling.cpp
@@ -547,3 +547,15 @@ void capture_implicitly_by_ref_as_loop_counter() {
     }
   };
 }
+
+
+void test_escaping_on_var_before_switch_case_no_crash(int c) {
+  // https://github.com/llvm/llvm-project/issues/68819
+  switch (c) {
+    int i; // no-crash: The declaration of `i` is found here.
+    case 0: {
+      for (i = 0; i < 16; i++) {}
+      break;
+    }
+  }
+}
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 0c3c67c6a546a..92f983d8b1561 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -24,6 +24,33 @@ void taintedIndex(void) {
   scanf("%d", &index);
   // expected-note@-1 {{Taint originated here}}
   // expected-note@-2 {{Taint propagated to the 2nd argument}}
+  TenElements[index] = 5;
+  // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}}
+  // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be negative or too large}}
+}
+
+void taintedIndexNonneg(void) {
+  int index;
+  scanf("%d", &index);
+  // expected-note@-1 {{Taint originated here}}
+  // expected-note@-2 {{Taint propagated to the 2nd argument}}
+
+  // expected-note@+2 {{Assuming 'index' is >= 0}}
+  // expected-note@+1 {{Taking false branch}}
+  if (index < 0)
+    return;
+
+  TenElements[index] = 5;
+  // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}}
+  // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be too large}}
+}
+
+void taintedIndexUnsigned(void) {
+  unsigned index;
+  scanf("%u", &index);
+  // expected-note@-1 {{Taint originated here}}
+  // expected-note@-2 {{Taint propagated to the 2nd argument}}
+
   TenElements[index] = 5;
   // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}}
   // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be too large}}
@@ -59,7 +86,7 @@ void taintedOffset(void) {
   int *p = TenElements + index;
   p[0] = 5;
   // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted offset}}
-  // expected-note@-2 {{Access of 'TenElements' with a tainted offset that may be too large}}
+  // expected-note@-2 {{Access of 'TenElements' with a tainted offset that may be negative or too large}}
 }
 
 void arrayOverflow(void) {
@@ -109,6 +136,33 @@ int *potentialAfterTheEndPtr(int idx) {
   // &TenElements[idx].
 }
 
+int overflowOrUnderflow(int arg) {
+  // expected-note@+2 {{Assuming 'arg' is < 0}}
+  // expected-note@+1 {{Taking false branch}}
+  if (arg >= 0)
+    return 0;
+
+  return TenElements[arg - 1];
+  // expected-warning@-1 {{Out of bound access to memory around 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at a negative or overflowing index, while it holds only 10 'int' elements}}
+}
+
+char TwoElements[2] = {11, 22};
+char overflowOrUnderflowConcrete(int arg) {
+  // expected-note@#cond {{Assuming 'arg' is < 3}}
+  // expected-note@#cond {{Left side of '||' is false}}
+  // expected-note@#cond {{Assuming 'arg' is not equal to 0}}
+  // expected-note@#cond {{Left side of '||' is false}}
+  // expected-note@#cond {{Assuming 'arg' is not equal to 1}}
+  // expected-note@#cond {{Taking false branch}}
+  if (arg >= 3 || arg == 0 || arg == 1) // #cond
+    return 0;
+
+  return TwoElements[arg];
+  // expected-warning@-1 {{Out of bound access to memory around 'TwoElements'}}
+  // expected-note@-2 {{Access of 'TwoElements' at a negative or overflowing index, while it holds only 2 'char' elements}}
+}
+
 int scalar;
 int scalarOverflow(void) {
   return (&scalar)[1];
diff --git a/clang/test/Analysis/taint-diagnostic-visitor.c b/clang/test/Analysis/taint-diagnostic-visitor.c
index 020e9579ac535..2ba7d9938fc3d 100644
--- a/clang/test/Analysis/taint-diagnostic-visitor.c
+++ b/clang/test/Analysis/taint-diagnostic-visitor.c
@@ -30,7 +30,7 @@ int taintDiagnosticOutOfBound(void) {
   scanf("%d", &index); // expected-note {{Taint originated here}}
                        // expected-note@-1 {{Taint propagated to the 2nd argument}}
   return Array[index]; // expected-warning {{Potential out of bound access to 'Array' with tainted index}}
-                       // expected-note@-1 {{Access of 'Array' with a tainted index that may be too large}}
+                       // expected-note@-1 {{Access of 'Array' with a tainted index}}
 }
 
 int taintDiagnosticDivZero(int operand) {
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-error.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-error.c
index 5f57d7575c859..272e0222dc9e4 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-error.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-error.c
@@ -24,13 +24,16 @@ void test_trap(void) {
   __tw(ia, ib, 0); //expected-error {{argument value 0 is outside the valid range [1, 31]}}
 }
 
+#ifdef __PPC64__
 void test_builtin_ppc_rldimi() {
   unsigned int shift;
   unsigned long long mask;
   unsigned long long res = __builtin_ppc_rldimi(ull, ull, shift, 7); // expected-error {{argument to '__builtin_ppc_rldimi' must be a constant integer}}
   res = __builtin_ppc_rldimi(ull, ull, 63, mask);                    // expected-error {{argument to '__builtin_ppc_rldimi' must be a constant integer}}
   res = __builtin_ppc_rldimi(ull, ull, 63, 0xFFFF000000000F00);      // expected-error {{argument 3 value should represent a contiguous bit field}}
+  res = __builtin_ppc_rldimi(ull, ull, 64, 0xFFFF000000000000);      // expected-error {{argument value 64 is outside the valid range [0, 63]}}
 }
+#endif
 
 void test_builtin_ppc_rlwimi() {
   unsigned int shift;
@@ -83,6 +86,10 @@ void testalignx(const void *pointer, unsigned int alignment) {
 }
 
 #ifndef __PPC64__
+unsigned long long testrldimi32() {
+  return __rldimi(ull, ui, 3, 0x7ffff8ULL); //expected-error {{this builtin is only available on 64-bit targets}}
+}
+
 long long testbpermd(long long bit_selector, long long source) {
   return __bpermd(bit_selector, source); //expected-error {{this builtin is only available on 64-bit targets}}
 }
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
index b218547c00d93..4773d6cb1a0cf 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
@@ -1,8 +1,10 @@
 // REQUIRES: powerpc-registered-target
 // RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu \
-// RUN:   -emit-llvm %s -o - -target-cpu pwr7 | FileCheck %s
+// RUN:   -emit-llvm %s -o - -target-cpu pwr7 | FileCheck %s \
+// RUN:   -check-prefixes=PPC64,CHECK
 // RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu \
-// RUN:   -emit-llvm %s -o - -target-cpu pwr8 | FileCheck %s
+// RUN:   -emit-llvm %s -o - -target-cpu pwr8 | FileCheck %s \
+// RUN:   -check-prefixes=PPC64,CHECK
 // RUN: %clang_cc1 -triple powerpc-unknown-aix \
 // RUN:   -emit-llvm %s -o - -target-cpu pwr7 | FileCheck %s
 // RUN: %clang_cc1 -triple powerpc64-unknown-aix \
@@ -11,18 +13,20 @@
 extern unsigned int ui;
 extern unsigned long long ull;
 
+#ifdef __PPC64__
 void test_builtin_ppc_rldimi() {
-  // CHECK-LABEL: test_builtin_ppc_rldimi
-  // CHECK:       %res = alloca i64, align 8
-  // CHECK-NEXT:  [[RA:%[0-9]+]] = load i64, ptr @ull, align 8
-  // CHECK-NEXT:  [[RB:%[0-9]+]] = load i64, ptr @ull, align 8
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i64 @llvm.ppc.rldimi(i64 [[RA]], i64 [[RB]], i32 63, i64 72057593769492480)
-  // CHECK-NEXT:  store i64 [[RC]], ptr %res, align 8
-  // CHECK-NEXT:  ret void
+  // PPC64-LABEL: test_builtin_ppc_rldimi
+  // PPC64:       %res = alloca i64, align 8
+  // PPC64-NEXT:  [[RA:%[0-9]+]] = load i64, ptr @ull, align 8
+  // PPC64-NEXT:  [[RB:%[0-9]+]] = load i64, ptr @ull, align 8
+  // PPC64-NEXT:  [[RC:%[0-9]+]] = call i64 @llvm.ppc.rldimi(i64 [[RA]], i64 [[RB]], i32 63, i64 72057593769492480)
+  // PPC64-NEXT:  store i64 [[RC]], ptr %res, align 8
+  // PPC64-NEXT:  ret void
 
   /*shift = 63, mask = 0x00FFFFFFF0000000 = 72057593769492480, ~mask = 0xFF0000000FFFFFFF = -72057593769492481*/
   unsigned long long res = __builtin_ppc_rldimi(ull, ull, 63, 0x00FFFFFFF0000000);
 }
+#endif
 
 void test_builtin_ppc_rlwimi() {
   // CHECK-LABEL: test_builtin_ppc_rlwimi
diff --git a/clang/test/CodeGen/aarch64-soft-float-abi-errors.c b/clang/test/CodeGen/aarch64-soft-float-abi-errors.c
new file mode 100644
index 0000000000000..3e5ab9e92a1d8
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-soft-float-abi-errors.c
@@ -0,0 +1,99 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +fp-armv8 -S -target-abi aapcs      -verify=fp-hard %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fp-armv8 -S -target-abi aapcs-soft -verify=nofp-soft %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fp-armv8 -S -target-abi aapcs      -verify=nofp-hard %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fp-armv8 -S -target-abi aapcs -O1  -verify=nofp-hard,nofp-hard-opt -emit-llvm %s
+// No run line needed for soft-float ABI with an FPU because that is rejected by the driver
+
+// With the hard-float ABI and a target with an FPU, FP arguments are passed in
+// FP registers, no diagnostics needed.
+// fp-hard-no-diagnostics
+
+// With the soft-float ABI, FP arguments are passed in integer registers, no
+// diagnostics needed.
+// nofp-soft-no-diagnostics
+
+// With the hard-float ABI but no FPU, FP arguments cannot be passed in an
+// ABI-compatible way, so we report errors for these cases:
+
+struct HFA {
+  float x, y;
+};
+
+struct non_HFA {
+  float x;
+  int y;
+};
+
+// Floating-point arguments are returns are rejected
+void test_fp16_arg(__fp16 a) {}
+// nofp-hard-error@-1 {{'a' requires '__fp16' type support, but ABI 'aapcs' does not support it}}
+__fp16 test_fp16_ret(void) { return 3.141; }
+// nofp-hard-error@-1 {{'test_fp16_ret' requires '__fp16' type support, but ABI 'aapcs' does not support it}}
+void test_float_arg(float a) {}
+// nofp-hard-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}}
+float test_float_ret(void) { return 3.141f; }
+// nofp-hard-error@-1 {{'test_float_ret' requires 'float' type support, but ABI 'aapcs' does not support it}}
+void test_double_arg(double a) {}
+// nofp-hard-error@-1 {{'a' requires 'double' type support, but ABI 'aapcs' does not support it}}
+double test_double_ret(void) { return 3.141; }
+// nofp-hard-error@-1 {{'test_double_ret' requires 'double' type support, but ABI 'aapcs' does not support it}}
+void test_long_double_arg(long double a) {}
+// nofp-hard-error@-1 {{'a' requires 'long double' type support, but ABI 'aapcs' does not support it}}
+long double test_long_double_ret(void) { return 3.141L; }
+// nofp-hard-error@-1 {{'test_long_double_ret' requires 'long double' type support, but ABI 'aapcs' does not support it}}
+
+// HFAs would be passed in floating-point registers, so are rejected.
+void test_hfa_arg(struct HFA a) {}
+// nofp-hard-error@-1 {{'a' requires 'struct HFA' type support, but ABI 'aapcs' does not support it}}
+struct HFA test_hfa_ret(void) { return (struct HFA){}; }
+// nofp-hard-error@-1 {{'test_hfa_ret' requires 'struct HFA' type support, but ABI 'aapcs' does not support it}}
+
+// Note: vector types cannot be created at all for targets without an FPU, so
+// it is not possible to create a function which passes/returns them when using
+// either the default or soft-float ABI. This is tested elsewhere.
+
+// This struct contains a floating-point type, but is not an HFA, so can be
+// passed/returned without affecting the ABI.
+struct non_HFA test_non_hfa_ret(void) { return (struct non_HFA){}; }
+void test_non_hfa_arg(struct non_HFA a) {}
+
+// This inline function does not get code-generated because there is no use of
+// it in this file, so we we don't emit an error for it, matching GCC's
+// behaviour.
+inline void test_float_arg_inline(float a) {}
+
+// This inline function is used, so we emit the error if we generate code for
+// it. The code isn't generated at -O0, so no error is emitted there.
+inline void test_float_arg_inline_used(float a) {}
+// nofp-hard-opt-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}}
+void use_inline() { test_float_arg_inline_used(1.0f); }
+
+// The always_inline attribute causes an inline function to always be
+// code-genned, even at -O0, so we always emit the error.
+__attribute((always_inline))
+inline void test_float_arg_always_inline_used(float a) {}
+// nofp-hard-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}}
+void use_always_inline() { test_float_arg_always_inline_used(1.0f); }
+
+// Floating-point expressions, global variables and local variables do not
+// affect the ABI, so are allowed. GCC does reject some uses of floating point
+// types like this, but it does so after optimisation, which we can't
+// accurately match in clang.
+int test_expr_float(int a) { return a + 1.0f; }
+int test_expr_double(int a) { return a + 1.0; }
+
+float global_float = 2.0f * 3.5f;
+float global_double = 2.0 * 3.5;
+
+int test_var_float(int a) {
+  float f = a;
+  f *= 6.0;
+  return (int)f;
+}
+int test_var_double(int a) {
+  double d = a;
+  d *= 6.0;
+  return (int)d;
+}
diff --git a/clang/test/CodeGen/aarch64-soft-float-abi.c b/clang/test/CodeGen/aarch64-soft-float-abi.c
new file mode 100644
index 0000000000000..143377e218a19
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-soft-float-abi.c
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -triple aarch64 -target-feature +fp-armv8 -target-abi aapcs -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD
+// RUN: %clang_cc1 -triple aarch64 -target-feature -fp-armv8 -target-abi aapcs-soft -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,SOFT
+
+// See also llvm/test/CodeGen/AArch64/soft-float-abi.ll, which checks the LLVM
+// backend parts of the soft-float ABI.
+
+// The va_list type does not change between the ABIs
+// CHECK: %struct.__va_list = type { ptr, ptr, ptr, i32, i32 }
+
+// Floats are passed in integer registers, this will be handled by the backend.
+// CHECK: define dso_local half @test0(half noundef %a)
+// CHECK: define dso_local bfloat @test1(bfloat noundef %a)
+// CHECK: define dso_local float @test2(float noundef %a)
+// CHECK: define dso_local double @test3(double noundef %a)
+// CHECK: define dso_local fp128 @test4(fp128 noundef %a)
+__fp16 test0(__fp16 a) { return a; }
+__bf16 test1(__bf16 a) { return a; }
+float test2(float a) { return a; }
+double test3(double a) { return a; }
+long double test4(long double a) { return a; }
+
+// No types are considered to be HFAs or HVAs by the soft-float PCS, so these
+// are converted to integer types.
+struct A {
+  float x;
+};
+// SOFT: define dso_local i32 @test10(i64 %a.coerce)
+// HARD: define dso_local %struct.A @test10([1 x float] alignstack(8) %a.coerce)
+struct A test10(struct A a) { return a; }
+
+struct B {
+  double x;
+  double y;
+};
+// SOFT: define dso_local [2 x i64] @test11([2 x i64] %a.coerce)
+// HARD: define dso_local %struct.B @test11([2 x double] alignstack(8) %a.coerce)
+struct B test11(struct B a) { return a; }
+
+#include <stdarg.h>
+
+// The layout of the va_list struct is unchanged between the ABIs, but for
+// aapcs-soft, floating-point arguments will be retreived from the GPR save
+// area, as if they were an integer type of the same size.
+// CHECK-LABEL: define dso_local double @test20(i32 noundef %a, ...)
+// CHECK: %vl = alloca %struct.__va_list, align 8
+// SOFT: %gr_offs_p = getelementptr inbounds %struct.__va_list, ptr %vl, i32 0, i32 3
+// SOFT: %reg_top_p = getelementptr inbounds %struct.__va_list, ptr %vl, i32 0, i32 1
+// HARD: %vr_offs_p = getelementptr inbounds %struct.__va_list, ptr %vl, i32 0, i32 4
+// HARD: %reg_top_p = getelementptr inbounds %struct.__va_list, ptr %vl, i32 0, i32 2
+double test20(int a, ...) {
+  va_list vl;
+  va_start(vl, a);
+  return va_arg(vl, double);
+}
+
+// Vector types are only available for targets with the correct hardware, and
+// their calling-convention is left undefined by the soft-float ABI, so they
+// aren't tested here.
diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c
index 276a7b87b7a1b..94095f9aa3e1f 100644
--- a/clang/test/CodeGen/attr-target-clones-aarch64.c
+++ b/clang/test/CodeGen/attr-target-clones-aarch64.c
@@ -23,8 +23,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 
 
 
-
-
 //.
 // CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
 // CHECK: @ftc.ifunc = weak_odr alias i32 (), ptr @ftc
@@ -177,15 +175,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK:       resolver_return:
 // CHECK-NEXT:    ret ptr @ftc_dup2._McrcMdotprod
 // CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 256
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 256
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
 // CHECK-NEXT:    ret ptr @ftc_dup2._Mfp
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @ftc_dup2.default
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
diff --git a/clang/test/CodeGen/builtins-arm64.c b/clang/test/CodeGen/builtins-arm64.c
index 05ea1c719edff..8bd68d9ceb48e 100644
--- a/clang/test/CodeGen/builtins-arm64.c
+++ b/clang/test/CodeGen/builtins-arm64.c
@@ -156,4 +156,10 @@ int rndrrs(uint64_t *__addr) {
   return __builtin_arm_rndrrs(__addr);
 }
 
+// CHECK-LABEL: @trap(
+// CHECK: call void @llvm.aarch64.break(i32 42)
+void trap() {
+  __builtin_arm_trap(42);
+}
+
 // CHECK: ![[M0]] = !{!"1:2:3:4:5"}
diff --git a/clang/test/CodeGen/complex-math-mixed.c b/clang/test/CodeGen/complex-math-mixed.c
new file mode 100644
index 0000000000000..050163cca80aa
--- /dev/null
+++ b/clang/test/CodeGen/complex-math-mixed.c
@@ -0,0 +1,146 @@
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -o - | FileCheck %s --check-prefix=X86
+// RUN: %clang_cc1 %s -O0 -triple x86_64-unknown-unknown -fsyntax-only -ast-dump | FileCheck %s --check-prefix=AST
+
+// Check that for 'F _Complex + int' (F = real floating-point type), we emit an
+// implicit cast from 'int' to 'F', but NOT to 'F _Complex' (i.e. that we do
+// 'F _Complex + F', NOT 'F _Complex + F _Complex'), and likewise for -/*.
+
+// AST-NOT: FloatingRealToComplex
+
+float _Complex add_float_ci(float _Complex a, int b) {
+  // X86-LABEL: @add_float_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fadd float {{.*}}, [[I]]
+  // X86-NOT: fadd
+  return a + b;
+}
+
+float _Complex add_float_ic(int a, float _Complex b) {
+  // X86-LABEL: @add_float_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fadd float [[I]]
+  // X86-NOT: fadd
+  return a + b;
+}
+
+float _Complex sub_float_ci(float _Complex a, int b) {
+  // X86-LABEL: @sub_float_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fsub float {{.*}}, [[I]]
+  // X86-NOT: fsub
+  return a - b;
+}
+
+float _Complex sub_float_ic(int a, float _Complex b) {
+  // X86-LABEL: @sub_float_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fsub float [[I]]
+  // X86: fneg
+  // X86-NOT: fsub
+  return a - b;
+}
+
+float _Complex mul_float_ci(float _Complex a, int b) {
+  // X86-LABEL: @mul_float_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fmul float {{.*}}, [[I]]
+  // X86: fmul float {{.*}}, [[I]]
+  // X86-NOT: fmul
+  return a * b;
+}
+
+float _Complex mul_float_ic(int a, float _Complex b) {
+  // X86-LABEL: @mul_float_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fmul float [[I]]
+  // X86: fmul float [[I]]
+  // X86-NOT: fmul
+  return a * b;
+}
+
+float _Complex div_float_ci(float _Complex a, int b) {
+  // X86-LABEL: @div_float_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fdiv float {{.*}}, [[I]]
+  // X86: fdiv float {{.*}}, [[I]]
+  // X86-NOT: @__divsc3
+  return a / b;
+}
+
+// There is no good way of doing this w/o converting the 'int' to a complex
+// number, so we expect complex division here.
+float _Complex div_float_ic(int a, float _Complex b) {
+  // X86-LABEL: @div_float_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: call {{.*}} @__divsc3(float {{.*}} [[I]], float noundef 0.{{0+}}e+00, float {{.*}}, float {{.*}})
+  return a / b;
+}
+
+double _Complex add_double_ci(double _Complex a, int b) {
+  // X86-LABEL: @add_double_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fadd double {{.*}}, [[I]]
+  // X86-NOT: fadd
+  return a + b;
+}
+
+double _Complex add_double_ic(int a, double _Complex b) {
+  // X86-LABEL: @add_double_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fadd double [[I]]
+  // X86-NOT: fadd
+  return a + b;
+}
+
+double _Complex sub_double_ci(double _Complex a, int b) {
+  // X86-LABEL: @sub_double_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fsub double {{.*}}, [[I]]
+  // X86-NOT: fsub
+  return a - b;
+}
+
+double _Complex sub_double_ic(int a, double _Complex b) {
+  // X86-LABEL: @sub_double_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fsub double [[I]]
+  // X86: fneg
+  // X86-NOT: fsub
+  return a - b;
+}
+
+double _Complex mul_double_ci(double _Complex a, int b) {
+  // X86-LABEL: @mul_double_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fmul double {{.*}}, [[I]]
+  // X86: fmul double {{.*}}, [[I]]
+  // X86-NOT: fmul
+  return a * b;
+}
+
+double _Complex mul_double_ic(int a, double _Complex b) {
+  // X86-LABEL: @mul_double_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fmul double [[I]]
+  // X86: fmul double [[I]]
+  // X86-NOT: fmul
+  return a * b;
+}
+
+double _Complex div_double_ci(double _Complex a, int b) {
+  // X86-LABEL: @div_double_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fdiv double {{.*}}, [[I]]
+  // X86: fdiv double {{.*}}, [[I]]
+  // X86-NOT: @__divdc3
+  return a / b;
+}
+
+// There is no good way of doing this w/o converting the 'int' to a complex
+// number, so we expect complex division here.
+double _Complex div_double_ic(int a, double _Complex b) {
+  // X86-LABEL: @div_double_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: call {{.*}} @__divdc3(double {{.*}} [[I]], double noundef 0.{{0+}}e+00, double {{.*}}, double {{.*}})
+  return a / b;
+}
diff --git a/clang/test/CodeGen/ptrauth-intrinsics.c b/clang/test/CodeGen/ptrauth-intrinsics.c
new file mode 100644
index 0000000000000..17f28dddb3801
--- /dev/null
+++ b/clang/test/CodeGen/ptrauth-intrinsics.c
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-intrinsics -emit-llvm %s  -o - | FileCheck %s
+
+void (*fnptr)(void);
+long int_discriminator;
+void *ptr_discriminator;
+long signature;
+
+// CHECK-LABEL: define void @test_auth()
+void test_auth() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC0:%.*]] = load ptr, ptr @ptr_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[DISC:%.*]] = ptrtoint ptr [[DISC0]] to i64
+  // CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T0]], i32 0, i64 [[DISC]])
+  // CHECK-NEXT: [[RESULT:%.*]] = inttoptr  i64 [[T1]] to ptr
+  // CHECK-NEXT: store ptr [[RESULT]], ptr @fnptr,
+  fnptr = __builtin_ptrauth_auth(fnptr, 0, ptr_discriminator);
+}
+
+// CHECK-LABEL: define void @test_strip()
+void test_strip() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.strip(i64 [[T0]], i32 0)
+  // CHECK-NEXT: [[RESULT:%.*]] = inttoptr  i64 [[T1]] to ptr
+  // CHECK-NEXT: store ptr [[RESULT]], ptr @fnptr,
+  fnptr = __builtin_ptrauth_strip(fnptr, 0);
+}
+
+// CHECK-LABEL: define void @test_sign_unauthenticated()
+void test_sign_unauthenticated() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC0:%.*]] = load ptr, ptr @ptr_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[DISC:%.*]] = ptrtoint ptr [[DISC0]] to i64
+  // CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[T0]], i32 0, i64 [[DISC]])
+  // CHECK-NEXT: [[RESULT:%.*]] = inttoptr  i64 [[T1]] to ptr
+  // CHECK-NEXT: store ptr [[RESULT]], ptr @fnptr,
+  fnptr = __builtin_ptrauth_sign_unauthenticated(fnptr, 0, ptr_discriminator);
+}
+
+// CHECK-LABEL: define void @test_auth_and_resign()
+void test_auth_and_resign() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC0:%.*]] = load ptr, ptr @ptr_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[DISC:%.*]] = ptrtoint ptr [[DISC0]] to i64
+  // CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 0, i64 [[DISC]], i32 3, i64 15)
+  // CHECK-NEXT: [[RESULT:%.*]] = inttoptr  i64 [[T1]] to ptr
+  // CHECK-NEXT: store ptr [[RESULT]], ptr @fnptr,
+  fnptr = __builtin_ptrauth_auth_and_resign(fnptr, 0, ptr_discriminator, 3, 15);
+}
+
+// CHECK-LABEL: define void @test_blend_discriminator()
+void test_blend_discriminator() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC:%.*]] = load i64, ptr @int_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 [[DISC]])
+  // CHECK-NEXT: store i64 [[RESULT]], ptr @int_discriminator,
+  int_discriminator = __builtin_ptrauth_blend_discriminator(fnptr, int_discriminator);
+}
+
+// CHECK-LABEL: define void @test_sign_generic_data()
+void test_sign_generic_data() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC0:%.*]] = load ptr, ptr @ptr_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[DISC:%.*]] = ptrtoint ptr [[DISC0]] to i64
+  // CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.ptrauth.sign.generic(i64 [[T0]], i64 [[DISC]])
+  // CHECK-NEXT: store i64 [[RESULT]], ptr @signature,
+  signature = __builtin_ptrauth_sign_generic_data(fnptr, ptr_discriminator);
+}
diff --git a/clang/test/CodeGen/target-avx-abi-diag.c b/clang/test/CodeGen/target-avx-abi-diag.c
index 84be9e252db5c..dfbbc3213ca6b 100644
--- a/clang/test/CodeGen/target-avx-abi-diag.c
+++ b/clang/test/CodeGen/target-avx-abi-diag.c
@@ -2,13 +2,9 @@
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx -verify=no512 -o - -S
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -verify=both -o - -S
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature +evex512 -verify=both -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -verify=avx512-256 -DAVX512_ERR=1 -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -verify=avx512-256 -DAVX512_ERR=2 -DNOEVEX512 -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -verify=avx512-256 -DAVX512_ERR=3 -DNOEVEX512 -o - -S
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -verify=avx512-256 -DNOEVEX512 -o - -S
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-512 -verify=both -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-256 -verify=avx512-256 -DAVX512_ERR=1 -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-256 -verify=avx512-256 -DAVX512_ERR=2 -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-256 -verify=avx512-256 -DAVX512_ERR=3 -o - -S
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-256 -verify=avx512-256 -o - -S
 // REQUIRES: x86-registered-target
 
 // both-no-diagnostics
@@ -25,7 +21,6 @@ void takesAvx512_no_target(avx512fType t);
 void variadic(int i, ...);
 __attribute__((target("avx512f"))) void variadic_err(int i, ...);
 
-#if !defined(AVX512_ERR) || AVX512_ERR == 1
 // If neither side has an attribute, warn.
 void call_warn(void) {
   avx256Type t1;
@@ -39,9 +34,7 @@ void call_warn(void) {
   // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
   variadic(3, t2); // no512-warning {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
 }
-#endif
 
-#if !defined(AVX512_ERR) || AVX512_ERR == 2
 // If only 1 side has an attribute, error.
 void call_errors(void) {
   avx256Type t1;
@@ -54,16 +47,21 @@ void call_errors(void) {
   // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
   variadic_err(3, t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
 }
-#if defined(__AVX10_1__)
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-#endif
-#endif
 
-#if !defined(AVX512_ERR) || AVX512_ERR == 3
+// Check that these errors are treated as non-fatal, so we can report them for
+// all functions, not just the first.
+void call_errors_2(void) {
+  avx256Type t1;
+  takesAvx256(t1); // no256-error {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}}
+  avx512fType t2;
+  // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
+  takesAvx512(t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
+
+  variadic_err(1, t1); // no256-error {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}}
+  // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
+  variadic_err(3, t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
+}
+
 __attribute__((target("avx"))) void call_avx256_ok(void) {
   avx256Type t;
   takesAvx256(t);
@@ -93,5 +91,11 @@ __attribute__((target("avx512f"))) void call_avx512_ok2(void) {
 // avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
 // avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
 // avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-#endif
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
 #endif
diff --git a/clang/test/CodeGen/volatile.cpp b/clang/test/CodeGen/volatile.cpp
index 38724659ad8a3..70f523b93852e 100644
--- a/clang/test/CodeGen/volatile.cpp
+++ b/clang/test/CodeGen/volatile.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -O2 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o -  | FileCheck %s -check-prefix CHECK
+// RUN: %clang_cc1 -O2 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o -  | FileCheck %s
 struct agg 
 {
 int a ;
@@ -10,34 +10,32 @@ _Complex float cf;
 int volatile vol =10;
 void f0() {
     const_cast<volatile _Complex float &>(cf) = const_cast<volatile _Complex float&>(cf) + 1;
-//  CHECK: %cf.real = load volatile float, ptr @cf
-//  CHECK: %cf.imag = load volatile float, ptr getelementptr
-//  CHECK: %add.r = fadd float %cf.real, 1.000000e+00
-//  CHECK: %add.i = fadd float %cf.imag, 0.000000e+00
-//  CHECK: store volatile float %add.r
-//  CHECK: store volatile float %add.i, ptr getelementptr
+//  CHECK: [[Re1:%.*]] = load volatile float, ptr @cf
+//  CHECK: [[Im1:%.*]] = load volatile float, ptr getelementptr
+//  CHECK: [[Add1:%.*]] = fadd float [[Re1]], 1.000000e+00
+//  CHECK: store volatile float [[Add1]], ptr @cf
+//  CHECK: store volatile float [[Im1]], ptr getelementptr
       static_cast<volatile _Complex float &>(cf) = static_cast<volatile _Complex float&>(cf) + 1;
-//  CHECK: %cf.real1 = load volatile float, ptr @cf
-//  CHECK: %cf.imag2 = load volatile float, ptr getelementptr
-//  CHECK: %add.r3 = fadd float %cf.real1, 1.000000e+00
-//  CHECK: %add.i4 = fadd float %cf.imag2, 0.000000e+00
-//  CHECK: store volatile float %add.r3, ptr @cf
-//  CHECK: store volatile float %add.i4, ptr getelementptr
+//  CHECK: [[Re2:%.*]] = load volatile float, ptr @cf
+//  CHECK: [[Im2:%.*]] = load volatile float, ptr getelementptr
+//  CHECK: [[Add2:%.*]] = fadd float [[Re2]], 1.000000e+00
+//  CHECK: store volatile float [[Add2]], ptr @cf
+//  CHECK: store volatile float [[Im2]], ptr getelementptr
     const_cast<volatile  int  &>(a.a) = const_cast<volatile int &>(t.a) ;
-//  CHECK: %0 = load volatile i32, ptr @t
-//  CHECK: store volatile i32 %0, ptr @a
+//  CHECK: [[I1:%.*]] = load volatile i32, ptr @t
+//  CHECK: store volatile i32 [[I1]], ptr @a
     static_cast<volatile  int  &>(a.b) = static_cast<volatile int  &>(t.a) ;
-//  CHECK: %1 = load volatile i32, ptr @t
-//  CHECK: store volatile i32 %1, ptr getelementptr
+//  CHECK: [[I2:%.*]] = load volatile i32, ptr @t
+//  CHECK: store volatile i32 [[I2]], ptr getelementptr
     const_cast<volatile int&>(vt) = const_cast<volatile int&>(vt) + 1;
-//  CHECK: %2 = load volatile i32, ptr @vt
-//  CHECK: %add = add nsw i32 %2, 1
-//  CHECK: store volatile i32 %add, ptr @vt
+//  CHECK: [[I3:%.*]] = load volatile i32, ptr @vt
+//  CHECK: [[Add3:%.*]] = add nsw i32 [[I3]], 1
+//  CHECK: store volatile i32 [[Add3]], ptr @vt
      static_cast<volatile int&>(vt) = static_cast<volatile int&>(vt) + 1;
-//  CHECK: %3 = load volatile i32, ptr @vt
-//  CHECK: %add5 = add nsw i32 %3, 1
-//  CHECK: store volatile i32 %add5, ptr @vt
+//  CHECK: [[I4:%.*]] = load volatile i32, ptr @vt
+//  CHECK: [[Add4:%.*]] = add nsw i32 [[I4]], 1
+//  CHECK: store volatile i32 [[Add4]], ptr @vt
     vt = const_cast<int&>(vol);
-//  %4 = load i32, ptr @vol
-//  store i32 %4, ptr @vt
+//  [[I5:%.*]] = load i32, ptr @vol
+//  store i32 [[I5]], ptr @vt
 }
diff --git a/clang/test/CodeGenCXX/arm-swiftcall.cpp b/clang/test/CodeGenCXX/arm-swiftcall.cpp
index e60c1482700a4..45eea7bfd853b 100644
--- a/clang/test/CodeGenCXX/arm-swiftcall.cpp
+++ b/clang/test/CodeGenCXX/arm-swiftcall.cpp
@@ -1,5 +1,9 @@
 // RUN: %clang_cc1 -triple armv7-apple-darwin9 -emit-llvm -o - %s -Wno-return-type-c-linkage -std=c++03 | FileCheck %s -check-prefixes=CHECK
 
+// For now just check that the RISC-V triples are accepted, but don't check the IR, as swiftcall is not yet supported.
+// RUN: %clang_cc1 -triple riscv32-unknown-linux-gnu -emit-llvm -o - %s -Wno-return-type-c-linkage -std=c++03
+// RUN: %clang_cc1 -triple riscv64-unknown-linux-gnu -emit-llvm -o - %s -Wno-return-type-c-linkage -std=c++03
+
 // This isn't really testing anything ARM-specific; it's just a convenient
 // 32-bit platform.
 
diff --git a/clang/test/CodeGenCXX/mangle-ms-back-references.cpp b/clang/test/CodeGenCXX/mangle-ms-back-references.cpp
index cb95c100b3d22..b27a9c5acacb7 100644
--- a/clang/test/CodeGenCXX/mangle-ms-back-references.cpp
+++ b/clang/test/CodeGenCXX/mangle-ms-back-references.cpp
@@ -83,3 +83,20 @@ class H;
 
 void ManyParams(T01 &, T02 &, T03 &, T04 &, T05 &, T06 &, T07 &, T08 &, T09 &, T10 &, H<T11> &, H<T11> &) {}
 // CHECK: "?ManyParams@@YAXAAVT01@@AAVT02@@AAVT03@@AAVT04@@AAVT05@@AAVT06@@AAVT07@@AAVT08@@AAVT09@@AAVT10@@AAV?$H@VT11@@@@AAV?$H@VT11@@@@@Z"
+
+namespace NS {
+// The name "TSS0" for the name of the class below has been specifically
+// chosen to ensure that back reference lookup does not match against the
+// implicitly generated "$TSS0" name of the thread safe static initialization
+// variable.
+struct __declspec(dllexport) TSS0 {
+  static TSS0& get();
+  __forceinline static TSS0& singleton() {
+    static TSS0& lsv = get();
+    return lsv;
+  }
+};
+}
+// CHECK: "?singleton@TSS0@NS@@SAAAU12@XZ"
+// CHECK: "?lsv@?1??singleton@TSS0@NS@@SAAAU23@XZ@4AAU23@A"
+// CHECK: "?$TSS0@?1??singleton@TSS0@NS@@SAAAU23@XZ@4HA"
diff --git a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
new file mode 100644
index 0000000000000..e3ef26429e7e4
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// CHECK-LABEL: builtin_test_clamp_int4
+// CHECK: %dx.clamp = call <4 x i32> @llvm.dx.clamp.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+// CHECK: ret <4 x i32> %dx.clamp
+int4 builtin_test_clamp_int4(int4 p0, int4 p1, int4 p2) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p2);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
new file mode 100644
index 0000000000000..029e48ffe2586
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
@@ -0,0 +1,134 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+#ifdef __HLSL_ENABLE_16_BIT
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.dx.clamp.i16(
+int16_t test_clamp_short(int16_t p0, int16_t p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.dx.clamp.v2i16(
+int16_t2 test_clamp_short2(int16_t2 p0, int16_t2 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.dx.clamp.v3i16
+int16_t3 test_clamp_short3(int16_t3 p0, int16_t3 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.dx.clamp.v4i16
+int16_t4 test_clamp_short4(int16_t4 p0, int16_t4 p1) { return clamp(p0, p1,p1); }
+
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.dx.uclamp.i16(
+uint16_t test_clamp_ushort(uint16_t p0, uint16_t p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.dx.uclamp.v2i16
+uint16_t2 test_clamp_ushort2(uint16_t2 p0, uint16_t2 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.dx.uclamp.v3i16
+uint16_t3 test_clamp_ushort3(uint16_t3 p0, uint16_t3 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.dx.uclamp.v4i16
+uint16_t4 test_clamp_ushort4(uint16_t4 p0, uint16_t4 p1) { return clamp(p0, p1,p1); }
+#endif
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.dx.clamp.i32(
+int test_clamp_int(int p0, int p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.dx.clamp.v2i32
+int2 test_clamp_int2(int2 p0, int2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.dx.clamp.v3i32
+int3 test_clamp_int3(int3 p0, int3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.dx.clamp.v4i32
+int4 test_clamp_int4(int4 p0, int4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.dx.uclamp.i32(
+int test_clamp_uint(uint p0, uint p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.dx.uclamp.v2i32
+uint2 test_clamp_uint2(uint2 p0, uint2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.dx.uclamp.v3i32
+uint3 test_clamp_uint3(uint3 p0, uint3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.dx.uclamp.v4i32
+uint4 test_clamp_uint4(uint4 p0, uint4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.dx.clamp.i64(
+int64_t test_clamp_long(int64_t p0, int64_t p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.dx.clamp.v2i64
+int64_t2 test_clamp_long2(int64_t2 p0, int64_t2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.dx.clamp.v3i64
+int64_t3 test_clamp_long3(int64_t3 p0, int64_t3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.dx.clamp.v4i64
+int64_t4 test_clamp_long4(int64_t4 p0, int64_t4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.dx.uclamp.i64(
+uint64_t test_clamp_long(uint64_t p0, uint64_t p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.dx.uclamp.v2i64
+uint64_t2 test_clamp_long2(uint64_t2 p0, uint64_t2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.dx.uclamp.v3i64
+uint64_t3 test_clamp_long3(uint64_t3 p0, uint64_t3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.dx.uclamp.v4i64
+uint64_t4 test_clamp_long4(uint64_t4 p0, uint64_t4 p1) { return clamp(p0, p1,p1); }
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.dx.clamp.f16(
+// NO_HALF: define noundef float @"?test_clamp_half
+// NO_HALF: call float @llvm.dx.clamp.f32(
+half test_clamp_half(half p0, half p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.dx.clamp.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_clamp_half2
+// NO_HALF: call <2 x float> @llvm.dx.clamp.v2f32(
+half2 test_clamp_half2(half2 p0, half2 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.dx.clamp.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_clamp_half3
+// NO_HALF: call <3 x float> @llvm.dx.clamp.v3f32(
+half3 test_clamp_half3(half3 p0, half3 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.dx.clamp.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_clamp_half4
+// NO_HALF: call <4 x float> @llvm.dx.clamp.v4f32(
+half4 test_clamp_half4(half4 p0, half4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef float @"?test_clamp_float
+// CHECK: call float @llvm.dx.clamp.f32(
+float test_clamp_float(float p0, float p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x float> @"?test_clamp_float2
+// CHECK: call <2 x float> @llvm.dx.clamp.v2f32
+float2 test_clamp_float2(float2 p0, float2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x float> @"?test_clamp_float3
+// CHECK: call <3 x float> @llvm.dx.clamp.v3f32
+float3 test_clamp_float3(float3 p0, float3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x float> @"?test_clamp_float4
+// CHECK: call <4 x float> @llvm.dx.clamp.v4f32
+float4 test_clamp_float4(float4 p0, float4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef double @
+// CHECK: call double @llvm.dx.clamp.f64(
+double test_clamp_double(double p0, double p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x double> @
+// CHECK: call <2 x double> @llvm.dx.clamp.v2f64
+double2 test_clamp_double2(double2 p0, double2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x double> @
+// CHECK: call <3 x double> @llvm.dx.clamp.v3f64
+double3 test_clamp_double3(double3 p0, double3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x double> @
+// CHECK: call <4 x double> @llvm.dx.clamp.v4f64
+double4 test_clamp_double4(double4 p0, double4 p1) { return clamp(p0, p1,p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl
index c064d118caf3e..0f993193c00cc 100644
--- a/clang/test/CodeGenHLSL/builtins/dot.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl
@@ -11,15 +11,15 @@
 // NATIVE_HALF: ret i16 %dx.dot
 int16_t test_dot_short(int16_t p0, int16_t p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v2i16(<2 x i16> %0, <2 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.sdot.v2i16(<2 x i16> %0, <2 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 int16_t test_dot_short2(int16_t2 p0, int16_t2 p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v3i16(<3 x i16> %0, <3 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.sdot.v3i16(<3 x i16> %0, <3 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 int16_t test_dot_short3(int16_t3 p0, int16_t3 p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v4i16(<4 x i16> %0, <4 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.sdot.v4i16(<4 x i16> %0, <4 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 int16_t test_dot_short4(int16_t4 p0, int16_t4 p1) { return dot(p0, p1); }
 
@@ -27,15 +27,15 @@ int16_t test_dot_short4(int16_t4 p0, int16_t4 p1) { return dot(p0, p1); }
 // NATIVE_HALF: ret i16 %dx.dot
 uint16_t test_dot_ushort(uint16_t p0, uint16_t p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v2i16(<2 x i16> %0, <2 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.udot.v2i16(<2 x i16> %0, <2 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 uint16_t test_dot_ushort2(uint16_t2 p0, uint16_t2 p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v3i16(<3 x i16> %0, <3 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.udot.v3i16(<3 x i16> %0, <3 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 uint16_t test_dot_ushort3(uint16_t3 p0, uint16_t3 p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v4i16(<4 x i16> %0, <4 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.udot.v4i16(<4 x i16> %0, <4 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 uint16_t test_dot_ushort4(uint16_t4 p0, uint16_t4 p1) { return dot(p0, p1); }
 #endif
@@ -44,15 +44,15 @@ uint16_t test_dot_ushort4(uint16_t4 p0, uint16_t4 p1) { return dot(p0, p1); }
 // CHECK: ret i32 %dx.dot
 int test_dot_int(int p0, int p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v2i32(<2 x i32> %0, <2 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.sdot.v2i32(<2 x i32> %0, <2 x i32> %1)
 // CHECK: ret i32 %dx.dot
 int test_dot_int2(int2 p0, int2 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v3i32(<3 x i32> %0, <3 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.sdot.v3i32(<3 x i32> %0, <3 x i32> %1)
 // CHECK: ret i32 %dx.dot
 int test_dot_int3(int3 p0, int3 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v4i32(<4 x i32> %0, <4 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.sdot.v4i32(<4 x i32> %0, <4 x i32> %1)
 // CHECK: ret i32 %dx.dot
 int test_dot_int4(int4 p0, int4 p1) { return dot(p0, p1); }
 
@@ -60,15 +60,15 @@ int test_dot_int4(int4 p0, int4 p1) { return dot(p0, p1); }
 // CHECK: ret i32 %dx.dot
 uint test_dot_uint(uint p0, uint p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v2i32(<2 x i32> %0, <2 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.udot.v2i32(<2 x i32> %0, <2 x i32> %1)
 // CHECK: ret i32 %dx.dot
 uint test_dot_uint2(uint2 p0, uint2 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v3i32(<3 x i32> %0, <3 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.udot.v3i32(<3 x i32> %0, <3 x i32> %1)
 // CHECK: ret i32 %dx.dot
 uint test_dot_uint3(uint3 p0, uint3 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v4i32(<4 x i32> %0, <4 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.udot.v4i32(<4 x i32> %0, <4 x i32> %1)
 // CHECK: ret i32 %dx.dot
 uint test_dot_uint4(uint4 p0, uint4 p1) { return dot(p0, p1); }
 
@@ -76,15 +76,15 @@ uint test_dot_uint4(uint4 p0, uint4 p1) { return dot(p0, p1); }
 // CHECK: ret i64 %dx.dot
 int64_t test_dot_long(int64_t p0, int64_t p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v2i64(<2 x i64> %0, <2 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.sdot.v2i64(<2 x i64> %0, <2 x i64> %1)
 // CHECK: ret i64 %dx.dot
 int64_t test_dot_long2(int64_t2 p0, int64_t2 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v3i64(<3 x i64> %0, <3 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.sdot.v3i64(<3 x i64> %0, <3 x i64> %1)
 // CHECK: ret i64 %dx.dot
 int64_t test_dot_long3(int64_t3 p0, int64_t3 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v4i64(<4 x i64> %0, <4 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.sdot.v4i64(<4 x i64> %0, <4 x i64> %1)
 // CHECK: ret i64 %dx.dot
 int64_t test_dot_long4(int64_t4 p0, int64_t4 p1) { return dot(p0, p1); }
 
@@ -92,15 +92,15 @@ int64_t test_dot_long4(int64_t4 p0, int64_t4 p1) { return dot(p0, p1); }
 // CHECK: ret i64 %dx.dot
 uint64_t test_dot_ulong(uint64_t p0, uint64_t p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v2i64(<2 x i64> %0, <2 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.udot.v2i64(<2 x i64> %0, <2 x i64> %1)
 // CHECK: ret i64 %dx.dot
 uint64_t test_dot_ulong2(uint64_t2 p0, uint64_t2 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v3i64(<3 x i64> %0, <3 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.udot.v3i64(<3 x i64> %0, <3 x i64> %1)
 // CHECK: ret i64 %dx.dot
 uint64_t test_dot_ulong3(uint64_t3 p0, uint64_t3 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v4i64(<4 x i64> %0, <4 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.udot.v4i64(<4 x i64> %0, <4 x i64> %1)
 // CHECK: ret i64 %dx.dot
 uint64_t test_dot_ulong4(uint64_t4 p0, uint64_t4 p1) { return dot(p0, p1); }
 
diff --git a/clang/test/CodeGenHLSL/builtins/isinf.hlsl b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
new file mode 100644
index 0000000000000..df44fc4a91dfd
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// CHECK: define noundef i1 @
+// NATIVE_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f16(
+// NO_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f32(
+// CHECK: ret i1 %dx.isinf
+bool test_isinf_half(half p0) { return isinf(p0); }
+// CHECK: define noundef <2 x i1> @
+// NATIVE_HALF: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f16
+// NO_HALF: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32(
+// CHECK: ret <2 x i1> %dx.isinf
+bool2 test_isinf_half2(half2 p0) { return isinf(p0); }
+// NATIVE_HALF: define noundef <3 x i1> @
+// NATIVE_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f16
+// NO_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32(
+// CHECK: ret <3 x i1> %dx.isinf
+bool3 test_isinf_half3(half3 p0) { return isinf(p0); }
+// NATIVE_HALF: define noundef <4 x i1> @
+// NATIVE_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f16
+// NO_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32(
+// CHECK: ret <4 x i1> %dx.isinf
+bool4 test_isinf_half4(half4 p0) { return isinf(p0); }
+
+// CHECK: define noundef i1 @
+// CHECK: %dx.isinf = call i1 @llvm.dx.isinf.f32(
+// CHECK: ret i1 %dx.isinf
+bool test_isinf_float(float p0) { return isinf(p0); }
+// CHECK: define noundef <2 x i1> @
+// CHECK: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32
+// CHECK: ret <2 x i1> %dx.isinf
+bool2 test_isinf_float2(float2 p0) { return isinf(p0); }
+// CHECK: define noundef <3 x i1> @
+// CHECK: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32
+// CHECK: ret <3 x i1> %dx.isinf
+bool3 test_isinf_float3(float3 p0) { return isinf(p0); }
+// CHECK: define noundef <4 x i1> @
+// CHECK: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32
+// CHECK: ret <4 x i1> %dx.isinf
+bool4 test_isinf_float4(float4 p0) { return isinf(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
index 1f16dec68212e..2fd5a19fc3352 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
@@ -1,27 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-
-
-// CHECK-LABEL: builtin_lerp_half_scalar
-// CHECK: %3 = fsub double %conv1, %conv
-// CHECK: %4 = fmul double %conv2, %3
-// CHECK: %dx.lerp = fadd double %conv, %4
-// CHECK: %conv3 = fptrunc double %dx.lerp to half
-// CHECK: ret half %conv3
-half builtin_lerp_half_scalar (half p0) {
-  return __builtin_hlsl_lerp ( p0, p0, p0 );
-}
-
-// CHECK-LABEL: builtin_lerp_float_scalar
-// CHECK: %3 = fsub double %conv1, %conv
-// CHECK: %4 = fmul double %conv2, %3
-// CHECK: %dx.lerp = fadd double %conv, %4
-// CHECK: %conv3 = fptrunc double %dx.lerp to float
-// CHECK: ret float %conv3
-float builtin_lerp_float_scalar ( float p0) {
-  return __builtin_hlsl_lerp ( p0, p0, p0 );
-}
-
 // CHECK-LABEL: builtin_lerp_half_vector
 // CHECK: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
 // CHECK: ret <3 x half> %dx.lerp
diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
index a6b3d9643d674..49cd04a10115a 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
@@ -6,13 +6,10 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF: %3 = fsub half %1, %0
-// NATIVE_HALF: %4 = fmul half %2, %3
-// NATIVE_HALF: %dx.lerp = fadd half %0, %4
+
+// NATIVE_HALF: %dx.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
 // NATIVE_HALF: ret half %dx.lerp
-// NO_HALF: %3 = fsub float %1, %0
-// NO_HALF: %4 = fmul float %2, %3
-// NO_HALF: %dx.lerp = fadd float %0, %4
+// NO_HALF: %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
 // NO_HALF: ret float %dx.lerp
 half test_lerp_half(half p0) { return lerp(p0, p0, p0); }
 
@@ -20,37 +17,35 @@ half test_lerp_half(half p0) { return lerp(p0, p0, p0); }
 // NATIVE_HALF: ret <2 x half> %dx.lerp
 // NO_HALF: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
 // NO_HALF: ret <2 x float> %dx.lerp
-half2 test_lerp_half2(half2 p0, half2 p1) { return lerp(p0, p0, p0); }
+half2 test_lerp_half2(half2 p0) { return lerp(p0, p0, p0); }
 
 // NATIVE_HALF: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
 // NATIVE_HALF: ret <3 x half> %dx.lerp
 // NO_HALF: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
 // NO_HALF: ret <3 x float> %dx.lerp
-half3 test_lerp_half3(half3 p0, half3 p1) { return lerp(p0, p0, p0); }
+half3 test_lerp_half3(half3 p0) { return lerp(p0, p0, p0); }
 
 // NATIVE_HALF: %dx.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
 // NATIVE_HALF: ret <4 x half> %dx.lerp
 // NO_HALF: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
 // NO_HALF: ret <4 x float> %dx.lerp
-half4 test_lerp_half4(half4 p0, half4 p1) { return lerp(p0, p0, p0); }
+half4 test_lerp_half4(half4 p0) { return lerp(p0, p0, p0); }
 
-// CHECK: %3 = fsub float %1, %0
-// CHECK: %4 = fmul float %2, %3
-// CHECK: %dx.lerp = fadd float %0, %4
+// CHECK: %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
 // CHECK: ret float %dx.lerp
-float test_lerp_float(float p0, float p1) { return lerp(p0, p0, p0); }
+float test_lerp_float(float p0) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
 // CHECK: ret <2 x float> %dx.lerp
-float2 test_lerp_float2(float2 p0, float2 p1) { return lerp(p0, p0, p0); }
+float2 test_lerp_float2(float2 p0) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
 // CHECK: ret <3 x float> %dx.lerp
-float3 test_lerp_float3(float3 p0, float3 p1) { return lerp(p0, p0, p0); }
+float3 test_lerp_float3(float3 p0) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
 // CHECK: ret <4 x float> %dx.lerp
-float4 test_lerp_float4(float4 p0, float4 p1) { return lerp(p0, p0, p0); }
+float4 test_lerp_float4(float4 p0) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
 // CHECK: ret <2 x float> %dx.lerp
diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
new file mode 100644
index 0000000000000..c87a8c404b08e
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: %dx.rsqrt = call half @llvm.dx.rsqrt.f16(
+// NATIVE_HALF: ret half %dx.rsqrt
+// NO_HALF: define noundef float @"?test_rsqrt_half@@YA$halff@$halff@@Z"(
+// NO_HALF: %dx.rsqrt = call float @llvm.dx.rsqrt.f32(
+// NO_HALF: ret float %dx.rsqrt
+half test_rsqrt_half(half p0) { return rsqrt(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: %dx.rsqrt = call <2 x half> @llvm.dx.rsqrt.v2f16
+// NATIVE_HALF: ret <2 x half> %dx.rsqrt
+// NO_HALF: define noundef <2 x float> @
+// NO_HALF: %dx.rsqrt = call <2 x float> @llvm.dx.rsqrt.v2f32(
+// NO_HALF: ret <2 x float> %dx.rsqrt
+half2 test_rsqrt_half2(half2 p0) { return rsqrt(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: %dx.rsqrt = call <3 x half> @llvm.dx.rsqrt.v3f16
+// NATIVE_HALF: ret <3 x half> %dx.rsqrt
+// NO_HALF: define noundef <3 x float> @
+// NO_HALF: %dx.rsqrt = call <3 x float> @llvm.dx.rsqrt.v3f32(
+// NO_HALF: ret <3 x float> %dx.rsqrt
+half3 test_rsqrt_half3(half3 p0) { return rsqrt(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: %dx.rsqrt = call <4 x half> @llvm.dx.rsqrt.v4f16
+// NATIVE_HALF: ret <4 x half> %dx.rsqrt
+// NO_HALF: define noundef <4 x float> @
+// NO_HALF: %dx.rsqrt = call <4 x float> @llvm.dx.rsqrt.v4f32(
+// NO_HALF: ret <4 x float> %dx.rsqrt
+half4 test_rsqrt_half4(half4 p0) { return rsqrt(p0); }
+
+// CHECK: define noundef float @
+// CHECK: %dx.rsqrt = call float @llvm.dx.rsqrt.f32(
+// CHECK: ret float %dx.rsqrt
+float test_rsqrt_float(float p0) { return rsqrt(p0); }
+// CHECK: define noundef <2 x float> @
+// CHECK: %dx.rsqrt = call <2 x float> @llvm.dx.rsqrt.v2f32
+// CHECK: ret <2 x float> %dx.rsqrt
+float2 test_rsqrt_float2(float2 p0) { return rsqrt(p0); }
+// CHECK: define noundef <3 x float> @
+// CHECK: %dx.rsqrt = call <3 x float> @llvm.dx.rsqrt.v3f32
+// CHECK: ret <3 x float> %dx.rsqrt
+float3 test_rsqrt_float3(float3 p0) { return rsqrt(p0); }
+// CHECK: define noundef <4 x float> @
+// CHECK: %dx.rsqrt = call <4 x float> @llvm.dx.rsqrt.v4f32
+// CHECK: ret <4 x float> %dx.rsqrt
+float4 test_rsqrt_float4(float4 p0) { return rsqrt(p0); }
diff --git a/clang/test/CodeGenObjC/debug-info-blocks.m b/clang/test/CodeGenObjC/debug-info-blocks.m
index 14b29f222fbe8..59171da016da1 100644
--- a/clang/test/CodeGenObjC/debug-info-blocks.m
+++ b/clang/test/CodeGenObjC/debug-info-blocks.m
@@ -5,8 +5,8 @@
 
 // CHECK: define {{.*}}_block_invoke
 // CHECK: store ptr %.block_descriptor, ptr %[[ALLOCA:block.addr]], align
-// CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr %[[ALLOCA]], metadata ![[SELF:[0-9]+]], metadata !{{.*}})
 // CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr %d, metadata ![[D:[0-9]+]], metadata !{{.*}})
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr %[[ALLOCA]], metadata ![[SELF:[0-9]+]], metadata !{{.*}})
 
 // Test that we do emit scope info for the helper functions, and that the
 // parameters to these functions are marked as artificial (so the debugger
diff --git a/clang/test/Driver/aarch64-mcpu.c b/clang/test/Driver/aarch64-mcpu.c
index cacfc691058d1..77ba43122b245 100644
--- a/clang/test/Driver/aarch64-mcpu.c
+++ b/clang/test/Driver/aarch64-mcpu.c
@@ -56,6 +56,8 @@
 // CORTEX-A715: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a715"
 // RUN: %clang --target=aarch64 -mcpu=cortex-a720  -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A720 %s
 // CORTEX-A720: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a720"
+// RUN: %clang --target=aarch64 -mcpu=cortex-a720ae  -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A720AE %s
+// CORTEX-A720AE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a720ae"
 // RUN: %clang --target=aarch64 -mcpu=neoverse-e1  -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-E1 %s
 // NEOVERSE-E1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-e1"
 // RUN: %clang --target=aarch64 -mcpu=neoverse-v1  -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-V1 %s
@@ -70,6 +72,8 @@
 // NEOVERSE-512TVB: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-512tvb"
 // RUN: %clang --target=aarch64 -mcpu=cortex-a520 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A520 %s
 // CORTEX-A520: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a520"
+// RUN: %clang --target=aarch64 -mcpu=cortex-a520ae -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A520AE %s
+// CORTEX-A520AE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a520ae"
 
 // RUN: %clang --target=aarch64 -mcpu=cortex-r82  -### -c %s 2>&1 | FileCheck -check-prefix=CORTEXR82 %s
 // CORTEXR82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-r82"
diff --git a/clang/test/Driver/aarch64-soft-float-abi.c b/clang/test/Driver/aarch64-soft-float-abi.c
new file mode 100644
index 0000000000000..0486d94e66072
--- /dev/null
+++ b/clang/test/Driver/aarch64-soft-float-abi.c
@@ -0,0 +1,26 @@
+// REQUIRES: aarch64-registered-target
+
+// Hard-float, valid
+// RUN: %clang --target=aarch64-none-elf                               -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf                   -mabi=aapcs -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r                -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r    -mabi=aapcs -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r+fp -mabi=aapcs -c %s -o /dev/null
+
+// Soft-float, no FP
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r+nofp -mabi=aapcs-soft -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -mgeneral-regs-only -mabi=aapcs-soft -c %s -o /dev/null
+
+// Soft-float, FP hardware: Rejected, to avoid having two incompatible ABIs for common targets.
+// RUN: not %clang --target=aarch64-none-elf                        -mabi=aapcs-soft -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SOFT
+// RUN: not %clang --target=aarch64-none-elf -march=armv8-r+fp      -mabi=aapcs-soft -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SOFT
+// RUN: not %clang --target=aarch64-none-elf -march=armv8-r+nofp+fp -mabi=aapcs-soft -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SOFT
+
+// No FP, hard-float. This is accepted by the driver, but functions which
+// require arguments or returns to be passed in FP registers will be rejected
+// (tested elsewhere).
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r+nofp             -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r+nofp -mabi=aapcs -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -mgeneral-regs-only -mabi=aapcs -c %s -o /dev/null
+
+// INVALID-SOFT: error: 'aapcs-soft' ABI is not supported with FPU
diff --git a/clang/test/Driver/apple-kext-mkernel.c b/clang/test/Driver/apple-kext-mkernel.c
index f03ed4a09b47d..de905e169c139 100644
--- a/clang/test/Driver/apple-kext-mkernel.c
+++ b/clang/test/Driver/apple-kext-mkernel.c
@@ -13,8 +13,8 @@
 // CHECK-X86-2: "-fno-rtti"
 // CHECK-X86-2-NOT: "-fno-common"
 
-// RUN: not %clang -target x86_64-apple-darwin11 -arch armv7 -mkernel -mstrict-align -### -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
-// RUN: not %clang -target x86_64-apple-darwin11 -arch armv7 -mkernel -mstrict-align -### -fsyntax-only -fbuiltin -fno-builtin -fcommon -fno-common %s 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
+// RUN: %clang --target=x86_64-apple-darwin11 -arch armv7 -mkernel -mstrict-align -### -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
+// RUN: %clang --target=x86_64-apple-darwin11 -arch armv7 -mkernel -mstrict-align -### -fsyntax-only -fbuiltin -fno-builtin -fcommon -fno-common %s 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
 
 // CHECK-ARM: "-target-feature" "+long-calls"
 // CHECK-ARM: "-target-feature" "+strict-align"
diff --git a/clang/test/Driver/clang-g-opts.c b/clang/test/Driver/clang-g-opts.c
index b73602a155b00..fdbe0b96420c5 100644
--- a/clang/test/Driver/clang-g-opts.c
+++ b/clang/test/Driver/clang-g-opts.c
@@ -42,8 +42,3 @@
 
 // CHECK-WITH-G-STANDALONE: "-debug-info-kind=standalone"
 // CHECK-WITH-G-STANDALONE: "-dwarf-version=2"
-
-/// TODO: Special case before R_RISCV_SET_ULEB128 linker support becomes more widely available.
-// RUN: %clang -### -S %s -g --target=riscv64-linux-gnu 2>&1 | FileCheck --check-prefix=VERSION4 %s
-// RUN: %clang -### -S %s -g --target=riscv64-unknown-elf 2>&1 | FileCheck --check-prefix=VERSION4 %s
-// VERSION4: "-dwarf-version=4"
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index c0e992702576e..33a1071c9f302 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -80,11 +80,15 @@
 //
 // RUN: not %clang -target nvptx64-nvidia-cuda %s -### 2>&1 \
 // RUN:   | FileCheck -check-prefix=MISSING %s
+// RUN: not %clang -target nvptx64-nvidia-cuda -march=generic %s -### 2>&1 \
+// RUN:   | FileCheck -check-prefix=MISSING %s
 
 // MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'ptxas'
 // MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'nvlink'
 
 // RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \
 // RUN:   | FileCheck -check-prefix=GENERIC %s
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_52 -march=generic -flto -c %s -### 2>&1 \
+// RUN:   | FileCheck -check-prefix=GENERIC %s
 
 // GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu"
diff --git a/clang/test/Driver/fsanitize-signed-integer-overflow.c b/clang/test/Driver/fsanitize-signed-integer-overflow.c
new file mode 100644
index 0000000000000..4a9345493b6aa
--- /dev/null
+++ b/clang/test/Driver/fsanitize-signed-integer-overflow.c
@@ -0,0 +1,28 @@
+/// When -fwrapv (implied by -fno-strict-overflow) is enabled,
+/// -fsanitize=undefined does not expand to signed-integer-overflow.
+/// -fsanitize=signed-integer-overflow is unaffected by -fwrapv.
+
+// RUN: %clang -### --target=x86_64-linux -fwrapv -fsanitize=signed-integer-overflow %s 2>&1 | FileCheck %s
+// CHECK: -fsanitize=signed-integer-overflow
+// CHECK: -fsanitize-recover=signed-integer-overflow
+
+// RUN: %clang -### --target=x86_64-linux -fno-strict-overflow -fsanitize=undefined %s 2>&1 | FileCheck %s --check-prefix=EXCLUDE
+// RUN: %clang -### --target=x86_64-linux -fstrict-overflow -fwrapv -fsanitize=undefined %s 2>&1 | FileCheck %s --check-prefix=EXCLUDE
+// EXCLUDE:     -fsanitize=alignment,array-bounds,
+// EXCLUDE-NOT: signed-integer-overflow,
+// EXCLUDE:      -fsanitize-recover=alignment,array-bounds,
+// EXCLUDE-SAME: signed-integer-overflow
+
+// RUN: %clang -### --target=x86_64-linux -fwrapv -fsanitize=undefined -fsanitize=signed-integer-overflow %s 2>&1 | FileCheck %s --check-prefix=INCLUDE
+// RUN: %clang -### --target=x86_64-linux -fno-strict-overflow -fno-sanitize=signed-integer-overflow -fsanitize=undefined -fsanitize=signed-integer-overflow %s 2>&1 | FileCheck %s --check-prefix=INCLUDE
+// INCLUDE:      -fsanitize=alignment,array-bounds,
+// INCLUDE-SAME: signed-integer-overflow
+// INCLUDE:      -fsanitize-recover=alignment,array-bounds,
+// INCLUDE-SAME: signed-integer-overflow
+
+/// -fsanitize-trap=undefined expands to signed-integer-overflow regardless of -fwrapv.
+// RUN: %clang -### --target=x86_64-linux -fwrapv -fsanitize=undefined -fsanitize=signed-integer-overflow -fsanitize-trap=undefined %s 2>&1 | FileCheck %s --check-prefix=INCLUDE-TRAP
+// INCLUDE-TRAP:      -fsanitize=alignment,array-bounds,
+// INCLUDE-TRAP-SAME: signed-integer-overflow
+// INCLUDE-TRAP:      -fsanitize-trap=alignment,array-bounds,
+// INCLUDE-TRAP-SAME: signed-integer-overflow
diff --git a/clang/test/Driver/hexagon-default-build-attributes.s b/clang/test/Driver/hexagon-default-build-attributes.s
new file mode 100644
index 0000000000000..58f246c2f805d
--- /dev/null
+++ b/clang/test/Driver/hexagon-default-build-attributes.s
@@ -0,0 +1,21 @@
+/// Enabled by default for assembly
+// RUN: %clang --target=hexagon-unknown-elf -### %s 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-ENABLED
+
+/// Can be forced on or off for assembly.
+// RUN: %clang --target=hexagon-unknown-elf -### %s 2>&1 -mno-default-build-attributes \
+// RUN:    | FileCheck %s -check-prefix CHECK-DISABLED
+// RUN: %clang --target=hexagon-unknown-elf -### %s 2>&1 -mdefault-build-attributes \
+// RUN:    | FileCheck %s -check-prefix CHECK-ENABLED
+
+/// Option ignored C/C++ (since we always emit hardware and ABI build attributes
+/// during codegen).
+// RUN: %clang --target=hexagon-unknown-elf -### -x c %s -mdefault-build-attributes 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-DISABLED-C
+// RUN: %clang --target=hexagon-unknown-elf -### -x c++ %s -mdefault-build-attributes 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-DISABLED-C
+
+// CHECK-DISABLED-NOT: "-hexagon-add-build-attributes"
+// CHECK-DISABLED-C-NOT: "-hexagon-add-build-attributes"
+// CHECK-ENABLED: "-hexagon-add-build-attributes"
+// CHECK-DISABLED-C: argument unused during compilation: '-mdefault-build-attributes'
diff --git a/clang/test/Driver/hip-partial-link.hip b/clang/test/Driver/hip-partial-link.hip
index faa185972abc3..c8451ec81ed37 100644
--- a/clang/test/Driver/hip-partial-link.hip
+++ b/clang/test/Driver/hip-partial-link.hip
@@ -47,7 +47,7 @@
 // OBJ:  D __hip_gpubin_handle_[[ID2]]
 
 // RUN: %clang -v --target=x86_64-unknown-linux-gnu --no-offload-new-driver \
-// RUN:   --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \
+// RUN:   --hip-link -fgpu-rdc --offload-arch=gfx906 \
 // RUN:   -fuse-ld=lld -nostdlib -r %t.main.o %t.lib.o -o %t.final.o \
 // RUN:   2>&1 | FileCheck -check-prefix=LINK-O %s
 // LINK-O-NOT: Found undefined HIP {{.*}}symbol
diff --git a/clang/test/Driver/hip-runtime-libs-linux.hip b/clang/test/Driver/hip-runtime-libs-linux.hip
index 142582963c958..a4cd2733114b6 100644
--- a/clang/test/Driver/hip-runtime-libs-linux.hip
+++ b/clang/test/Driver/hip-runtime-libs-linux.hip
@@ -43,6 +43,11 @@
 // RUN:   --rocm-path=%S/Inputs/rocm %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefixes=NOHIPRT %s
 
+// Test HIP runtime lib is not linked with -r.
+// RUN: %clang -### --hip-link -r --target=x86_64-linux-gnu \
+// RUN:   --rocm-path=%S/Inputs/rocm %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefixes=NOHIPRT %s
+
 // Test HIP runtime lib is linked without hip-link if there is HIP input file.
 // RUN: %clang -### --target=x86_64-linux-gnu -nogpuinc -nogpulib \
 // RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
diff --git a/clang/test/Driver/loongarch-munaligned-access.c b/clang/test/Driver/loongarch-munaligned-access.c
index 44edb2eb17e6a..a545679949973 100644
--- a/clang/test/Driver/loongarch-munaligned-access.c
+++ b/clang/test/Driver/loongarch-munaligned-access.c
@@ -1,54 +1,25 @@
 /// Test -m[no-]unaligned-access and -m[no-]strict-align options.
 
-// RUN: %clang --target=loongarch64 -munaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
 // RUN: %clang --target=loongarch64 -mstrict-align -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
 // RUN: %clang --target=loongarch64 -mno-strict-align -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
-// RUN: %clang --target=loongarch64 -munaligned-access -mno-unaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -munaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
 // RUN: %clang --target=loongarch64 -mstrict-align -mno-strict-align -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
 // RUN: %clang --target=loongarch64 -mno-strict-align -mstrict-align -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -munaligned-access -mstrict-align -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -mstrict-align -munaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -mno-strict-align -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-strict-align -mno-unaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
 
-// RUN: %clang --target=loongarch64 -munaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
 // RUN: %clang --target=loongarch64 -mstrict-align -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
 // RUN: %clang --target=loongarch64 -mno-strict-align -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
-// RUN: %clang --target=loongarch64 -munaligned-access -mno-unaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -munaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
 // RUN: %clang --target=loongarch64 -mstrict-align -mno-strict-align -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
 // RUN: %clang --target=loongarch64 -mno-strict-align -mstrict-align -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -munaligned-access -mstrict-align -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -mstrict-align -munaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -mno-strict-align -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-strict-align -mno-unaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
+
+// RUN: not %clang -### --target=loongarch64 -mno-unaligned-access -munaligned-access %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR
 
 // CC1-UNALIGNED: "-target-feature" "+ual"
 // CC1-NO-UNALIGNED: "-target-feature" "-ual"
@@ -56,6 +27,9 @@
 // IR-UNALIGNED: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+ual{{(,.*)?}}"
 // IR-NO-UNALIGNED: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-ual{{(,.*)?}}"
 
+// ERR: error: unsupported option '-mno-unaligned-access' for target 'loongarch64'
+// ERR: error: unsupported option '-munaligned-access' for target 'loongarch64'
+
 int foo(void) {
   return 3;
 }
diff --git a/clang/test/Driver/mips-features.c b/clang/test/Driver/mips-features.c
index fd06b1400c312..5e92dccaa02ab 100644
--- a/clang/test/Driver/mips-features.c
+++ b/clang/test/Driver/mips-features.c
@@ -462,3 +462,29 @@
 // RUN:     -mrelax-pic-calls -mno-relax-pic-calls 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-NO-RELAX-PIC-CALLS %s
 // CHECK-NO-RELAX-PIC-CALLS: "-mllvm" "-mips-jalr-reloc=0"
+//
+// -mno-unaligned-access
+// RUN: %clang -target mips-unknown-linux-gnu -### -c %s \
+// RUN:     -munaligned-access -mno-strict-align \
+// RUN:     -mno-unaligned-access 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-STRICT-ALIGN %s
+// CHECK-STRICT-ALIGN: "-target-feature" "+strict-align"
+//
+// -munaligned-access
+// RUN: %clang -target mips-unknown-linux-gnu -### -c %s \
+// RUN:     -mno-unaligned-access -mstrict-align \
+// RUN:     -munaligned-access 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NO-STRICT-ALIGN %s
+// CHECK-NO-STRICT-ALIGN: "-target-feature" "-strict-align"
+//
+// -mstrict-align
+// RUN: %clang -target mips-unknown-linux-gnu -### -c %s \
+// RUN:     -munaligned-access -mno-strict-align \
+// RUN:     -mstrict-align 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-STRICT-ALIGN %s
+//
+// -mno-strict-align
+// RUN: %clang -target mips-unknown-linux-gnu -### -c %s \
+// RUN:     -mno-unaligned-access -mstrict-align \
+// RUN:     -mno-strict-align 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NO-STRICT-ALIGN %s
diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
new file mode 100644
index 0000000000000..24797002b80f5
--- /dev/null
+++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
@@ -0,0 +1,36 @@
+// Test that -print-library-module-manifest-path finds the correct file.
+
+// RUN: rm -rf %t && split-file %s %t && cd %t
+// RUN: mkdir -p %t/Inputs/usr/lib/x86_64-linux-gnu
+// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.so
+
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:     -stdlib=libc++ \
+// RUN:     -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
+// RUN:     --target=x86_64-linux-gnu 2>&1 \
+// RUN:   | FileCheck libcxx-no-module-json.cpp
+
+// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/modules.json
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:     -stdlib=libc++ \
+// RUN:     -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
+// RUN:     --target=x86_64-linux-gnu 2>&1 \
+// RUN:   | FileCheck libcxx.cpp
+
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:     -stdlib=libstdc++ \
+// RUN:     -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
+// RUN:     --target=x86_64-linux-gnu 2>&1 \
+// RUN:   | FileCheck libstdcxx.cpp
+
+//--- libcxx-no-module-json.cpp
+
+// CHECK: <NOT PRESENT>
+
+//--- libcxx.cpp
+
+// CHECK: {{.*}}/Inputs/usr/lib/x86_64-linux-gnu{{/|\\}}modules.json
+
+//--- libstdcxx.cpp
+
+// CHECK: <NOT PRESENT>
diff --git a/clang/test/Driver/munaligned-access-unused.c b/clang/test/Driver/munaligned-access-unused.c
index 1d86edb798ef2..4060b53c42fe5 100644
--- a/clang/test/Driver/munaligned-access-unused.c
+++ b/clang/test/Driver/munaligned-access-unused.c
@@ -1,8 +1,9 @@
-/// Check -m[no-]unaligned-access and -m[no-]strict-align are warned unused on a target that does not support them.
+/// Check -m[no-]unaligned-access and -m[no-]strict-align are errored on a target that does not support them.
 
 // RUN: not %clang --target=x86_64 -munaligned-access -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=unaligned-access
 // RUN: not %clang --target=x86_64 -mno-unaligned-access -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=no-unaligned-access
-// RUN: not %clang --target=x86_64 -mstrict-align -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=strict-align
-// RUN: not %clang --target=x86_64 -mno-strict-align -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=no-strict-align
+// RUN: not %clang --target=x86_64 -mno-strict-align -mstrict-align -fsyntax-only %s -### 2>&1 | FileCheck %s --check-prefix=ALIGN
 
 // CHECK: error: unsupported option '-m{{(no-)?}}unaligned-access' for target '{{.*}}'
+// ALIGN: error: unsupported option '-mno-strict-align' for target '{{.*}}'
+// ALIGN: error: unsupported option '-mstrict-align' for target '{{.*}}'
diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c
index fc5fb0f27e3af..fe74ac773ef8c 100644
--- a/clang/test/Driver/riscv-features.c
+++ b/clang/test/Driver/riscv-features.c
@@ -33,8 +33,6 @@
 // NO-FORCE-SW-SCS: "-target-feature" "-forced-sw-shadow-stack"
 // DEFAULT-NOT: "-target-feature" "+forced-sw-shadow-stack"
 
-// RUN: %clang --target=riscv32-unknown-elf -### %s -munaligned-access 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS
-// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-unaligned-access 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS
 // RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS
 // RUN: %clang --target=riscv32-unknown-elf -### %s -mstrict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS
 
diff --git a/clang/test/Driver/tls-dialect.c b/clang/test/Driver/tls-dialect.c
index f73915b28ec2a..a808dd81531ce 100644
--- a/clang/test/Driver/tls-dialect.c
+++ b/clang/test/Driver/tls-dialect.c
@@ -2,6 +2,7 @@
 // RUN: %clang -### --target=riscv64-linux -mtls-dialect=trad %s 2>&1 | FileCheck --check-prefix=NODESC %s
 // RUN: %clang -### --target=riscv64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s
 // RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu %s 2>&1 | FileCheck --check-prefix=NODESC %s
+// RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=DESC %s
 
 /// Android supports TLSDESC by default on RISC-V
 /// TLSDESC is not on by default in Linux, even on RISC-V, and is covered above
@@ -18,7 +19,6 @@
 
 /// Unsupported argument
 // RUN: not %clang -### --target=riscv64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s
-// RUN: not %clang -### --target=x86_64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s
 
 // DESC:       "-cc1" {{.*}}"-enable-tlsdesc"
 // NODESC-NOT: "-enable-tlsdesc"
diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c
index f950283ec42aa..88590a3ba4c45 100644
--- a/clang/test/Driver/wasm-toolchain.c
+++ b/clang/test/Driver/wasm-toolchain.c
@@ -197,3 +197,27 @@
 // RUN: not %clang -### %s --target=wasm32-unknown-unknown --sysroot=%s/no-sysroot-there -fPIC -mno-mutable-globals %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=PIC_NO_MUTABLE_GLOBALS %s
 // PIC_NO_MUTABLE_GLOBALS: error: invalid argument '-fPIC' not allowed with '-mno-mutable-globals'
+
+// Test that `wasm32-wasip2` invokes the `wasm-component-ld` linker by default
+// instead of `wasm-ld`.
+
+// RUN: %clang -### -O2 --target=wasm32-wasip2 %s --sysroot /foo 2>&1 \
+// RUN:   | FileCheck -check-prefix=LINK_WASIP2 %s
+// LINK_WASIP2: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
+// LINK_WASIP2: wasm-component-ld{{.*}}" "-L/foo/lib/wasm32-wasip2" "crt1.o" "[[temp]]" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out"
+
+// Test that on `wasm32-wasip2` the `wasm-component-ld` programs is told where
+// to find `wasm-ld` by default.
+
+// RUN: %clang -### -O2 --target=wasm32-wasip2 %s --sysroot /foo 2>&1 \
+// RUN:   | FileCheck -check-prefix=LINK_WASIP2_FIND_WASMLD %s
+// LINK_WASIP2_FIND_WASMLD: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
+// LINK_WASIP2_FIND_WASMLD: wasm-component-ld{{.*}}" {{.*}} "--wasm-ld-path" "{{.*}}wasm-ld{{.*}}" {{.*}} "[[temp]]" {{.*}}
+
+// If `wasm32-wasip2` is configured with `wasm-ld` as a linker then don't pass
+// the `--wasm-ld-path` flag.
+
+// RUN: %clang -### -O2 --target=wasm32-wasip2 -fuse-ld=lld %s --sysroot /foo 2>&1 \
+// RUN:   | FileCheck -check-prefix=LINK_WASIP2_USE_WASMLD %s
+// LINK_WASIP2_USE_WASMLD: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
+// LINK_WASIP2_USE_WASMLD: wasm-ld{{.*}}" "-m" "wasm32" {{.*}} "[[temp]]" {{.*}}
diff --git a/clang/test/InstallAPI/asm.test b/clang/test/InstallAPI/asm.test
new file mode 100644
index 0000000000000..b6af7f643d72f
--- /dev/null
+++ b/clang/test/InstallAPI/asm.test
@@ -0,0 +1,90 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+// RUN: clang-installapi -target arm64-apple-macos13.1 \
+// RUN: -I%t/usr/include \
+// RUN: -install_name @rpath/lib/libasm.dylib \
+// RUN: %t/inputs.json -o %t/output.tbd 2>&1 | FileCheck %s --allow-empty
+// RUN: llvm-readtapi -compare %t/output.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
+
+// CHECK-NOT: error: 
+// CHECK-NOT: warning: 
+
+//--- usr/include/asm.h
+#ifndef ASM_H
+#define ASM_H
+
+extern int ivar __asm("_OBJC_IVAR_$_SomeClass._ivar1");
+extern int objcClass1 __asm("_OBJC_CLASS_$_SomeClass");
+extern int objcClass2 __asm("_OBJC_METACLASS_$_SomeClass");
+extern int objcClass3 __asm("_OBJC_EHTYPE_$_SomeClass");
+extern int objcClass4 __asm(".objc_class_name_SomeClass");
+
+__attribute__((visibility("hidden")))
+@interface NSString {
+}
+@end
+
+extern int ivarExtra __asm("_OBJC_IVAR_$_NSString._ivar1");
+#endif // ASM_H
+
+//--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/usr/include/asm.h",
+    "type" : "public"
+  }],
+  "version": "3"
+}
+
+//--- expected.tbd
+{
+  "main_library": {
+    "compatibility_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "current_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "exported_symbols": [
+      {
+        "data": {
+          "objc_class": [
+            "SomeClass"
+          ],
+          "objc_eh_type": [
+            "SomeClass"
+          ],
+          "objc_ivar": [
+            "NSString._ivar1",
+            "SomeClass._ivar1"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/lib/libasm.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "13.1",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
diff --git a/clang/test/InstallAPI/availability.test b/clang/test/InstallAPI/availability.test
new file mode 100644
index 0000000000000..55c890b6d882a
--- /dev/null
+++ b/clang/test/InstallAPI/availability.test
@@ -0,0 +1,631 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+; RUN: yaml2obj %t/Availability.yaml -o %t/System/Library/Frameworks/Availability.framework/Availability
+
+; RUN: clang-installapi \
+; RUN: --target=x86_64-apple-macos13 \
+; RUN: -install_name /System/Library/Frameworks/Availability.framework/Versions/A/Availability \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -F %t/System/Library/Frameworks \
+; RUN: %t/inputs.json -o %t/output.tbd \
+; RUN: --verify-against=%t/System/Library/Frameworks/Availability.framework/Availability \
+; RUN: --verify-mode=ErrorsOnly --filetype=tbd-v5 2> %t/errors.log
+; RUN: FileCheck -allow-empty -check-prefix=ERRORSONLY -input-file %t/errors.log %s
+
+; RUN: clang-installapi \
+; RUN: --target=x86_64-apple-macos13 \
+; RUN: -install_name /System/Library/Frameworks/Availability.framework/Versions/A/Availability \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -F %t/System/Library/Frameworks \
+; RUN: %t/inputs.json -o %t/output-warnings.tbd \
+; RUN: --verify-against=%t/System/Library/Frameworks/Availability.framework/Availability \
+; RUN: --verify-mode=ErrorsAndWarnings 2> %t/errors.log
+; RUN: FileCheck -check-prefixes=VIOLATIONS,ERRORSANDWARNINGS -input-file %t/errors.log %s
+
+; RUN: not clang-installapi \
+; RUN: --target=x86_64-apple-macos13 \
+; RUN: -install_name /System/Library/Frameworks/Availability.framework/Versions/A/Availability \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -F %t/System/Library/Frameworks \
+; RUN: %t/inputs.json -o %t/output-pedantic.tbd \
+; RUN: --verify-against=%t/System/Library/Frameworks/Availability.framework/Availability \
+; RUN: --verify-mode=Pedantic 2> %t/errors.log
+; RUN: FileCheck -check-prefixes=VIOLATIONS,PEDANTIC -input-file %t/errors.log %s
+
+; ERRORSONLY-NOT:        error
+; ERRORSONLY-NOT:        warning
+
+; ERRORSANDWARNINGS-NOT: error
+; VIOLATIONS:            warning: violations found for x86_64-apple-macos
+; VIOLATIONS:            declaration 'publicGlobalVariable' is marked unavailable, but symbol is exported in dynamic library
+; VIOLATIONS-NEXT:       extern int publicGlobalVariable NS_AVAILABLE
+; VIOLATIONS:            declaration 'Foo' is marked unavailable, but symbol is exported in dynamic library
+; VIOLATIONS-NEXT:       @interface Foo : NSObject
+; VIOLATIONS:            declaration 'publicGlobalVariable3' is marked unavailable, but symbol is exported in dynamic library
+; VIOLATIONS-NEXT:       extern int publicGlobalVariable3 __attribute__((unavailable))
+; VIOLATIONS:            declaration 'privateGlobalVariable' is marked unavailable, but symbol is exported in dynamic library
+; VIOLATIONS-NEXT:       extern int privateGlobalVariable;
+
+; ERRORSANDWARNINGS-NOT: warning 
+; PEDANTIC-NOT:          error
+
+;--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/System/Library/Frameworks/Availability.framework/Headers/Availability.h",
+    "type" : "public"
+  }, 
+  {
+    "path" : "DSTROOT/System/Library/Frameworks/Availability.framework/PrivateHeaders/AvailabilityPrivate.h",
+    "type" : "private"
+  }
+  ],
+  "version": "3"
+}
+
+;--- System/Library/Frameworks/Availability.framework/Headers/AV_Defines.h
+#ifndef AV_DEFINES
+#define AV_DEFINES 
+
+#define NS_AVAILABLE __attribute__((availability(macosx,introduced=NA)))
+
+@interface NSObject 
+@end
+
+#endif //AV_DEFINES
+
+;--- System/Library/Frameworks/Availability.framework/PrivateHeaders/AvailabilityPrivate.h
+#import <Availability/AV_Defines.h>
+// Test private global variable.
+NS_AVAILABLE 
+extern int privateGlobalVariable;
+
+;--- System/Library/Frameworks/Availability.framework/Headers/Availability.h
+#import <Availability/AV_Defines.h>
+extern int publicGlobalVariable NS_AVAILABLE;
+
+// Test public ObjC class
+NS_AVAILABLE
+@interface Foo : NSObject
+@end
+
+// Test unavailable attribute.
+#ifdef __i386__
+#define UNAVAILABLE_I386 __attribute__((unavailable))
+#else
+#define UNAVAILABLE_I386
+#endif
+extern int publicGlobalVariable2 UNAVAILABLE_I386;
+
+extern int publicGlobalVariable3 __attribute__((unavailable))
+__attribute__((availability(macosx, introduced = 10.9)));
+
+// Test obsoleted with exported variable.
+extern int publicGlobalVariable4 __attribute__((availability(
+    macosx, introduced = 10.9, deprecated = 10.10, obsoleted = 10.11)));
+// Test obsoleted with non-existent variable.
+extern int publicGlobalVariable5 __attribute__((availability(
+    macosx, introduced = 10.9, deprecated = 10.10, obsoleted = 10.11)));
+
+#ifdef __i386__
+#define OBSOLETE_I386 __attribute__((availability(macosx, obsoleted = 10.11)))
+#else
+#define OBSOLETE_I386
+#endif
+extern int publicGlobalVariable6 OBSOLETE_I386;
+
+
+/// Created from: 
+// int publicGlobalVariable; int privateGlobalVariable;
+// 
+// @implementation Foo
+// @end
+// 
+// #ifndef __i386__
+// int publicGlobalVariable2;
+// #endif
+// 
+// int publicGlobalVariable3;
+// int publicGlobalVariable4;
+// 
+// #ifndef __i386__
+// int publicGlobalVariable6;
+// #endif
+;--- Availability.yaml
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x1000007
+  cpusubtype:      0x3
+  filetype:        0x6
+  ncmds:           14
+  sizeofcmds:      1312
+  flags:           0x100085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __TEXT
+    vmaddr:          0
+    vmsize:          8192
+    fileoff:         0
+    filesize:        8192
+    maxprot:         5
+    initprot:        5
+    nsects:          2
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x1140
+        size:            0
+        offset:          0x1140
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000000
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         ''
+      - sectname:        __cstring
+        segname:         __TEXT
+        addr:            0x1140
+        size:            4
+        offset:          0x1140
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x2
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         466F6F00
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA_CONST
+    vmaddr:          8192
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        4096
+    maxprot:         3
+    initprot:        3
+    nsects:          2
+    flags:           16
+    Sections:
+      - sectname:        __objc_classlist
+        segname:         __DATA_CONST
+        addr:            0x2000
+        size:            8
+        offset:          0x2000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x10000000
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         B830000000000000
+      - sectname:        __objc_imageinfo
+        segname:         __DATA_CONST
+        addr:            0x2008
+        size:            8
+        offset:          0x2008
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000040000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         312
+    segname:         __DATA
+    vmaddr:          12288
+    vmsize:          4096
+    fileoff:         12288
+    filesize:        4096
+    maxprot:         3
+    initprot:        3
+    nsects:          3
+    flags:           0
+    Sections:
+      - sectname:        __objc_const
+        segname:         __DATA
+        addr:            0x3000
+        size:            144
+        offset:          0x3000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '010000002800000028000000000000000000000000000000401100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800000008000000000000000000000000000000401100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+      - sectname:        __objc_data
+        segname:         __DATA
+        addr:            0x3090
+        size:            80
+        offset:          0x3090
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000000000000000000000000000000000000000000000000000000000000003000000000000090300000000000000000000000000000000000000000000000000000000000004830000000000000'
+      - sectname:        __common
+        segname:         __DATA
+        addr:            0x30E0
+        size:            24
+        offset:          0x0
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x1
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          16384
+    vmsize:          824
+    fileoff:         16384
+    filesize:        824
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_INFO_ONLY
+    cmdsize:         48
+    rebase_off:      16384
+    rebase_size:     16
+    bind_off:        16400
+    bind_size:       104
+    weak_bind_off:   0
+    weak_bind_size:  0
+    lazy_bind_off:   0
+    lazy_bind_size:  0
+    export_off:      16504
+    export_size:     152
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          16664
+    nsyms:           14
+    stroff:          16888
+    strsize:         320
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       2
+    iextdefsym:      2
+    nextdefsym:      8
+    iundefsym:       10
+    nundefsym:       4
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ID_DYLIB
+    cmdsize:         112
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 65536
+      compatibility_version: 65536
+    Content:         '/System/Library/Frameworks/Availability.framework/Versions/A/Availability'
+    ZeroPadBytes:    7
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C4470-5555-3144-A142-4EE44DA08D2F
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           851968
+    sdk:             983040
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1245184
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 14942208
+      compatibility_version: 65536
+    Content:         '/usr/lib/libobjc.A.dylib'
+    ZeroPadBytes:    8
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 88473600
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         16656
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         16664
+    datasize:        0
+LinkEditData:
+  RebaseOpcodes:
+    - Opcode:          REBASE_OPCODE_SET_TYPE_IMM
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             1
+      ExtraData:       [ 0x0 ]
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             2
+      ExtraData:       [ 0x18 ]
+    - Opcode:          REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB
+      Imm:             0
+      ExtraData:       [ 0x2, 0x40 ]
+    - Opcode:          REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             2
+    - Opcode:          REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+      Imm:             3
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_DONE
+      Imm:             0
+  BindOpcodes:
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          '_OBJC_METACLASS_$_NSObject'
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             2
+      ULEBExtraData:   [ 0x90 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          __objc_empty_cache
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_ADD_ADDR_ULEB
+      Imm:             0
+      ULEBExtraData:   [ 0x20 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          '_OBJC_CLASS_$_NSObject'
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_ADD_ADDR_ULEB
+      Imm:             0
+      ULEBExtraData:   [ 0xFFFFFFFFFFFFFFF0 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    0
+        NodeOffset:      5
+        Name:            _
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    0
+            NodeOffset:      17
+            Name:            OBJC_
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
+            Children:
+              - TerminalSize:    3
+                NodeOffset:      49
+                Name:            'METACLASS_$_Foo'
+                Flags:           0x0
+                Address:         0x3090
+                Other:           0x0
+                ImportName:      ''
+              - TerminalSize:    3
+                NodeOffset:      54
+                Name:            'CLASS_$_Foo'
+                Flags:           0x0
+                Address:         0x30B8
+                Other:           0x0
+                ImportName:      ''
+          - TerminalSize:    0
+            NodeOffset:      59
+            Name:            p
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
+            Children:
+              - TerminalSize:    3
+                NodeOffset:      104
+                Name:            rivateGlobalVariable
+                Flags:           0x0
+                Address:         0x30E0
+                Other:           0x0
+                ImportName:      ''
+              - TerminalSize:    3
+                NodeOffset:      109
+                Name:            ublicGlobalVariable
+                Flags:           0x0
+                Address:         0x30E4
+                Other:           0x0
+                ImportName:      ''
+                Children:
+                  - TerminalSize:    3
+                    NodeOffset:      130
+                    Name:            '4'
+                    Flags:           0x0
+                    Address:         0x30F0
+                    Other:           0x0
+                    ImportName:      ''
+                  - TerminalSize:    3
+                    NodeOffset:      135
+                    Name:            '3'
+                    Flags:           0x0
+                    Address:         0x30EC
+                    Other:           0x0
+                    ImportName:      ''
+                  - TerminalSize:    3
+                    NodeOffset:      140
+                    Name:            '2'
+                    Flags:           0x0
+                    Address:         0x30E8
+                    Other:           0x0
+                    ImportName:      ''
+                  - TerminalSize:    3
+                    NodeOffset:      145
+                    Name:            '6'
+                    Flags:           0x0
+                    Address:         0x30F4
+                    Other:           0x0
+                    ImportName:      ''
+  NameList:
+    - n_strx:          2
+      n_type:          0xE
+      n_sect:          5
+      n_desc:          0
+      n_value:         12288
+    - n_strx:          28
+      n_type:          0xE
+      n_sect:          5
+      n_desc:          0
+      n_value:         12360
+    - n_strx:          50
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12512
+    - n_strx:          73
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12516
+    - n_strx:          95
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12520
+    - n_strx:          118
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12524
+    - n_strx:          141
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12528
+    - n_strx:          164
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12532
+    - n_strx:          187
+      n_type:          0xF
+      n_sect:          6
+      n_desc:          0
+      n_value:         12432
+    - n_strx:          209
+      n_type:          0xF
+      n_sect:          6
+      n_desc:          0
+      n_value:         12472
+    - n_strx:          227
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          250
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          277
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          296
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          512
+      n_value:         0
+  StringTable:
+    - ' '
+    - '__OBJC_METACLASS_RO_$_Foo'
+    - '__OBJC_CLASS_RO_$_Foo'
+    - _privateGlobalVariable
+    - _publicGlobalVariable
+    - _publicGlobalVariable2
+    - _publicGlobalVariable3
+    - _publicGlobalVariable4
+    - _publicGlobalVariable6
+    - '_OBJC_METACLASS_$_Foo'
+    - '_OBJC_CLASS_$_Foo'
+    - '_OBJC_CLASS_$_NSObject'
+    - '_OBJC_METACLASS_$_NSObject'
+    - __objc_empty_cache
+    - dyld_stub_binder
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+...
diff --git a/clang/test/InstallAPI/basic.test b/clang/test/InstallAPI/basic.test
index 5b41ccd517b0a..096911039d114 100644
--- a/clang/test/InstallAPI/basic.test
+++ b/clang/test/InstallAPI/basic.test
@@ -2,7 +2,8 @@
 // RUN: split-file %s %t
 /// Check basic arguments are captured.
 // RUN: clang-installapi -x objective-c -target arm64-apple-ios13.0.0 \
-// RUN: -fapplication-extension -current_version 1 -install_name /usr/lib/basic.dylib \
+// RUN: -fapplication-extension -current_version 1 -compatibility_version 1 \
+// RUN: -install_name /usr/lib/basic.dylib \
 // RUN: %t/basic_inputs.json -o %t/basic.tbd 2>&1 | FileCheck %s --allow-empty
 // RUN: llvm-readtapi -compare %t/basic.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
 
@@ -25,11 +26,6 @@
 //--- expected.tbd
 {
   "main_library": {
-    "compatibility_versions": [
-      {
-        "version": "0"
-      }
-    ],
     "install_names": [
       {
         "name": "/usr/lib/basic.dylib"
diff --git a/clang/test/InstallAPI/diagnostics-cpp.test b/clang/test/InstallAPI/diagnostics-cpp.test
new file mode 100644
index 0000000000000..9319a7a61d483
--- /dev/null
+++ b/clang/test/InstallAPI/diagnostics-cpp.test
@@ -0,0 +1,462 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+// RUN: yaml2obj %t/Mismatch.yaml -o %t/System/Library/Frameworks/MismatchCpp.framework/MismatchCpp
+
+// RUN: not clang-installapi --target=arm64-apple-macos13 -x objective-c++  \
+// RUN: -F %t/System/Library/Frameworks \
+// RUN: -install_name /System/Library/Frameworks/MismatchCpp.framework/Versions/A/MismatchCpp \
+// RUN: -current_version 1 -compatibility_version 1 %t/inputs.json \
+// RUN: --verify-against=%t/System/Library/Frameworks/MismatchCpp.framework/MismatchCpp \
+// RUN: --verify-mode=Pedantic -o %t/output.tbd --demangle 2> %t/errors.log
+// RUN: FileCheck -input-file %t/errors.log %s 
+
+CHECK: warning: violations found for arm64-apple-macos13
+CHECK: CPP.h:5:7: error: declaration has external linkage, but symbol has internal linkage in dynamic library 'vtable for Bar'
+CHECK-NEXT: class Bar : Foo {
+CHECK-NEXT:       ^
+CHECK-NEXT: CPP.h:5:7: error: declaration has external linkage, but symbol has internal linkage in dynamic library 'typeinfo for Bar'
+CHECK-NEXT: CPP.h:5:7: error: declaration has external linkage, but symbol has internal linkage in dynamic library 'typeinfo name for Bar'
+CHECK-NEXT: CPP.h:6:7: error: dynamic library symbol '(weak-def) Bar::init()' is weak defined, but its declaration is not
+CHECK-NEXT:   int init();
+CHECK-NEXT:       ^
+
+//--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/System/Library/Frameworks/MismatchCpp.framework/Headers/CPP.h",
+    "type" : "public"
+  }
+  ],
+  "version": "3"
+}
+
+//--- System/Library/Frameworks/MismatchCpp.framework/Headers/CPP.h
+class Foo {
+  virtual int init() = 0;
+};
+
+class Bar : Foo { 
+  int init();
+};
+
+/// Created from: 
+// With LD flags: -exported_symbol,"__ZN3Bar4initEv" -exported_symbol,"__Z3fooIjEiT_"
+// class Foo { virtual int init() = 0;};
+// 
+// class Bar : Foo {int init() { return 1;}};
+// Bar bar;
+// 
+// template <typename T> int foo(T val) { return 1; }
+// template <> int foo(unsigned val) { return 1; }
+
+//--- Mismatch.yaml
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x6
+  ncmds:           15
+  sizeofcmds:      1224
+  flags:           0x118085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         312
+    segname:         __TEXT
+    vmaddr:          0
+    vmsize:          16384
+    fileoff:         0
+    filesize:        16384
+    maxprot:         5
+    initprot:        5
+    nsects:          3
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x10E8
+        size:            16
+        offset:          0x10E8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         20008052C0035FD620008052C0035FD6
+      - sectname:        __const
+        segname:         __TEXT
+        addr:            0x10F8
+        size:            10
+        offset:          0x10F8
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         334261720033466F6F00
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x1104
+        size:            4152
+        offset:          0x1104
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         010000001C000000010000002000000000000000200000000200000000000002E81000003800000038000000F81000000000000038000000030000000C0001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA_CONST
+    vmaddr:          16384
+    vmsize:          16384
+    fileoff:         16384
+    filesize:        16384
+    maxprot:         3
+    initprot:        3
+    nsects:          2
+    flags:           16
+    Sections:
+      - sectname:        __const
+        segname:         __DATA_CONST
+        addr:            0x4000
+        size:            80
+        offset:          0x4000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00000000000000002840000000000000F0100000000000001000000000000000FD100000000000801000000000000000F810000000000080000000000100000018400000000000000000000000000000
+      - sectname:        __objc_imageinfo
+        segname:         __DATA_CONST
+        addr:            0x4050
+        size:            8
+        offset:          0x4050
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000040000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA
+    vmaddr:          32768
+    vmsize:          16384
+    fileoff:         32768
+    filesize:        16384
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __data
+        segname:         __DATA
+        addr:            0x8000
+        size:            8
+        offset:          0x8000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '1040000000000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          49152
+    vmsize:          1104
+    fileoff:         49152
+    filesize:        1104
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_INFO_ONLY
+    cmdsize:         48
+    rebase_off:      49152
+    rebase_size:     16
+    bind_off:        49168
+    bind_size:       96
+    weak_bind_off:   49264
+    weak_bind_size:  24
+    lazy_bind_off:   0
+    lazy_bind_size:  0
+    export_off:      49288
+    export_size:     48
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          49344
+    nsyms:           11
+    stroff:          49520
+    strsize:         192
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       6
+    iextdefsym:      6
+    nextdefsym:      2
+    iundefsym:       8
+    nundefsym:       3
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ID_DYLIB
+    cmdsize:         96
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 65536
+      compatibility_version: 65536
+    Content:         '/System/Library/Frameworks/MismatchCpp.framework/Versions/A/MismatchCpp'
+    ZeroPadBytes:    1
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C44F3-5555-3144-A13F-B3FE15787197
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           851968
+    sdk:             983040
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1245184
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         48
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 117985024
+      compatibility_version: 65536
+    Content:         '/usr/lib/libc++.1.dylib'
+    ZeroPadBytes:    1
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 88473600
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         49336
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         49344
+    datasize:        0
+  - cmd:             LC_CODE_SIGNATURE
+    cmdsize:         16
+    dataoff:         49712
+    datasize:        544
+LinkEditData:
+  RebaseOpcodes:
+    - Opcode:          REBASE_OPCODE_SET_TYPE_IMM
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             1
+      ExtraData:       [ 0x8 ]
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             2
+    - Opcode:          REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB
+      Imm:             0
+      ExtraData:       [ 0x3, 0x8 ]
+    - Opcode:          REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             2
+      ExtraData:       [ 0x0 ]
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_DONE
+      Imm:             0
+  BindOpcodes:
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          __ZTVN10__cxxabiv117__class_type_infoE
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             1
+      ULEBExtraData:   [ 0x18 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_ADDEND_SLEB
+      Imm:             0
+      SLEBExtraData:   [ 16 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          __ZTVN10__cxxabiv121__vmi_class_type_infoE
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_ADD_ADDR_ULEB
+      Imm:             0
+      ULEBExtraData:   [ 0x8 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+  WeakBindOpcodes:
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          __ZN3Bar4initEv
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             1
+      ULEBExtraData:   [ 0x10 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    0
+        NodeOffset:      7
+        Name:            __Z
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      35
+            Name:            3fooIjEiT_
+            Flags:           0x0
+            Address:         0x10E8
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    3
+            NodeOffset:      40
+            Name:            N3Bar4initEv
+            Flags:           0x4
+            Address:         0x10F0
+            Other:           0x0
+            ImportName:      ''
+  NameList:
+    - n_strx:          32
+      n_type:          0x1E
+      n_sect:          4
+      n_desc:          0
+      n_value:         16384
+    - n_strx:          42
+      n_type:          0x1E
+      n_sect:          4
+      n_desc:          0
+      n_value:         16408
+    - n_strx:          52
+      n_type:          0x1E
+      n_sect:          4
+      n_desc:          0
+      n_value:         16424
+    - n_strx:          62
+      n_type:          0x1E
+      n_sect:          6
+      n_desc:          0
+      n_value:         32768
+    - n_strx:          67
+      n_type:          0x1E
+      n_sect:          2
+      n_desc:          0
+      n_value:         4344
+    - n_strx:          77
+      n_type:          0x1E
+      n_sect:          2
+      n_desc:          0
+      n_value:         4349
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4328
+    - n_strx:          16
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          128
+      n_value:         4336
+    - n_strx:          87
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          126
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          169
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          512
+      n_value:         0
+  StringTable:
+    - ' '
+    - __Z3fooIjEiT_
+    - __ZN3Bar4initEv
+    - __ZTV3Bar
+    - __ZTI3Foo
+    - __ZTI3Bar
+    - _bar
+    - __ZTS3Bar
+    - __ZTS3Foo
+    - __ZTVN10__cxxabiv117__class_type_infoE
+    - __ZTVN10__cxxabiv121__vmi_class_type_infoE
+    - dyld_stub_binder
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+  FunctionStarts:  [ 0x10E8, 0x10F0 ]
+...
diff --git a/clang/test/InstallAPI/driver-invalid-options.test b/clang/test/InstallAPI/driver-invalid-options.test
index a2e008e1eb03e..69f3b2d66ab8b 100644
--- a/clang/test/InstallAPI/driver-invalid-options.test
+++ b/clang/test/InstallAPI/driver-invalid-options.test
@@ -1,4 +1,9 @@
 /// Check non-darwin triple is rejected.
-// RUN: not clang-installapi -target x86_64-unknown-unknown %s 2> %t 
+// RUN: not clang-installapi -target x86_64-unknown-unknown %s -o tmp.tbd 2> %t 
 // RUN: FileCheck --check-prefix INVALID_INSTALLAPI -input-file %t %s
 // INVALID_INSTALLAPI: error: unsupported option 'installapi' for target 'x86_64-unknown-unknown'
+
+/// Check that missing install_name is reported.
+// RUN: not clang-installapi -target x86_64-apple-ios-simulator  %s -o tmp.tbd 2> %t 
+// RUN: FileCheck --check-prefix INVALID_INSTALL_NAME -input-file %t %s
+// INVALID_INSTALL_NAME: error: no install name specified: add -install_name <path>
diff --git a/clang/test/InstallAPI/functions.test b/clang/test/InstallAPI/functions.test
index 527965303cb35..5b5fd1308842e 100644
--- a/clang/test/InstallAPI/functions.test
+++ b/clang/test/InstallAPI/functions.test
@@ -4,7 +4,7 @@
 
 // RUN: clang-installapi -target arm64-apple-macos13.1 \
 // RUN: -I%t/usr/include -I%t/usr/local/include \
-// RUN: -install_name @rpath/lib/libfunctions.dylib \
+// RUN: -install_name @rpath/lib/libfunctions.dylib --filetype=tbd-v4 \
 // RUN: %t/inputs.json -o %t/outputs.tbd 2>&1 | FileCheck %s --allow-empty
 // RUN: llvm-readtapi -compare %t/outputs.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
 
diff --git a/clang/test/InstallAPI/hiddens.test b/clang/test/InstallAPI/hiddens.test
new file mode 100644
index 0000000000000..b3196ef945cd1
--- /dev/null
+++ b/clang/test/InstallAPI/hiddens.test
@@ -0,0 +1,262 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+// RUN: yaml2obj %t/Hidden.yaml -o %t/System/Library/Frameworks/Hidden.framework/Hidden
+
+// RUN: clang-installapi --target=x86_64-apple-macos13 -x objective-c \
+// RUN: -F %t/System/Library/Frameworks \
+// RUN: -install_name /System/Library/Frameworks/Hidden.framework/Versions/A/Hidden\
+// RUN: -current_version 1 -compatibility_version 1 %t/inputs.json \
+// RUN: --verify-against=%t/System/Library/Frameworks/Hidden.framework/Hidden \
+// RUN: --verify-mode=Pedantic -o %t/output.tbd 2>&1 | FileCheck %s 
+
+// CHECK-NOT: error
+// CHECK:  warning: use of __private_extern__ 
+
+// RUN: llvm-readtapi --compare %t/output.tbd %t/expected.tbd 
+
+//--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/System/Library/Frameworks/Hidden.framework/Headers/Hidden.h",
+    "type" : "public"
+  }
+  ],
+  "version": "3"
+}
+
+//--- System/Library/Frameworks/Hidden.framework/Headers/Hidden.h
+__private_extern__ int foo(); // Clang doesn't warn on this, but should.
+__private_extern__ int baz;
+__attribute__((visibility("hidden"))) int bar();
+
+/// Created from: 
+/// #import "Hidden.h" int foo(void) { return 1; } int bar(void) { return 1; }
+//--- Hidden.yaml
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x1000007
+  cpusubtype:      0x3
+  filetype:        0x6
+  ncmds:           12
+  sizeofcmds:      920
+  flags:           0x100085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         312
+    segname:         __TEXT
+    vmaddr:          0
+    vmsize:          8192
+    fileoff:         0
+    filesize:        8192
+    maxprot:         5
+    initprot:        5
+    nsects:          3
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0xBB8
+        size:            22
+        offset:          0xBB8
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         554889E5B8010000005DC3554889E5B8010000005DC3
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0xBD0
+        size:            4152
+        offset:          0xBD0
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         010000001C000000010000002000000000000000200000000200000000000001B80B00003800000038000000CE0B00000000000038000000030000000C0001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+      - sectname:        __eh_frame
+        segname:         __TEXT
+        addr:            0x1C08
+        size:            24
+        offset:          0x1C08
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x6000000B
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         1400000000000000017A520001781001100C070890010000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA_CONST
+    vmaddr:          8192
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        4096
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           16
+    Sections:
+      - sectname:        __objc_imageinfo
+        segname:         __DATA_CONST
+        addr:            0x2000
+        size:            8
+        offset:          0x2000
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000040000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          12288
+    vmsize:          104
+    fileoff:         12288
+    filesize:        104
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_INFO_ONLY
+    cmdsize:         48
+    rebase_off:      0
+    rebase_size:     0
+    bind_off:        0
+    bind_size:       0
+    weak_bind_off:   0
+    weak_bind_size:  0
+    lazy_bind_off:   0
+    lazy_bind_size:  0
+    export_off:      12288
+    export_size:     16
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          12312
+    nsyms:           3
+    stroff:          12360
+    strsize:         32
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       1
+    iextdefsym:      1
+    nextdefsym:      1
+    iundefsym:       2
+    nundefsym:       1
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ID_DYLIB
+    cmdsize:         88
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 65536
+      compatibility_version: 65536
+    Content:         '/System/Library/Frameworks/Hidden.framework/Versions/A/Hidden'
+    ZeroPadBytes:    3
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C44E7-5555-3144-A133-0271E799C487
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           851968
+    sdk:             983040
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1245184
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 88473600
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         12304
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         12312
+    datasize:        0
+LinkEditData:
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    3
+        NodeOffset:      8
+        Name:            _foo
+        Flags:           0x0
+        Address:         0xBB8
+        Other:           0x0
+        ImportName:      ''
+  NameList:
+    - n_strx:          7
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         3011
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         3000
+    - n_strx:          12
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+  StringTable:
+    - ' '
+    - _foo
+    - _bar
+    - dyld_stub_binder
+    - ''
+    - ''
+    - ''
+  FunctionStarts:  [ 0xBB8, 0xBC3 ]
+...
+
+//--- expected.tbd
+--- !tapi-tbd
+tbd-version:     4
+targets:         [ x86_64-macos ]
+flags:           [ not_app_extension_safe ]
+install-name:    '/System/Library/Frameworks/Hidden.framework/Versions/A/Hidden'
+...
+
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index b65a8fb057ee5..9c91c4157cd6a 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -5,11 +5,11 @@
 
 // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64
 // AARCH64: error: unknown target CPU 'not-a-cpu'
-// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
+// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
 
 // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64
 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu'
-// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
+// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
 
 // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86
 // X86: error: unknown target CPU 'not-a-cpu'
diff --git a/clang/test/Modules/bounds-safety-attributed-type.c b/clang/test/Modules/bounds-safety-attributed-type.c
new file mode 100644
index 0000000000000..0aa2f16b321c5
--- /dev/null
+++ b/clang/test/Modules/bounds-safety-attributed-type.c
@@ -0,0 +1,27 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -verify %s
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -ast-dump-all %s | FileCheck %s
+// expected-no-diagnostics
+
+#pragma clang module build bounds_safety
+module bounds_safety {}
+#pragma clang module contents
+#pragma clang module begin bounds_safety
+struct Test {
+  int count;
+  int fam[] __attribute__((counted_by(count)));
+};
+#pragma clang module end
+#pragma clang module endbuild
+
+#pragma clang module import bounds_safety
+
+struct Test *p;
+
+// CHECK: |-RecordDecl {{.*}}bounds_safety.map:4:1, line:7:1> line:4:8 imported in bounds_safety <undeserialized declarations> struct Test definition
+// CHECK: | |-FieldDecl {{.*}} imported in bounds_safety referenced count 'int'
+// CHECK: | `-FieldDecl {{.*}} imported in bounds_safety fam 'int[] __counted_by(count)':'int[]'
+
+// CHECK: |-ImportDecl {{.*}}bounds-safety-attributed-type.c:17:22> col:22 implicit bounds_safety
+// CHECK: |-RecordDecl {{.*}} struct Test
+// CHECK: `-VarDecl {{.*}} p 'struct Test *'
diff --git a/clang/test/Modules/no-undeclared-includes-builtins.cpp b/clang/test/Modules/no-undeclared-includes-builtins.cpp
index c9bffc5561990..f9eefd24a33c7 100644
--- a/clang/test/Modules/no-undeclared-includes-builtins.cpp
+++ b/clang/test/Modules/no-undeclared-includes-builtins.cpp
@@ -8,7 +8,7 @@
 // headers.
 
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -I %S/Inputs/no-undeclared-includes-builtins/libcxx -I %S/Inputs/no-undeclared-includes-builtins/glibc %s
+// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fbuiltin-headers-in-system-modules -fimplicit-module-maps -I %S/Inputs/no-undeclared-includes-builtins/libcxx -I %S/Inputs/no-undeclared-includes-builtins/glibc %s
 // expected-no-diagnostics
 
 #include <stddef.h>
diff --git a/clang/test/Modules/reduced-bmi-generating-codes.cppm b/clang/test/Modules/reduced-bmi-generating-codes.cppm
new file mode 100644
index 0000000000000..13dcda06437b2
--- /dev/null
+++ b/clang/test/Modules/reduced-bmi-generating-codes.cppm
@@ -0,0 +1,40 @@
+// Although the reduced BMI are not designed to be generated,
+// it is helpful for testing whether we've reduced the definitions.
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.cppm \
+// RUN:     -emit-reduced-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.cpp \
+// RUN:     -fmodule-file=a=%t/a.pcm -S -emit-llvm -o - \
+// RUN:     | FileCheck %t/b.cpp
+
+//--- a.cppm
+export module a;
+
+export template <class T>
+class A {
+public:
+    int member() {
+        return 43;
+    }
+};
+
+// Instantiate `A<int>::member()`.
+export int a_member = A<int>().member();
+
+export const int a = 43;
+
+//--- b.cpp
+import a;
+
+static_assert(a == 43);
+
+int b() {
+    A<int> a;
+    return a.member();
+}
+
+// CHECK: define{{.*}}@_ZNW1a1AIiE6memberEv
diff --git a/clang/test/Modules/stddef.c b/clang/test/Modules/stddef.c
index 5bc0d1e44c856..7623982614681 100644
--- a/clang/test/Modules/stddef.c
+++ b/clang/test/Modules/stddef.c
@@ -1,29 +1,33 @@
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fbuiltin-headers-in-system-modules -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify -fno-modules-error-recovery
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fbuiltin-headers-in-system-modules -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify=builtin-headers-in-system-modules -fno-modules-error-recovery
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify -fno-modules-error-recovery
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/StdDef %s -verify=no-builtin-headers-in-system-modules -fno-modules-error-recovery
 
 #include "ptrdiff_t.h"
 
 ptrdiff_t pdt;
 
-// size_t is declared in both size_t.h and __stddef_size_t.h, both of which are
-// modular headers. Regardless of whether stddef.h joins the StdDef test module
-// or is in its _Builtin_stddef module, __stddef_size_t.h will be in
-// _Builtin_stddef.size_t. It's not defined which module will win as the expected
-// provider of size_t. For the purposes of this test it doesn't matter which header
-// gets reported, just as long as it isn't other.h or include_again.h.
-size_t st; // expected-error-re {{missing '#include "{{size_t|__stddef_size_t}}.h"'; 'size_t' must be declared before it is used}}
-// expected-note@size_t.h:* 0+ {{here}}
-// expected-note@__stddef_size_t.h:* 0+ {{here}}
+// size_t is declared in both size_t.h and __stddef_size_t.h. If
+// -fbuiltin-headers-in-system-modules is set, then __stddef_size_t.h is a
+// non-modular header that will be transitively pulled in the StdDef test module
+// by include_again.h. Otherwise it will be in the _Builtin_stddef module. In
+// any case it's not defined which module will win as the expected provider of
+// size_t. For the purposes of this test it doesn't matter which of the two
+// providing headers get reported.
+size_t st; // builtin-headers-in-system-modules-error-re {{missing '#include "{{size_t|include_again}}.h"'; 'size_t' must be declared before it is used}} \
+              no-builtin-headers-in-system-modules-error-re {{missing '#include "{{size_t|__stddef_size_t}}.h"'; 'size_t' must be declared before it is used}}
+// builtin-headers-in-system-modules-note@size_t.h:* 0+ {{here}} \
+   no-builtin-headers-in-system-modules-note@size_t.h:* 0+ {{here}}
+// builtin-headers-in-system-modules-note@__stddef_size_t.h:* 0+ {{here}} \
+   no-builtin-headers-in-system-modules-note@__stddef_size_t.h:* 0+ {{here}}
 
 #include "include_again.h"
-// Includes <stddef.h> which includes <__stddef_size_t.h> which imports the
-// _Builtin_stddef.size_t module.
+// Includes <stddef.h> which includes <__stddef_size_t.h>.
 
 size_t st2;
 
 #include "size_t.h"
-// Redeclares size_t, but the type merger should figure it out.
+// Redeclares size_t when -fbuiltin-headers-in-system-modules is not passed, but
+// the type merger should figure it out.
 
 size_t st3;
diff --git a/clang/test/OpenMP/bug60602.cpp b/clang/test/OpenMP/bug60602.cpp
index 48dc341a08224..9c92ec27b86ee 100644
--- a/clang/test/OpenMP/bug60602.cpp
+++ b/clang/test/OpenMP/bug60602.cpp
@@ -95,7 +95,7 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) {
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -181,7 +181,7 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) {
 // CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP71]], 1
 // CHECK-NEXT:    [[TMP72:%.*]] = zext i32 [[ADD]] to i64
 // CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP73]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP73]], align 4
 // CHECK-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK-NEXT:    store i32 3, ptr [[TMP74]], align 4
 // CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_codegen.cpp b/clang/test/OpenMP/distribute_codegen.cpp
index 31ec6ff911905..34d14c89fedae 100644
--- a/clang/test/OpenMP/distribute_codegen.cpp
+++ b/clang/test/OpenMP/distribute_codegen.cpp
@@ -163,7 +163,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -354,7 +354,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -545,7 +545,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -744,7 +744,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
 // CHECK1-NEXT:    [[TMP10:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -911,7 +911,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1084,7 +1084,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1271,7 +1271,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1458,7 +1458,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1653,7 +1653,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
 // CHECK3-NEXT:    [[TMP10:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1820,7 +1820,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
index 800a002e43968..567d12119db21 100644
--- a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
@@ -542,7 +542,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -838,7 +838,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1207,7 +1207,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1501,7 +1501,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
index 772372076e947..5d1a12d27145e 100644
--- a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
@@ -527,7 +527,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -841,7 +841,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1229,7 +1229,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1541,7 +1541,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_codegen.cpp
index 9cb0a15530651..a8892a06d4b30 100644
--- a/clang/test/OpenMP/distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_codegen.cpp
@@ -4372,7 +4372,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4447,7 +4447,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK9-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK9-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK9-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK9-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4531,7 +4531,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK9-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK9-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK9-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK9-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -4606,7 +4606,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK9-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK9-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK9-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK9-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -4690,7 +4690,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK9-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK9-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK9-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK9-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -4765,7 +4765,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK9-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK9-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK9-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK9-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -4849,7 +4849,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK9-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK9-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK9-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK9-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -6671,7 +6671,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6746,7 +6746,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK9-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK9-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK9-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK9-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -6830,7 +6830,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK9-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK9-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK9-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK9-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -6905,7 +6905,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK9-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK9-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK9-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK9-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -6989,7 +6989,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK9-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK9-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK9-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK9-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -7064,7 +7064,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK9-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK9-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK9-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK9-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -7148,7 +7148,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK9-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK9-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK9-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK9-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -8986,7 +8986,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9061,7 +9061,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK11-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK11-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK11-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -9145,7 +9145,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK11-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK11-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK11-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK11-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -9220,7 +9220,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK11-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK11-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK11-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -9304,7 +9304,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK11-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK11-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK11-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK11-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -9379,7 +9379,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK11-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK11-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK11-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK11-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -9463,7 +9463,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK11-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK11-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK11-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK11-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -11234,7 +11234,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -11309,7 +11309,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK11-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK11-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK11-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -11393,7 +11393,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK11-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK11-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK11-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK11-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -11468,7 +11468,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK11-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK11-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK11-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -11552,7 +11552,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK11-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK11-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK11-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK11-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -11627,7 +11627,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK11-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK11-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK11-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK11-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -11711,7 +11711,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK11-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK11-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK11-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK11-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
index 6084a9a6cf931..faae599bb32af 100644
--- a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
@@ -825,7 +825,7 @@ int main() {
 // CHECK8-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK8-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK8-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK8-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK8-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK8-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1251,7 +1251,7 @@ int main() {
 // CHECK8-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK8-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK8-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK8-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK8-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1744,7 +1744,7 @@ int main() {
 // CHECK10-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK10-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK10-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK10-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK10-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK10-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2164,7 +2164,7 @@ int main() {
 // CHECK10-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK10-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK10-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK10-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK10-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK10-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp
index d3b6654e57e2c..4b3911daefbad 100644
--- a/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp
@@ -128,7 +128,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -161,7 +161,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -492,7 +492,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -525,7 +525,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -569,7 +569,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1060,7 +1060,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1093,7 +1093,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1137,7 +1137,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
index 1f069c4070aec..18a86c5e56663 100644
--- a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
@@ -801,7 +801,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1263,7 +1263,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1793,7 +1793,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2249,7 +2249,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
index b6ae783b74d04..dfe79d6a94934 100644
--- a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
@@ -116,7 +116,7 @@ int main() {
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -169,7 +169,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -565,7 +565,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -598,7 +598,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -641,7 +641,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -674,7 +674,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1368,7 +1368,7 @@ int main() {
 // CHECK5:       invoke.cont:
 // CHECK5-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1421,7 +1421,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1817,7 +1817,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1850,7 +1850,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1893,7 +1893,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1926,7 +1926,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2620,7 +2620,7 @@ int main() {
 // CHECK9:       invoke.cont:
 // CHECK9-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2673,7 +2673,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3069,7 +3069,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3102,7 +3102,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3145,7 +3145,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3178,7 +3178,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3872,7 +3872,7 @@ int main() {
 // CHECK13:       invoke.cont:
 // CHECK13-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3925,7 +3925,7 @@ int main() {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4321,7 +4321,7 @@ int main() {
 // CHECK13-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK13-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4354,7 +4354,7 @@ int main() {
 // CHECK13-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK13:       omp_offload.cont:
 // CHECK13-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK13-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK13-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4397,7 +4397,7 @@ int main() {
 // CHECK13-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK13-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4430,7 +4430,7 @@ int main() {
 // CHECK13-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK13:       omp_offload.cont:
 // CHECK13-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK13-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK13-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
index e2b0d64093ebc..0969a0ca21e7a 100644
--- a/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
@@ -521,7 +521,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -842,7 +842,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1232,7 +1232,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1547,7 +1547,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
index 040f90b9ef78b..c1203349b7c1b 100644
--- a/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
@@ -63,7 +63,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -96,7 +96,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -414,7 +414,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp
index 77336877e2be0..a23d538910edd 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp
@@ -4762,7 +4762,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4837,7 +4837,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK9-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK9-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK9-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK9-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4921,7 +4921,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK9-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK9-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK9-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK9-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -4996,7 +4996,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK9-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK9-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK9-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK9-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -5080,7 +5080,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK9-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK9-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK9-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK9-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -5155,7 +5155,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK9-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK9-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK9-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK9-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -5239,7 +5239,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK9-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK9-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK9-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK9-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -7229,7 +7229,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7304,7 +7304,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK9-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK9-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK9-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK9-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -7388,7 +7388,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK9-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK9-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK9-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK9-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -7463,7 +7463,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK9-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK9-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK9-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK9-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -7547,7 +7547,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK9-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK9-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK9-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK9-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -7622,7 +7622,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK9-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK9-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK9-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK9-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -7706,7 +7706,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK9-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK9-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK9-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK9-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -9697,7 +9697,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9772,7 +9772,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK11-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK11-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK11-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -9856,7 +9856,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK11-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK11-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK11-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK11-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -9931,7 +9931,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK11-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK11-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK11-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -10015,7 +10015,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK11-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK11-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK11-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK11-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -10090,7 +10090,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK11-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK11-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK11-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK11-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -10174,7 +10174,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK11-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK11-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK11-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK11-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -12113,7 +12113,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12188,7 +12188,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK11-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK11-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK11-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -12272,7 +12272,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK11-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK11-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK11-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK11-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -12347,7 +12347,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK11-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK11-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK11-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -12431,7 +12431,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK11-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK11-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK11-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK11-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -12506,7 +12506,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK11-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK11-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK11-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK11-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -12590,7 +12590,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK11-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK11-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK11-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK11-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
index f47b64b1e299d..545ea9f0f43fd 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -888,7 +888,7 @@ int main() {
 // CHECK8-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK8-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK8-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK8-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK8-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK8-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1328,7 +1328,7 @@ int main() {
 // CHECK8-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK8-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK8-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK8-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK8-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1835,7 +1835,7 @@ int main() {
 // CHECK10-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK10-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK10-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK10-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK10-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK10-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2269,7 +2269,7 @@ int main() {
 // CHECK10-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK10-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK10-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK10-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK10-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK10-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp
index b3e964fddcf0d..4a321b24b8c31 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp
@@ -125,7 +125,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -158,7 +158,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -517,7 +517,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -550,7 +550,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -594,7 +594,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1127,7 +1127,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1160,7 +1160,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1204,7 +1204,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1727,7 +1727,7 @@ int main() {
 // CHECK3-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1760,7 +1760,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2119,7 +2119,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2152,7 +2152,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2196,7 +2196,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -2964,7 +2964,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2997,7 +2997,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3041,7 +3041,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -4140,7 +4140,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4173,7 +4173,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4532,7 +4532,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4565,7 +4565,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4609,7 +4609,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK9-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -5142,7 +5142,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5175,7 +5175,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -5219,7 +5219,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK9-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -5742,7 +5742,7 @@ int main() {
 // CHECK11-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK11-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5775,7 +5775,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6134,7 +6134,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6167,7 +6167,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6211,7 +6211,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK11-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK11-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -6979,7 +6979,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7012,7 +7012,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -7056,7 +7056,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK11-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK11-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
index 7656fb7bc2c51..675f1313460e6 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -868,7 +868,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1344,7 +1344,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1888,7 +1888,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2358,7 +2358,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
index 9f4fffbea63d1..5c1ca41b1c62d 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
@@ -116,7 +116,7 @@ int main() {
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -169,7 +169,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -593,7 +593,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -626,7 +626,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -669,7 +669,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -702,7 +702,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1791,7 +1791,7 @@ int main() {
 // CHECK5:       invoke.cont:
 // CHECK5-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1844,7 +1844,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2268,7 +2268,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2301,7 +2301,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2344,7 +2344,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2377,7 +2377,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3127,7 +3127,7 @@ int main() {
 // CHECK9:       invoke.cont:
 // CHECK9-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3180,7 +3180,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3604,7 +3604,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3637,7 +3637,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3680,7 +3680,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3713,7 +3713,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4802,7 +4802,7 @@ int main() {
 // CHECK13:       invoke.cont:
 // CHECK13-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4855,7 +4855,7 @@ int main() {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -5279,7 +5279,7 @@ int main() {
 // CHECK13-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK13-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5312,7 +5312,7 @@ int main() {
 // CHECK13-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK13:       omp_offload.cont:
 // CHECK13-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK13-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK13-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -5355,7 +5355,7 @@ int main() {
 // CHECK13-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK13-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5388,7 +5388,7 @@ int main() {
 // CHECK13-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK13:       omp_offload.cont:
 // CHECK13-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK13-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK13-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp
index 61f35d8b456ce..a74af2d7c3932 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp
@@ -575,7 +575,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -910,7 +910,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1314,7 +1314,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1643,7 +1643,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp
index 334965f59a826..777d4a25ea9f3 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -63,7 +63,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -96,7 +96,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -442,7 +442,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_private_codegen.cpp b/clang/test/OpenMP/distribute_private_codegen.cpp
index 8a47b15a24d24..b6e796fb8027c 100644
--- a/clang/test/OpenMP/distribute_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_private_codegen.cpp
@@ -351,7 +351,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -384,7 +384,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 2
@@ -668,7 +668,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -957,7 +957,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -990,7 +990,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 2
@@ -1272,7 +1272,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp
index 16d909e329514..9a192502125f9 100644
--- a/clang/test/OpenMP/distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_codegen.cpp
@@ -191,7 +191,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -391,7 +391,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -589,7 +589,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -806,7 +806,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK1-NEXT:    [[TMP15:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -996,7 +996,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1176,7 +1176,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1372,7 +1372,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1566,7 +1566,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1779,7 +1779,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK3-NEXT:    [[TMP15:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1969,7 +1969,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2149,7 +2149,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2349,7 +2349,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2547,7 +2547,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2764,7 +2764,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK5-NEXT:    [[TMP15:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 2, ptr [[TMP17]], align 4
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2985,7 +2985,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3165,7 +3165,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3361,7 +3361,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3555,7 +3555,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3768,7 +3768,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK7-NEXT:    [[TMP15:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 2, ptr [[TMP17]], align 4
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3989,7 +3989,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK7-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK7-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
index e388393c37f18..1b5950ce11d7f 100644
--- a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
@@ -595,7 +595,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -898,7 +898,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1274,7 +1274,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1575,7 +1575,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
index 5720c3aaaaff5..92b73b220bcc2 100644
--- a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
@@ -582,7 +582,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -903,7 +903,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1298,7 +1298,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1617,7 +1617,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_private_codegen.cpp b/clang/test/OpenMP/distribute_simd_private_codegen.cpp
index eac5bce7d5da8..93b2bd8b12a8e 100644
--- a/clang/test/OpenMP/distribute_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_private_codegen.cpp
@@ -396,7 +396,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -440,7 +440,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 2
@@ -742,7 +742,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1042,7 +1042,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1086,7 +1086,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 2
@@ -1386,7 +1386,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp b/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp
index 31d276e984eee..b6cc1adfc76be 100644
--- a/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp
@@ -105,7 +105,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -255,7 +255,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -402,7 +402,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -552,7 +552,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/map_struct_ordering.cpp b/clang/test/OpenMP/map_struct_ordering.cpp
index dd0bb8e9d6e7c..e422010f3dcdf 100644
--- a/clang/test/OpenMP/map_struct_ordering.cpp
+++ b/clang/test/OpenMP/map_struct_ordering.cpp
@@ -97,7 +97,7 @@ int map_struct() {
 // CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS5]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP24]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP25]], align 4
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
index 5d7da793e7326..86035e17ef3b6 100644
--- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp
+++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
@@ -191,7 +191,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [11 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [11 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP66]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 11, ptr [[TMP67]], align 4
 // CHECK1-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -317,7 +317,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [11 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [11 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP138]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK1-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 11, ptr [[TMP139]], align 4
 // CHECK1-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -541,7 +541,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -599,7 +599,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP46]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP47]], align 4
 // CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -744,7 +744,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP13]], align 4
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp
index 7305c56289e01..0f67cdc56ddcf 100644
--- a/clang/test/OpenMP/reduction_implicit_map.cpp
+++ b/clang/test/OpenMP/reduction_implicit_map.cpp
@@ -318,7 +318,7 @@ int main()
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -359,7 +359,7 @@ int main()
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -890,7 +890,7 @@ int main()
 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK2-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -971,7 +971,7 @@ int main()
 // CHECK2-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP69]], 1
 // CHECK2-NEXT:    [[TMP70:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK2-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP72]], align 4
 // CHECK2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -1022,7 +1022,7 @@ int main()
 // CHECK2-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP96]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP96]], align 4
 // CHECK2-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 2, ptr [[TMP97]], align 4
 // CHECK2-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -1073,7 +1073,7 @@ int main()
 // CHECK2-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS31]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS32]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS34]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP121]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP121]], align 4
 // CHECK2-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS34]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 2, ptr [[TMP122]], align 4
 // CHECK2-NEXT:    [[TMP123:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS34]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_codegen_global_capture.cpp b/clang/test/OpenMP/target_codegen_global_capture.cpp
index 0fb52c3fa6c55..d2c800a56a104 100644
--- a/clang/test/OpenMP/target_codegen_global_capture.cpp
+++ b/clang/test/OpenMP/target_codegen_global_capture.cpp
@@ -288,7 +288,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP51]], align 4
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -615,7 +615,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP54]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP54]], align 4
 // CHECK1-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP55]], align 4
 // CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -938,7 +938,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP54]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP54]], align 4
 // CHECK1-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP55]], align 4
 // CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1184,7 +1184,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP44]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP44]], align 4
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1511,7 +1511,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP49]], align 4
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1834,7 +1834,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP49]], align 4
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_firstprivate_codegen.cpp b/clang/test/OpenMP/target_firstprivate_codegen.cpp
index 6314940730470..02de48e6aa395 100644
--- a/clang/test/OpenMP/target_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_firstprivate_codegen.cpp
@@ -6159,7 +6159,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP24]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK0-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK0-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6260,7 +6260,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP76]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP76]], align 4
 // CHECK0-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 9, ptr [[TMP77]], align 4
 // CHECK0-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -6308,7 +6308,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK0-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 2, ptr [[TMP101]], align 4
 // CHECK0-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 2
@@ -6564,7 +6564,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK0-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 5, ptr [[TMP29]], align 4
 // CHECK0-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6651,7 +6651,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK0-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK0-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6718,7 +6718,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK0-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK0-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6930,7 +6930,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP24]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7031,7 +7031,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP76]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP76]], align 4
 // CHECK1-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP77]], align 4
 // CHECK1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -7079,7 +7079,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK1-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP101]], align 4
 // CHECK1-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 2
@@ -7335,7 +7335,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7422,7 +7422,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7489,7 +7489,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7699,7 +7699,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7802,7 +7802,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP76]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP76]], align 4
 // CHECK2-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 9, ptr [[TMP77]], align 4
 // CHECK2-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -7850,7 +7850,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK2-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 2, ptr [[TMP101]], align 4
 // CHECK2-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 2
@@ -8106,7 +8106,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 5, ptr [[TMP29]], align 4
 // CHECK2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8193,7 +8193,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8260,7 +8260,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8470,7 +8470,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8573,7 +8573,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP76]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP76]], align 4
 // CHECK3-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP77]], align 4
 // CHECK3-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -8621,7 +8621,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK3-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP101]], align 4
 // CHECK3-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 2
@@ -8877,7 +8877,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8964,7 +8964,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9031,7 +9031,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_has_device_addr_codegen.cpp b/clang/test/OpenMP/target_has_device_addr_codegen.cpp
index e6a0e7bb38d64..ba1b618ed8bdd 100644
--- a/clang/test/OpenMP/target_has_device_addr_codegen.cpp
+++ b/clang/test/OpenMP/target_has_device_addr_codegen.cpp
@@ -328,7 +328,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -372,7 +372,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP29]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -413,7 +413,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -457,7 +457,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS14]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -498,7 +498,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP90]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP90]], align 4
 // CHECK-NEXT:    [[TMP91:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP91]], align 4
 // CHECK-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -539,7 +539,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS26]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP109:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS27]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP110]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP110]], align 4
 // CHECK-NEXT:    [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP111]], align 4
 // CHECK-NEXT:    [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 2
@@ -705,7 +705,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -749,7 +749,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP29]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -790,7 +790,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -831,7 +831,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP68]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP68]], align 4
 // CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP69]], align 4
 // CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -911,7 +911,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -955,7 +955,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP29]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -996,7 +996,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -1037,7 +1037,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP68]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP68]], align 4
 // CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP69]], align 4
 // CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -1239,7 +1239,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
 // CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp b/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp
index efd0a1ee88fe2..6bbabdf51f6e7 100644
--- a/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp
+++ b/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp
@@ -102,7 +102,7 @@ int main() {
 // CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [6 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [6 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 6, ptr [[TMP27]], align 4
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -211,7 +211,7 @@ int main() {
 // CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp
index 162f18529a489..190032023aed3 100644
--- a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp
+++ b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp
@@ -1827,7 +1827,7 @@ void bar() {
 // CK10-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK10-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK10-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK10-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK10-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1869,7 +1869,7 @@ void bar() {
 // CK10-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CK10-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CK10-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CK10-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CK10-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -1911,7 +1911,7 @@ void bar() {
 // CK10-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CK10-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CK10-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CK10-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CK10-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -1956,7 +1956,7 @@ void bar() {
 // CK10-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CK10-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CK10-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK10-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CK10-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -2001,7 +2001,7 @@ void bar() {
 // CK10-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CK10-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CK10-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP94]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CK10-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP95]], align 4
 // CK10-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2046,7 +2046,7 @@ void bar() {
 // CK10-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CK10-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CK10-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP117]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP117]], align 4
 // CK10-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP118]], align 4
 // CK10-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2101,7 +2101,7 @@ void bar() {
 // CK10-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CK10-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CK10-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP146]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP146]], align 4
 // CK10-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CK10-NEXT:    store i32 2, ptr [[TMP147]], align 4
 // CK10-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -2298,7 +2298,7 @@ void bar() {
 // CK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK11-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2340,7 +2340,7 @@ void bar() {
 // CK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CK11-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CK11-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -2382,7 +2382,7 @@ void bar() {
 // CK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CK11-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -2427,7 +2427,7 @@ void bar() {
 // CK11-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CK11-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CK11-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK11-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CK11-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -2472,7 +2472,7 @@ void bar() {
 // CK11-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CK11-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CK11-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP94]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CK11-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP95]], align 4
 // CK11-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2517,7 +2517,7 @@ void bar() {
 // CK11-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CK11-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CK11-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP117]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP117]], align 4
 // CK11-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP118]], align 4
 // CK11-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2572,7 +2572,7 @@ void bar() {
 // CK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CK11-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CK11-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP146]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP146]], align 4
 // CK11-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CK11-NEXT:    store i32 2, ptr [[TMP147]], align 4
 // CK11-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -2769,7 +2769,7 @@ void bar() {
 // CK12-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK12-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK12-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK12-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK12-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2811,7 +2811,7 @@ void bar() {
 // CK12-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CK12-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CK12-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CK12-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CK12-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -2853,7 +2853,7 @@ void bar() {
 // CK12-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CK12-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CK12-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CK12-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CK12-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -2898,7 +2898,7 @@ void bar() {
 // CK12-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CK12-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CK12-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK12-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CK12-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -2943,7 +2943,7 @@ void bar() {
 // CK12-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CK12-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CK12-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP94]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CK12-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP95]], align 4
 // CK12-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2988,7 +2988,7 @@ void bar() {
 // CK12-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CK12-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CK12-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP117]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP117]], align 4
 // CK12-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP118]], align 4
 // CK12-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -3043,7 +3043,7 @@ void bar() {
 // CK12-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CK12-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CK12-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP146]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP146]], align 4
 // CK12-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CK12-NEXT:    store i32 2, ptr [[TMP147]], align 4
 // CK12-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -3240,7 +3240,7 @@ void bar() {
 // CK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK13-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3282,7 +3282,7 @@ void bar() {
 // CK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CK13-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CK13-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -3324,7 +3324,7 @@ void bar() {
 // CK13-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CK13-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CK13-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -3369,7 +3369,7 @@ void bar() {
 // CK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CK13-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -3414,7 +3414,7 @@ void bar() {
 // CK13-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CK13-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CK13-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP94]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP95]], align 4
 // CK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3459,7 +3459,7 @@ void bar() {
 // CK13-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CK13-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CK13-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP117]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP117]], align 4
 // CK13-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP118]], align 4
 // CK13-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -3514,7 +3514,7 @@ void bar() {
 // CK13-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CK13-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CK13-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP146]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP146]], align 4
 // CK13-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CK13-NEXT:    store i32 2, ptr [[TMP147]], align 4
 // CK13-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -4003,7 +4003,7 @@ void bar() {
 // CK20-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK20-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK20-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CK20-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CK20-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK20-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CK20-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4061,7 +4061,7 @@ void bar() {
 // CK20-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CK20-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CK20-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CK20-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CK20-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CK20-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CK20-NEXT:    store i32 2, ptr [[TMP37]], align 4
 // CK20-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4126,7 +4126,7 @@ void bar() {
 // CK20-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CK20-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0
 // CK20-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CK20-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CK20-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CK20-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CK20-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK20-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4283,7 +4283,7 @@ void bar() {
 // CK21-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK21-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK21-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK21-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CK21-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CK21-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK21-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CK21-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4341,7 +4341,7 @@ void bar() {
 // CK21-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CK21-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CK21-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CK21-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CK21-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CK21-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CK21-NEXT:    store i32 2, ptr [[TMP37]], align 4
 // CK21-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4406,7 +4406,7 @@ void bar() {
 // CK21-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CK21-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0
 // CK21-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CK21-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CK21-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CK21-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CK21-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK21-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4563,7 +4563,7 @@ void bar() {
 // CK22-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK22-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK22-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK22-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CK22-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CK22-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK22-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CK22-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4621,7 +4621,7 @@ void bar() {
 // CK22-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CK22-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CK22-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CK22-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CK22-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CK22-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CK22-NEXT:    store i32 2, ptr [[TMP37]], align 4
 // CK22-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4686,7 +4686,7 @@ void bar() {
 // CK22-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CK22-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0
 // CK22-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CK22-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CK22-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CK22-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CK22-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK22-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4843,7 +4843,7 @@ void bar() {
 // CK23-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK23-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK23-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK23-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CK23-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CK23-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK23-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CK23-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4901,7 +4901,7 @@ void bar() {
 // CK23-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CK23-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CK23-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CK23-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CK23-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CK23-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CK23-NEXT:    store i32 2, ptr [[TMP37]], align 4
 // CK23-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4966,7 +4966,7 @@ void bar() {
 // CK23-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CK23-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0
 // CK23-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CK23-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CK23-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CK23-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CK23-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK23-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -5376,7 +5376,7 @@ void bar() {
 // CK30-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK30-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK30-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK30-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK30-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK30-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK30-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK30-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5439,7 +5439,7 @@ void bar() {
 // CK31-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK31-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK31-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK31-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK31-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK31-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK31-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK31-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5502,7 +5502,7 @@ void bar() {
 // CK32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK32-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK32-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK32-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5565,7 +5565,7 @@ void bar() {
 // CK33-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK33-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK33-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK33-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK33-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK33-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK33-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK33-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_map_codegen_03.cpp b/clang/test/OpenMP/target_map_codegen_03.cpp
index cd28e7bf848e0..959018647906f 100644
--- a/clang/test/OpenMP/target_map_codegen_03.cpp
+++ b/clang/test/OpenMP/target_map_codegen_03.cpp
@@ -96,7 +96,7 @@ void implicit_maps_nested_integer (int a){
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP9]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -195,7 +195,7 @@ void implicit_maps_nested_integer (int a){
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP9]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_map_codegen_hold.cpp b/clang/test/OpenMP/target_map_codegen_hold.cpp
index 81306ccb4cf55..c6ee673d9598e 100644
--- a/clang/test/OpenMP/target_map_codegen_hold.cpp
+++ b/clang/test/OpenMP/target_map_codegen_hold.cpp
@@ -245,7 +245,7 @@ void ST::test_present_members() {
 // CHECK-USE-PPC64LE-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [7 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [7 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-USE-PPC64LE-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK-USE-PPC64LE-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-USE-PPC64LE-NEXT:    store i32 7, ptr [[TMP38]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -286,7 +286,7 @@ void ST::test_present_members() {
 // CHECK-USE-PPC64LE-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK-USE-PPC64LE-NEXT:    store i32 2, ptr [[TMP57]], align 4
+// CHECK-USE-PPC64LE-NEXT:    store i32 3, ptr [[TMP57]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK-USE-PPC64LE-NEXT:    store i32 1, ptr [[TMP58]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -410,7 +410,7 @@ void ST::test_present_members() {
 // CHECK-USE-PPC64LE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-USE-PPC64LE-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK-USE-PPC64LE-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-USE-PPC64LE-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -546,7 +546,7 @@ void ST::test_present_members() {
 // CHECK-USE-I386-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [7 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [7 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-USE-I386-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK-USE-I386-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-USE-I386-NEXT:    store i32 7, ptr [[TMP38]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -587,7 +587,7 @@ void ST::test_present_members() {
 // CHECK-USE-I386-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK-USE-I386-NEXT:    store i32 2, ptr [[TMP57]], align 4
+// CHECK-USE-I386-NEXT:    store i32 3, ptr [[TMP57]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK-USE-I386-NEXT:    store i32 1, ptr [[TMP58]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -711,7 +711,7 @@ void ST::test_present_members() {
 // CHECK-USE-I386-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-USE-I386-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK-USE-I386-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-USE-I386-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -847,7 +847,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [7 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [7 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NOUSE-PPC64LE-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK-NOUSE-PPC64LE-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NOUSE-PPC64LE-NEXT:    store i32 7, ptr [[TMP38]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -888,7 +888,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK-NOUSE-PPC64LE-NEXT:    store i32 2, ptr [[TMP57]], align 4
+// CHECK-NOUSE-PPC64LE-NEXT:    store i32 3, ptr [[TMP57]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK-NOUSE-PPC64LE-NEXT:    store i32 1, ptr [[TMP58]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -978,7 +978,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NOUSE-PPC64LE-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK-NOUSE-PPC64LE-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NOUSE-PPC64LE-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1103,7 +1103,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-I386-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [7 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [7 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NOUSE-I386-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK-NOUSE-I386-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NOUSE-I386-NEXT:    store i32 7, ptr [[TMP38]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1144,7 +1144,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-I386-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK-NOUSE-I386-NEXT:    store i32 2, ptr [[TMP57]], align 4
+// CHECK-NOUSE-I386-NEXT:    store i32 3, ptr [[TMP57]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK-NOUSE-I386-NEXT:    store i32 1, ptr [[TMP58]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1234,7 +1234,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-I386-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NOUSE-I386-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK-NOUSE-I386-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NOUSE-I386-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_map_deref_array_codegen.cpp b/clang/test/OpenMP/target_map_deref_array_codegen.cpp
index da4176f20109c..b854333002299 100644
--- a/clang/test/OpenMP/target_map_deref_array_codegen.cpp
+++ b/clang/test/OpenMP/target_map_deref_array_codegen.cpp
@@ -91,7 +91,7 @@ void foo(int **t1d)
 // CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 2, ptr [[TMP20]], align 4
 // CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -143,7 +143,7 @@ void foo(int **t1d)
 // CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP47]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP47]], align 4
 // CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK-NEXT:    store i32 2, ptr [[TMP48]], align 4
 // CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -224,7 +224,7 @@ void foo(int **t1d)
 // CHECK-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS12]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS13]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP88]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP88]], align 4
 // CHECK-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK-NEXT:    store i32 4, ptr [[TMP89]], align 4
 // CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_map_member_expr_codegen.cpp b/clang/test/OpenMP/target_map_member_expr_codegen.cpp
index 84844cff09b93..9979f381a7067 100644
--- a/clang/test/OpenMP/target_map_member_expr_codegen.cpp
+++ b/clang/test/OpenMP/target_map_member_expr_codegen.cpp
@@ -138,7 +138,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP14]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP14]], align 4
 // CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 4, ptr [[TMP15]], align 4
 // CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -295,7 +295,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP55]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP55]], align 4
 // CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 2, ptr [[TMP56]], align 4
 // CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -379,7 +379,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES21]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP102]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP102]], align 4
 // CHECK-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 1
 // CHECK-NEXT:    store i32 3, ptr [[TMP103]], align 4
 // CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_offload_mandatory_codegen.cpp b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp
index 04360f1ea03bd..3bf15277aa1c3 100644
--- a/clang/test/OpenMP/target_offload_mandatory_codegen.cpp
+++ b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp
@@ -33,7 +33,7 @@ void host_dev(int device) {
 // MANDATORY-NEXT:  entry:
 // MANDATORY-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // MANDATORY-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// MANDATORY-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// MANDATORY-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // MANDATORY-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // MANDATORY-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // MANDATORY-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -79,7 +79,7 @@ void host_dev(int device) {
 // MANDATORY-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
 // MANDATORY:       omp_if.then:
 // MANDATORY-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// MANDATORY-NEXT:    store i32 2, ptr [[TMP1]], align 4
+// MANDATORY-NEXT:    store i32 3, ptr [[TMP1]], align 4
 // MANDATORY-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // MANDATORY-NEXT:    store i32 0, ptr [[TMP2]], align 4
 // MANDATORY-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -129,7 +129,7 @@ void host_dev(int device) {
 // MANDATORY-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // MANDATORY-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 // MANDATORY-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// MANDATORY-NEXT:    store i32 2, ptr [[TMP3]], align 4
+// MANDATORY-NEXT:    store i32 3, ptr [[TMP3]], align 4
 // MANDATORY-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // MANDATORY-NEXT:    store i32 0, ptr [[TMP4]], align 4
 // MANDATORY-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp b/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp
index e8b074a9d5f8b..10c4ac38e6a89 100644
--- a/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp
+++ b/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp
@@ -256,7 +256,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -298,7 +298,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP39]], align 4
 // CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -408,7 +408,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -493,7 +493,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -561,7 +561,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -958,7 +958,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META27]]
 // CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META27]]
 // CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META27]]
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1138,7 +1138,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1180,7 +1180,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP39]], align 4
 // CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1290,7 +1290,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1375,7 +1375,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1443,7 +1443,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -1836,7 +1836,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META28]]
 // CHECK3-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META28]]
 // CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META28]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META28]]
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META28]]
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_codegen.cpp b/clang/test/OpenMP/target_parallel_codegen.cpp
index 84b7f1ae4ec04..ad5a9d53bcf65 100644
--- a/clang/test/OpenMP/target_parallel_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_codegen.cpp
@@ -356,7 +356,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -413,7 +413,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP47]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP47]], align 4
 // CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP48]], align 4
 // CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -523,7 +523,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK1-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP101]], align 4
 // CHECK1-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -614,7 +614,7 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META21]]
 // CHECK1-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META21]]
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -972,7 +972,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1080,7 +1080,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1168,7 +1168,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1450,7 +1450,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP17]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1507,7 +1507,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1619,7 +1619,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK3-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP101]], align 4
 // CHECK3-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -1710,7 +1710,7 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META22]]
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -2068,7 +2068,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2176,7 +2176,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2264,7 +2264,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_for_codegen.cpp b/clang/test/OpenMP/target_parallel_for_codegen.cpp
index 31f6d0b8ad9ae..73504cf7a8024 100644
--- a/clang/test/OpenMP/target_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_codegen.cpp
@@ -366,7 +366,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[VLA1:%.*]] = alloca double, i64 [[TMP6]], align 8
 // CHECK1-NEXT:    store i64 [[TMP5]], ptr [[__VLA_EXPR1]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -487,7 +487,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP73]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP73]], align 4
 // CHECK1-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP74]], align 4
 // CHECK1-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -608,7 +608,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP132]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP132]], align 4
 // CHECK1-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 10, ptr [[TMP133]], align 4
 // CHECK1-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -1047,7 +1047,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias [[META24]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]]
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4, !noalias [[META24]]
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1475,7 +1475,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1583,7 +1583,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1671,7 +1671,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2032,7 +2032,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[VLA1:%.*]] = alloca double, i32 [[TMP4]], align 8
 // CHECK3-NEXT:    store i32 [[TMP3]], ptr [[__VLA_EXPR1]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2150,7 +2150,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP69]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP69]], align 4
 // CHECK3-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP70]], align 4
 // CHECK3-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -2273,7 +2273,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP128:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP129:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP130]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP130]], align 4
 // CHECK3-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 10, ptr [[TMP131]], align 4
 // CHECK3-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -2710,7 +2710,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias [[META25]]
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4, !noalias [[META25]]
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -3138,7 +3138,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3246,7 +3246,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3334,7 +3334,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5256,7 +5256,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[VLA1:%.*]] = alloca double, i64 [[TMP6]], align 8
 // CHECK17-NEXT:    store i64 [[TMP5]], ptr [[__VLA_EXPR1]], align 8
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK17-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 0, ptr [[TMP8]], align 4
 // CHECK17-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5377,7 +5377,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP73]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP73]], align 4
 // CHECK17-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP74]], align 4
 // CHECK17-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -5498,7 +5498,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP132]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP132]], align 4
 // CHECK17-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 10, ptr [[TMP133]], align 4
 // CHECK17-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -5937,7 +5937,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias [[META24]]
 // CHECK17-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK17-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
-// CHECK17-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]]
+// CHECK17-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]]
 // CHECK17-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP18]], align 4, !noalias [[META24]]
 // CHECK17-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -6365,7 +6365,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK17-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK17-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6473,7 +6473,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK17-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6561,7 +6561,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK17-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK17-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6922,7 +6922,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[VLA1:%.*]] = alloca double, i32 [[TMP4]], align 8
 // CHECK19-NEXT:    store i32 [[TMP3]], ptr [[__VLA_EXPR1]], align 4
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 0, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7040,7 +7040,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP69]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP69]], align 4
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP70]], align 4
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -7163,7 +7163,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP128:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP129:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP130]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP130]], align 4
 // CHECK19-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 10, ptr [[TMP131]], align 4
 // CHECK19-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -7600,7 +7600,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias [[META25]]
 // CHECK19-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK19-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
-// CHECK19-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK19-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK19-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP18]], align 4, !noalias [[META25]]
 // CHECK19-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -8028,7 +8028,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK19-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK19-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8136,7 +8136,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK19-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8224,7 +8224,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK19-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK19-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
index 7751ae93d59fa..8009e06666c82 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
@@ -395,7 +395,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP31]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP31]], align 4
 // CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -452,7 +452,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP60]], align 4
 // CHECK1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -573,7 +573,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP118]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP118]], align 4
 // CHECK1-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 10, ptr [[TMP119]], align 4
 // CHECK1-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -721,7 +721,7 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META25]]
 // CHECK1-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]]
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1403,7 +1403,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1511,7 +1511,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1599,7 +1599,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2013,7 +2013,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2070,7 +2070,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP55]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP55]], align 4
 // CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP56]], align 4
 // CHECK3-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -2193,7 +2193,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP116]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP116]], align 4
 // CHECK3-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 10, ptr [[TMP117]], align 4
 // CHECK3-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -2341,7 +2341,7 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]]
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -3021,7 +3021,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3129,7 +3129,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3217,7 +3217,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3637,7 +3637,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP31]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP31]], align 4
 // CHECK5-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK5-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3694,7 +3694,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK5-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 2, ptr [[TMP60]], align 4
 // CHECK5-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -3815,7 +3815,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP118]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP118]], align 4
 // CHECK5-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 10, ptr [[TMP119]], align 4
 // CHECK5-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -3963,7 +3963,7 @@ int bar(int n){
 // CHECK5-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META25]]
 // CHECK5-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]]
 // CHECK5-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]]
-// CHECK5-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK5-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META25]]
 // CHECK5-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -4666,7 +4666,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP36:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK5-NEXT:    [[TMP37:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP36]], 0
 // CHECK5-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK5-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 6, ptr [[TMP39]], align 4
 // CHECK5-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4774,7 +4774,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK5-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4862,7 +4862,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5362,7 +5362,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK7-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK7-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5419,7 +5419,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP55]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP55]], align 4
 // CHECK7-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 2, ptr [[TMP56]], align 4
 // CHECK7-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -5542,7 +5542,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP116]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP116]], align 4
 // CHECK7-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 10, ptr [[TMP117]], align 4
 // CHECK7-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -5690,7 +5690,7 @@ int bar(int n){
 // CHECK7-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META26]]
 // CHECK7-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]]
 // CHECK7-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]]
-// CHECK7-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK7-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK7-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META26]]
 // CHECK7-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -6391,7 +6391,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP36:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK7-NEXT:    [[TMP37:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP36]], 0
 // CHECK7-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK7-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 6, ptr [[TMP39]], align 4
 // CHECK7-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6499,7 +6499,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK7-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6587,7 +6587,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp
index fa3d182b62ad2..bbefab195b9ae 100644
--- a/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp
@@ -4185,7 +4185,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4394,7 +4394,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4565,7 +4565,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4708,7 +4708,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4879,7 +4879,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5022,7 +5022,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5193,7 +5193,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5336,7 +5336,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5486,7 +5486,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5571,7 +5571,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5695,7 +5695,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5760,7 +5760,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp
index a7b7278dff8d9..a92f052085c74 100644
--- a/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp
@@ -88,7 +88,7 @@ int nested(int a){
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -306,7 +306,7 @@ int nested(int a){
 // CHECK-X86-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-X86-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-X86-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-X86-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK-X86-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK-X86-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-X86-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK-X86-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp b/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp
index 472811c5c5667..0171d97e2f545 100644
--- a/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp
@@ -90,7 +90,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_if_codegen.cpp b/clang/test/OpenMP/target_parallel_if_codegen.cpp
index 841c49e31ccb9..6447a76dd467c 100644
--- a/clang/test/OpenMP/target_parallel_if_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_if_codegen.cpp
@@ -270,7 +270,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -335,7 +335,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP47:%.*]] = select i1 [[TOBOOL15]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP48:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP47]], 0
 // CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP49]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP49]], align 4
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP50]], align 4
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -416,7 +416,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP10:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP11:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP10]], 0
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP13]], align 4
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -459,7 +459,7 @@ int bar(int n){
 // CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_IF_THEN5:%.*]], label [[OMP_IF_ELSE9:%.*]]
 // CHECK1:       omp_if.then5:
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -532,7 +532,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -586,7 +586,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP34]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP34]], align 4
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP35]], align 4
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -932,7 +932,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -997,7 +997,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP47:%.*]] = select i1 [[TOBOOL15]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP48:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP47]], 0
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP49]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP49]], align 4
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP50]], align 4
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -1078,7 +1078,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP10:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP11:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP10]], 0
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP13]], align 4
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1121,7 +1121,7 @@ int bar(int n){
 // CHECK3-NEXT:    br i1 [[CMP4]], label [[OMP_IF_THEN5:%.*]], label [[OMP_IF_ELSE9:%.*]]
 // CHECK3:       omp_if.then5:
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1194,7 +1194,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1248,7 +1248,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP34]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP34]], align 4
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP35]], align 4
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp
index 0778bca8c3f43..a14518286dc91 100644
--- a/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp
@@ -271,7 +271,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -313,7 +313,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -384,7 +384,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -433,7 +433,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP33]], 0
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -488,7 +488,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -556,7 +556,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP34:%.*]] = zext i16 [[TMP33]] to i32
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -849,7 +849,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -891,7 +891,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -962,7 +962,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1011,7 +1011,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK3-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP33]], 0
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1066,7 +1066,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1134,7 +1134,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP34:%.*]] = zext i16 [[TMP33]] to i32
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_task_affinity_codegen.cpp b/clang/test/OpenMP/target_task_affinity_codegen.cpp
index 472c50235bf9e..2ef6e5a2f7a3d 100644
--- a/clang/test/OpenMP/target_task_affinity_codegen.cpp
+++ b/clang/test/OpenMP/target_task_affinity_codegen.cpp
@@ -118,7 +118,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -341,7 +341,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp
index e22fcbbcd277b..24dc2fd2e49f4 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -453,7 +453,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP53]], align 4
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -510,7 +510,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP80]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP80]], align 4
 // CHECK1-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP81]], align 4
 // CHECK1-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 2
@@ -568,7 +568,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP107]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP107]], align 4
 // CHECK1-NEXT:    [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP108]], align 4
 // CHECK1-NEXT:    [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 2
@@ -673,7 +673,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP158:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP159:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP160:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP160]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP160]], align 4
 // CHECK1-NEXT:    [[TMP161:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP161]], align 4
 // CHECK1-NEXT:    [[TMP162:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -723,7 +723,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP182:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP182]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP182]], align 4
 // CHECK1-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP183]], align 4
 // CHECK1-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -767,7 +767,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP202:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS42]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP203:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS43]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP204:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP204]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP204]], align 4
 // CHECK1-NEXT:    [[TMP205:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP205]], align 4
 // CHECK1-NEXT:    [[TMP206:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -919,7 +919,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1317,7 +1317,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1482,7 +1482,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1590,7 +1590,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1678,7 +1678,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2040,7 +2040,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP51]], align 4
 // CHECK3-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2097,7 +2097,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP78]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP78]], align 4
 // CHECK3-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP79]], align 4
 // CHECK3-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 2
@@ -2155,7 +2155,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP105]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP105]], align 4
 // CHECK3-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP106]], align 4
 // CHECK3-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 2
@@ -2262,7 +2262,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP158:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP159:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP160:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP160]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP160]], align 4
 // CHECK3-NEXT:    [[TMP161:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP161]], align 4
 // CHECK3-NEXT:    [[TMP162:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2312,7 +2312,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP182:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP182]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP182]], align 4
 // CHECK3-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP183]], align 4
 // CHECK3-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -2356,7 +2356,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP202:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS42]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP203:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS43]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP204:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP204]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP204]], align 4
 // CHECK3-NEXT:    [[TMP205:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP205]], align 4
 // CHECK3-NEXT:    [[TMP206:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -2508,7 +2508,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -2905,7 +2905,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3070,7 +3070,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3178,7 +3178,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3266,7 +3266,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_codegen.cpp
index 1ad3d8333ba60..bd33e52ca3ef0 100644
--- a/clang/test/OpenMP/target_teams_distribute_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_codegen.cpp
@@ -427,7 +427,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP53]], align 4
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -484,7 +484,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP80]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP80]], align 4
 // CHECK1-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP81]], align 4
 // CHECK1-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -605,7 +605,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP139]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP139]], align 4
 // CHECK1-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 10, ptr [[TMP140]], align 4
 // CHECK1-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -812,7 +812,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1397,7 +1397,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1534,7 +1534,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK1-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1623,7 +1623,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2163,7 +2163,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP51]], align 4
 // CHECK3-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2220,7 +2220,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP78]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP78]], align 4
 // CHECK3-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP79]], align 4
 // CHECK3-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -2343,7 +2343,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP139]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP139]], align 4
 // CHECK3-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 10, ptr [[TMP140]], align 4
 // CHECK3-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -2550,7 +2550,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -3135,7 +3135,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3272,7 +3272,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK3-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3361,7 +3361,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp
index 2d886109b95b9..52b40ede902ce 100644
--- a/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp
@@ -124,7 +124,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -285,7 +285,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -514,7 +514,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -750,7 +750,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -975,7 +975,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1209,7 +1209,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp
index 714ed708e71a8..e898f6cff7e7d 100644
--- a/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp
@@ -161,7 +161,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -203,7 +203,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -245,7 +245,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -573,7 +573,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -615,7 +615,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -657,7 +657,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1030,7 +1030,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1101,7 +1101,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1183,7 +1183,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK9-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK9-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1619,7 +1619,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1660,7 +1660,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1701,7 +1701,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK9-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK9-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2071,7 +2071,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2143,7 +2143,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2226,7 +2226,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK11-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK11-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2659,7 +2659,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2700,7 +2700,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2741,7 +2741,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
index 1d6f69079df96..da69e19a890ef 100644
--- a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
@@ -303,7 +303,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -584,7 +584,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1105,7 +1105,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1384,7 +1384,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
index ffba1c2221c24..e23435d13e9ae 100644
--- a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
@@ -522,7 +522,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -837,7 +837,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1223,7 +1223,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1536,7 +1536,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
index 7e06d8c16169f..0d0d996747214 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
@@ -653,7 +653,7 @@ int target_teams_fun(int *g){
 // CHECK2-NEXT:    [[TMP29:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
 // CHECK2-NEXT:    [[TMP30:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP31]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP31]], align 4
 // CHECK2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 4, ptr [[TMP32]], align 4
 // CHECK2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -720,7 +720,7 @@ int target_teams_fun(int *g){
 // CHECK2-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP62]], 1
 // CHECK2-NEXT:    [[TMP63:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP64]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP64]], align 4
 // CHECK2-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -1295,7 +1295,7 @@ int target_teams_fun(int *g){
 // CHECK4-NEXT:    [[TMP29:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
 // CHECK4-NEXT:    [[TMP30:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK4-NEXT:    store i32 2, ptr [[TMP31]], align 4
+// CHECK4-NEXT:    store i32 3, ptr [[TMP31]], align 4
 // CHECK4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK4-NEXT:    store i32 4, ptr [[TMP32]], align 4
 // CHECK4-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1362,7 +1362,7 @@ int target_teams_fun(int *g){
 // CHECK4-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP62]], 1
 // CHECK4-NEXT:    [[TMP63:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK4-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK4-NEXT:    store i32 2, ptr [[TMP64]], align 4
+// CHECK4-NEXT:    store i32 3, ptr [[TMP64]], align 4
 // CHECK4-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK4-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK4-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp
index b812011ea4ca0..da85b23de2e29 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp
@@ -129,7 +129,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -365,7 +365,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -665,7 +665,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1027,7 +1027,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1327,7 +1327,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1691,7 +1691,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp
index 0aa75952a3c2b..e0b722bd80467 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp
@@ -173,7 +173,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -215,7 +215,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -257,7 +257,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -807,7 +807,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -849,7 +849,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -891,7 +891,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1476,7 +1476,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1547,7 +1547,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1629,7 +1629,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK9-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK9-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2394,7 +2394,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2435,7 +2435,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2487,7 +2487,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK9-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -3096,7 +3096,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3168,7 +3168,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3251,7 +3251,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK11-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK11-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4001,7 +4001,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4042,7 +4042,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4094,7 +4094,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK11-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
index 375279d963608..bf1fc1d7cbae5 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -364,7 +364,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -775,7 +775,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1426,7 +1426,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1831,7 +1831,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
index 9baf13385bef5..9be3ca8fd7587 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
@@ -113,7 +113,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -146,7 +146,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -476,7 +476,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -535,7 +535,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1035,7 +1035,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1090,7 +1090,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
index e54c230eea7ee..ae900d2dc53b9 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -783,7 +783,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1252,7 +1252,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1783,7 +1783,7 @@ int main() {
 // CHECK7-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2246,7 +2246,7 @@ int main() {
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp
index feab9cdc948bc..a64110f581df8 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp
@@ -27,7 +27,7 @@ void gtid_test() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
index 066d926e5abcb..49e4681c05153 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
@@ -301,7 +301,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -571,7 +571,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1034,7 +1034,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1298,7 +1298,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
index 9f3e50fe20a62..caac6edfc51dd 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
@@ -58,7 +58,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -91,7 +91,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -409,7 +409,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp
index 1036ffd195de0..d5f1b0b9e59dc 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp
@@ -109,7 +109,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -400,7 +400,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -688,7 +688,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -975,7 +975,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp
index 3d031e09aa07d..fb0b2893d9529 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp
@@ -243,7 +243,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -285,7 +285,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -327,7 +327,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -369,7 +369,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -411,7 +411,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1270,7 +1270,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1312,7 +1312,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1354,7 +1354,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1396,7 +1396,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -1438,7 +1438,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2270,7 +2270,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2312,7 +2312,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2354,7 +2354,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK5-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK5-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2396,7 +2396,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK5-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK5-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2438,7 +2438,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK5-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK5-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3297,7 +3297,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK7-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK7-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3339,7 +3339,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK7-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK7-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3381,7 +3381,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK7-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK7-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3423,7 +3423,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK7-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK7-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3465,7 +3465,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK7-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK7-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4357,7 +4357,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK13-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4428,7 +4428,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK13-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK13-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK13-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK13-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -4510,7 +4510,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK13-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK13-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK13-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4581,7 +4581,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP132]], 1
 // CHECK13-NEXT:    [[TMP133:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK13-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK13-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP135]], align 4
 // CHECK13-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -4663,7 +4663,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP173]], 1
 // CHECK13-NEXT:    [[TMP174:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK13-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP175]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP175]], align 4
 // CHECK13-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP176]], align 4
 // CHECK13-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -5881,7 +5881,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5922,7 +5922,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -5974,7 +5974,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK13-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK13-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -6015,7 +6015,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK13-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6067,7 +6067,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK13-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -7008,7 +7008,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK15-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7080,7 +7080,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK15-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK15-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK15-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK15-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -7163,7 +7163,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK15-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK15-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK15-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK15-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -7235,7 +7235,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP135]], 1
 // CHECK15-NEXT:    [[TMP136:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK15-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP137]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK15-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK15-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -7318,7 +7318,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP177]], 1
 // CHECK15-NEXT:    [[TMP178:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK15-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP179]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP179]], align 4
 // CHECK15-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP180]], align 4
 // CHECK15-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -8511,7 +8511,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK15-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK15-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8552,7 +8552,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK15-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK15-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -8604,7 +8604,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK15-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK15-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -8645,7 +8645,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK15-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK15-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -8697,7 +8697,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK15-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK15-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -9611,7 +9611,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK17-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9682,7 +9682,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK17-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK17-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK17-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK17-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -9764,7 +9764,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK17-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK17-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK17-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -9835,7 +9835,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP132]], 1
 // CHECK17-NEXT:    [[TMP133:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK17-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK17-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP135]], align 4
 // CHECK17-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -9917,7 +9917,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP173]], 1
 // CHECK17-NEXT:    [[TMP174:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK17-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP175]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP175]], align 4
 // CHECK17-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP176]], align 4
 // CHECK17-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -11135,7 +11135,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -11176,7 +11176,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK17-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -11228,7 +11228,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK17-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK17-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -11269,7 +11269,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK17-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -11321,7 +11321,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK17-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -12262,7 +12262,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK19-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12334,7 +12334,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK19-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK19-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK19-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK19-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -12417,7 +12417,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK19-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK19-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK19-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK19-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -12489,7 +12489,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP135]], 1
 // CHECK19-NEXT:    [[TMP136:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK19-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP137]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK19-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK19-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -12572,7 +12572,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP177]], 1
 // CHECK19-NEXT:    [[TMP178:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK19-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP179]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP179]], align 4
 // CHECK19-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP180]], align 4
 // CHECK19-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -13765,7 +13765,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -13806,7 +13806,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK19-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -13858,7 +13858,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK19-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -13899,7 +13899,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK19-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK19-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -13951,7 +13951,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK19-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK19-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
index 109063c623a11..a4119a07f61f2 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -181,7 +181,7 @@ void test_target_teams_atomic() {
 // CHECK1-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP28]], 0
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP29]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -248,7 +248,7 @@ void test_target_teams_atomic() {
 // CHECK1-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP67]], 1
 // CHECK1-NEXT:    [[TMP68:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK1-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP69]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP69]], align 4
 // CHECK1-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK1-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -821,7 +821,7 @@ void test_target_teams_atomic() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1116,7 +1116,7 @@ void test_target_teams_atomic() {
 // CHECK3-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP28]], 0
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP29]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1183,7 +1183,7 @@ void test_target_teams_atomic() {
 // CHECK3-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP67]], 1
 // CHECK3-NEXT:    [[TMP68:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK3-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP69]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP69]], align 4
 // CHECK3-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK3-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -1746,7 +1746,7 @@ void test_target_teams_atomic() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp
index 306ec1db2ab21..53a13e129d3b2 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp
@@ -129,7 +129,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -381,7 +381,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -837,7 +837,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1235,7 +1235,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1551,7 +1551,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1951,7 +1951,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
index c265c1c616b14..d449406927e71 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
@@ -173,7 +173,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -215,7 +215,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -257,7 +257,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -849,7 +849,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -891,7 +891,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -933,7 +933,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1807,7 +1807,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1878,7 +1878,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1960,7 +1960,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK9-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK9-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2797,7 +2797,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2838,7 +2838,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2890,7 +2890,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK9-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -3541,7 +3541,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3613,7 +3613,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3696,7 +3696,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK11-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK11-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4518,7 +4518,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4559,7 +4559,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4611,7 +4611,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK11-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
index 37c1f428ef9a3..ea05aecebf371 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -362,7 +362,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -787,7 +787,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1452,7 +1452,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1871,7 +1871,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
index df5dd7ba68052..71f4506fb6348 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
@@ -130,7 +130,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -163,7 +163,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP23]], align 4
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -536,7 +536,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -595,7 +595,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1137,7 +1137,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1192,7 +1192,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1735,7 +1735,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1768,7 +1768,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP23]], align 4
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2141,7 +2141,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2200,7 +2200,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -2972,7 +2972,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3027,7 +3027,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4163,7 +4163,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK9-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK9-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4196,7 +4196,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4569,7 +4569,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4628,7 +4628,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK9-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK9-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK9-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -5170,7 +5170,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5225,7 +5225,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK9-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK9-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -5768,7 +5768,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK11-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK11-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5801,7 +5801,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6174,7 +6174,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6233,7 +6233,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK11-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK11-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK11-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -7005,7 +7005,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7060,7 +7060,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK11-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK11-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
index 3fdc0c4825410..b2b9e2082fa3a 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -811,7 +811,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1294,7 +1294,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1839,7 +1839,7 @@ int main() {
 // CHECK7-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2316,7 +2316,7 @@ int main() {
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
index bba97a750adc6..0cc07c86aaea7 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
@@ -301,7 +301,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -585,7 +585,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1062,7 +1062,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1340,7 +1340,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
index c347743bd4fdd..7619bf7502e78 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -58,7 +58,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -91,7 +91,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -437,7 +437,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp
index 52430d51cde1b..3bff9904850f5 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp
@@ -109,7 +109,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -414,7 +414,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -716,7 +716,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1017,7 +1017,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp
index f7c0666d58b66..b92dd14de108d 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp
@@ -243,7 +243,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -285,7 +285,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -327,7 +327,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -369,7 +369,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -411,7 +411,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1340,7 +1340,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1382,7 +1382,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1424,7 +1424,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1466,7 +1466,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -1508,7 +1508,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2410,7 +2410,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2452,7 +2452,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2494,7 +2494,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK5-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK5-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2536,7 +2536,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK5-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK5-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2578,7 +2578,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK5-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK5-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3507,7 +3507,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK7-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK7-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3549,7 +3549,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK7-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK7-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3591,7 +3591,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK7-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK7-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3633,7 +3633,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK7-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK7-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3675,7 +3675,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK7-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK7-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -5022,7 +5022,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK13-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5093,7 +5093,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK13-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK13-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK13-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK13-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -5175,7 +5175,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK13-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK13-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK13-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -5246,7 +5246,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP132]], 1
 // CHECK13-NEXT:    [[TMP133:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK13-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK13-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP135]], align 4
 // CHECK13-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -5328,7 +5328,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP173]], 1
 // CHECK13-NEXT:    [[TMP174:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK13-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP175]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP175]], align 4
 // CHECK13-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP176]], align 4
 // CHECK13-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -6666,7 +6666,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6707,7 +6707,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -6759,7 +6759,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK13-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK13-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -6800,7 +6800,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK13-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6852,7 +6852,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK13-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -7863,7 +7863,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK15-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7935,7 +7935,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK15-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK15-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK15-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK15-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -8018,7 +8018,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK15-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK15-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK15-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK15-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -8090,7 +8090,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP135]], 1
 // CHECK15-NEXT:    [[TMP136:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK15-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP137]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK15-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK15-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -8173,7 +8173,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP177]], 1
 // CHECK15-NEXT:    [[TMP178:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK15-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP179]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP179]], align 4
 // CHECK15-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP180]], align 4
 // CHECK15-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -9486,7 +9486,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK15-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK15-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9527,7 +9527,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK15-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK15-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -9579,7 +9579,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK15-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK15-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -9620,7 +9620,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK15-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK15-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -9672,7 +9672,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK15-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK15-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -10656,7 +10656,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK17-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -10727,7 +10727,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK17-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK17-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK17-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK17-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -10809,7 +10809,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK17-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK17-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK17-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -10880,7 +10880,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP132]], 1
 // CHECK17-NEXT:    [[TMP133:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK17-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK17-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP135]], align 4
 // CHECK17-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -10962,7 +10962,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP173]], 1
 // CHECK17-NEXT:    [[TMP174:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK17-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP175]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP175]], align 4
 // CHECK17-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP176]], align 4
 // CHECK17-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -12300,7 +12300,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12341,7 +12341,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK17-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -12393,7 +12393,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK17-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK17-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -12434,7 +12434,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK17-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -12486,7 +12486,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK17-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -13497,7 +13497,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK19-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -13569,7 +13569,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK19-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK19-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK19-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK19-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -13652,7 +13652,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK19-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK19-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK19-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK19-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -13724,7 +13724,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP135]], 1
 // CHECK19-NEXT:    [[TMP136:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK19-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP137]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK19-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK19-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -13807,7 +13807,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP177]], 1
 // CHECK19-NEXT:    [[TMP178:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK19-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP179]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP179]], align 4
 // CHECK19-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP180]], align 4
 // CHECK19-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -15120,7 +15120,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -15161,7 +15161,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK19-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -15213,7 +15213,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK19-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -15254,7 +15254,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK19-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK19-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -15306,7 +15306,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK19-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK19-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp
index 5bc78062c8309..3cb2d2c2f9b43 100644
--- a/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp
@@ -240,7 +240,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -409,7 +409,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -768,7 +768,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -935,7 +935,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp
index 9e9306a5e749f..47251b00c7a14 100644
--- a/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp
@@ -335,7 +335,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -376,7 +376,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -418,7 +418,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -460,7 +460,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -502,7 +502,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS22]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -544,7 +544,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS29]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS30]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP105]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP105]], align 4
 // CHECK1-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP106]], align 4
 // CHECK1-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 2
@@ -585,7 +585,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP123:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS36]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP124:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS37]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP125]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP125]], align 4
 // CHECK1-NEXT:    [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP126]], align 4
 // CHECK1-NEXT:    [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 2
@@ -626,7 +626,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS43]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS44]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP145]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP145]], align 4
 // CHECK1-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP146]], align 4
 // CHECK1-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 2
@@ -668,7 +668,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP163:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS50]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS51]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP165]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP165]], align 4
 // CHECK1-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP166]], align 4
 // CHECK1-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 2
@@ -710,7 +710,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS57]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS58]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK1-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP186]], align 4
 // CHECK1-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -2179,7 +2179,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2226,7 +2226,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2274,7 +2274,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2322,7 +2322,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP74]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP74]], align 4
 // CHECK1-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP75]], align 4
 // CHECK1-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -2370,7 +2370,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS22]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK1-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -2418,7 +2418,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS29]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS30]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP120]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP120]], align 4
 // CHECK1-NEXT:    [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP121]], align 4
 // CHECK1-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 2
@@ -2465,7 +2465,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS36]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS37]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP143]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP143]], align 4
 // CHECK1-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP144]], align 4
 // CHECK1-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 2
@@ -2512,7 +2512,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS43]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS44]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP166]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP166]], align 4
 // CHECK1-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP167]], align 4
 // CHECK1-NEXT:    [[TMP168:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 2
@@ -2560,7 +2560,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS50]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP188:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS51]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP189:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP189]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP189]], align 4
 // CHECK1-NEXT:    [[TMP190:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP190]], align 4
 // CHECK1-NEXT:    [[TMP191:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 2
@@ -2608,7 +2608,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP210:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS57]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP211:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS58]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP212:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP212]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP212]], align 4
 // CHECK1-NEXT:    [[TMP213:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP213]], align 4
 // CHECK1-NEXT:    [[TMP214:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -4163,7 +4163,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4204,7 +4204,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4246,7 +4246,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -4288,7 +4288,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -4330,7 +4330,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS22]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -4372,7 +4372,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS29]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS30]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP105]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP105]], align 4
 // CHECK3-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP106]], align 4
 // CHECK3-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 2
@@ -4413,7 +4413,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP123:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS36]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP124:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS37]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP125]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP125]], align 4
 // CHECK3-NEXT:    [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP126]], align 4
 // CHECK3-NEXT:    [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 2
@@ -4454,7 +4454,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS43]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS44]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP145]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP145]], align 4
 // CHECK3-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP146]], align 4
 // CHECK3-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 2
@@ -4496,7 +4496,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP163:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS50]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS51]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP165]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP165]], align 4
 // CHECK3-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP166]], align 4
 // CHECK3-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 2
@@ -4538,7 +4538,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS57]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS58]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK3-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP186]], align 4
 // CHECK3-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -6007,7 +6007,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6054,7 +6054,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -6102,7 +6102,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK3-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -6150,7 +6150,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP74]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP74]], align 4
 // CHECK3-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP75]], align 4
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6198,7 +6198,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS22]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK3-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK3-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -6246,7 +6246,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS29]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS30]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP120]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP120]], align 4
 // CHECK3-NEXT:    [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP121]], align 4
 // CHECK3-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 2
@@ -6293,7 +6293,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS36]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS37]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP143]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP143]], align 4
 // CHECK3-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP144]], align 4
 // CHECK3-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 2
@@ -6340,7 +6340,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS43]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS44]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP166]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP166]], align 4
 // CHECK3-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP167]], align 4
 // CHECK3-NEXT:    [[TMP168:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 2
@@ -6388,7 +6388,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS50]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP188:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS51]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP189:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP189]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP189]], align 4
 // CHECK3-NEXT:    [[TMP190:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP190]], align 4
 // CHECK3-NEXT:    [[TMP191:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 2
@@ -6436,7 +6436,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP210:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS57]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP211:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS58]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP212:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP212]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP212]], align 4
 // CHECK3-NEXT:    [[TMP213:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP213]], align 4
 // CHECK3-NEXT:    [[TMP214:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
index 5091d41eec4f3..d2a5d1f62a8b9 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
@@ -424,7 +424,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP53]], align 4
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -481,7 +481,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP80]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP80]], align 4
 // CHECK1-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP81]], align 4
 // CHECK1-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -591,7 +591,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK1-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP134]], align 4
 // CHECK1-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 2
@@ -803,7 +803,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4, !noalias [[META26]]
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1392,7 +1392,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1529,7 +1529,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK1-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1618,7 +1618,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2186,7 +2186,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP51]], align 4
 // CHECK3-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2243,7 +2243,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP78]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP78]], align 4
 // CHECK3-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP79]], align 4
 // CHECK3-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -2355,7 +2355,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK3-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP134]], align 4
 // CHECK3-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 2
@@ -2567,7 +2567,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK3-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4, !noalias [[META27]]
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -3156,7 +3156,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3293,7 +3293,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK3-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3382,7 +3382,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3952,7 +3952,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK5-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP53]], align 4
 // CHECK5-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4009,7 +4009,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP80]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP80]], align 4
 // CHECK5-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 2, ptr [[TMP81]], align 4
 // CHECK5-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4119,7 +4119,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK5-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 9, ptr [[TMP134]], align 4
 // CHECK5-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 2
@@ -4331,7 +4331,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK5-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
-// CHECK5-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK5-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 3, ptr [[TMP20]], align 4, !noalias [[META26]]
 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -4937,7 +4937,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [6 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [6 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK5-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 6, ptr [[TMP36]], align 4
 // CHECK5-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5074,7 +5074,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK5-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK5-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK5-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK5-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5163,7 +5163,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5783,7 +5783,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK7-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP51]], align 4
 // CHECK7-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5840,7 +5840,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP78]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP78]], align 4
 // CHECK7-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 2, ptr [[TMP79]], align 4
 // CHECK7-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -5952,7 +5952,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK7-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 9, ptr [[TMP134]], align 4
 // CHECK7-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 2
@@ -6164,7 +6164,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK7-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
-// CHECK7-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
+// CHECK7-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 3, ptr [[TMP20]], align 4, !noalias [[META27]]
 // CHECK7-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -6770,7 +6770,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [6 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [6 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK7-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 6, ptr [[TMP36]], align 4
 // CHECK7-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6907,7 +6907,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK7-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK7-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK7-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK7-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6996,7 +6996,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp
index ac7fc7c1acf91..b052a2200479a 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp
@@ -124,7 +124,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -293,7 +293,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -670,7 +670,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -924,7 +924,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1157,7 +1157,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1409,7 +1409,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp
index 867ec89421402..7f7e83c4dee39 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp
@@ -161,7 +161,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -203,7 +203,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -245,7 +245,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -594,7 +594,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -636,7 +636,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -678,7 +678,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1319,7 +1319,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1390,7 +1390,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1472,7 +1472,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK9-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK9-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1944,7 +1944,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1985,7 +1985,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2026,7 +2026,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK9-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK9-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2417,7 +2417,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2489,7 +2489,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2572,7 +2572,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK11-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK11-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3041,7 +3041,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3082,7 +3082,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -3123,7 +3123,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
index 425bcaf6c3f3d..d22aeee120d5d 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
@@ -303,7 +303,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -591,7 +591,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1119,7 +1119,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1405,7 +1405,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
index 4c9e83afb3bb3..93c570ac0604f 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
@@ -572,7 +572,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -894,7 +894,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1287,7 +1287,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1607,7 +1607,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp
index 6cfec0010d00c..4e9b4e1027e68 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp
@@ -240,7 +240,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -416,7 +416,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -782,7 +782,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -956,7 +956,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp
index 1481b4fe85fde..667450c32f1dd 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp
@@ -97,7 +97,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -281,7 +281,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -462,7 +462,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -646,7 +646,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp
index 190ea17e96070..ec806081a3e44 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp
@@ -635,7 +635,7 @@ int target_teams_fun(int *g){
 // CHECK2-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK2-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
 // CHECK2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -702,7 +702,7 @@ int target_teams_fun(int *g){
 // CHECK2-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP60]], 1
 // CHECK2-NEXT:    [[TMP61:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK2-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP62]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP62]], align 4
 // CHECK2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP63]], align 4
 // CHECK2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -1260,7 +1260,7 @@ int target_teams_fun(int *g){
 // CHECK4-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK4-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
 // CHECK4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK4-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK4-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK4-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1327,7 +1327,7 @@ int target_teams_fun(int *g){
 // CHECK4-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP60]], 1
 // CHECK4-NEXT:    [[TMP61:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK4-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK4-NEXT:    store i32 2, ptr [[TMP62]], align 4
+// CHECK4-NEXT:    store i32 3, ptr [[TMP62]], align 4
 // CHECK4-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK4-NEXT:    store i32 3, ptr [[TMP63]], align 4
 // CHECK4-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp
index 82ad43373327a..f4aea067ba22a 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp
@@ -129,7 +129,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -365,7 +365,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -665,7 +665,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1027,7 +1027,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1327,7 +1327,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1691,7 +1691,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp
index b2ff4c20db7a1..1edcbfe2d7779 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp
@@ -111,7 +111,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -437,7 +437,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -496,7 +496,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -991,7 +991,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1029,7 +1029,7 @@ int main() {
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
 // CHECK1:       omp_if.then:
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp
index 85f6a85a11bd8..fbf7b004d5efb 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp
@@ -27,7 +27,7 @@ void gtid_test() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
index 7503b69b92aa9..99416f76e409c 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
@@ -301,7 +301,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -571,7 +571,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1034,7 +1034,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1298,7 +1298,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp
index 1036ffd195de0..d5f1b0b9e59dc 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp
@@ -109,7 +109,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -400,7 +400,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -688,7 +688,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -975,7 +975,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp
index 0dc2c95641e28..5001f9cc89151 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp
@@ -291,7 +291,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_map_codegen.cpp b/clang/test/OpenMP/target_teams_map_codegen.cpp
index 9ccf74514691b..974ba36e6bb0d 100644
--- a/clang/test/OpenMP/target_teams_map_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_map_codegen.cpp
@@ -100,7 +100,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -178,7 +178,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -272,7 +272,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -416,7 +416,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -494,7 +494,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -572,7 +572,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -668,7 +668,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -721,7 +721,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -1031,7 +1031,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1084,7 +1084,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -1327,7 +1327,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1405,7 +1405,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1499,7 +1499,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1643,7 +1643,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1721,7 +1721,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1799,7 +1799,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1895,7 +1895,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1948,7 +1948,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_num_teams_codegen.cpp b/clang/test/OpenMP/target_teams_num_teams_codegen.cpp
index e5618aefaf3cb..ba01a411670ce 100644
--- a/clang/test/OpenMP/target_teams_num_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_num_teams_codegen.cpp
@@ -256,7 +256,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -298,7 +298,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -369,7 +369,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -418,7 +418,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP33]], 0
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -473,7 +473,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -541,7 +541,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -834,7 +834,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -876,7 +876,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -947,7 +947,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -996,7 +996,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK3-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP33]], 0
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1051,7 +1051,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1119,7 +1119,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp b/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp
index b6593b7e53685..c081f6e52d7af 100644
--- a/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp
@@ -256,7 +256,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -298,7 +298,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -385,7 +385,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP14]], 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -434,7 +434,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP41]], 0
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP44]], align 4
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -489,7 +489,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -557,7 +557,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -853,7 +853,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -895,7 +895,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -982,7 +982,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP14]], 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1031,7 +1031,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
 // CHECK3-NEXT:    [[TMP42:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP41]], 0
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP44]], align 4
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -1086,7 +1086,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1154,7 +1154,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_codegen.cpp b/clang/test/OpenMP/teams_codegen.cpp
index 4aab9dae2d6ad..91702fb47af73 100644
--- a/clang/test/OpenMP/teams_codegen.cpp
+++ b/clang/test/OpenMP/teams_codegen.cpp
@@ -361,7 +361,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -405,7 +405,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -460,7 +460,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP56:%.*]] = load i32, ptr [[LA]], align 4
 // CHECK1-NEXT:    [[TMP57:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP56]], 0
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -515,7 +515,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP85:%.*]] = load i32, ptr [[LA]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP85]], 0
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP87]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP87]], align 4
 // CHECK1-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP88]], align 4
 // CHECK1-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 2
@@ -605,7 +605,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP134:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD]], 0
 // CHECK1-NEXT:    [[TMP135:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP133]], 0
 // CHECK1-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK1-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP137]], align 4
 // CHECK1-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -670,7 +670,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP168:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD36]], 0
 // CHECK1-NEXT:    [[TMP169:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD38]], 0
 // CHECK1-NEXT:    [[TMP170:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP170]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP170]], align 4
 // CHECK1-NEXT:    [[TMP171:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP171]], align 4
 // CHECK1-NEXT:    [[TMP172:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 2
@@ -963,7 +963,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1007,7 +1007,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1062,7 +1062,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP56:%.*]] = load i32, ptr [[LA]], align 4
 // CHECK3-NEXT:    [[TMP57:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP56]], 0
 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -1117,7 +1117,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP85:%.*]] = load i32, ptr [[LA]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP85]], 0
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP87]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP87]], align 4
 // CHECK3-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP88]], align 4
 // CHECK3-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 2
@@ -1204,7 +1204,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP132:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD]], 0
 // CHECK3-NEXT:    [[TMP133:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP131]], 0
 // CHECK3-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK3-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP135]], align 4
 // CHECK3-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -1269,7 +1269,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP166:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD36]], 0
 // CHECK3-NEXT:    [[TMP167:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD38]], 0
 // CHECK3-NEXT:    [[TMP168:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP168]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP168]], align 4
 // CHECK3-NEXT:    [[TMP169:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP169]], align 4
 // CHECK3-NEXT:    [[TMP170:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 2
@@ -1552,7 +1552,7 @@ void foo() {
 // CHECK9-NEXT:    [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP13]], 0
 // CHECK9-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1616,7 +1616,7 @@ void foo() {
 // CHECK9-NEXT:    [[TMP50:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP47]], 0
 // CHECK9-NEXT:    [[TMP51:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP49]], 0
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK9-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP53]], align 4
 // CHECK9-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1779,7 +1779,7 @@ void foo() {
 // CHECK11-NEXT:    [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP13]], 0
 // CHECK11-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1843,7 +1843,7 @@ void foo() {
 // CHECK11-NEXT:    [[TMP50:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP47]], 0
 // CHECK11-NEXT:    [[TMP51:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP49]], 0
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK11-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP53]], align 4
 // CHECK11-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2006,7 +2006,7 @@ void foo() {
 // CHECK17-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A2]], align 4
 // CHECK17-NEXT:    [[TMP11:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP10]], 0
 // CHECK17-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK17-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP13]], align 4
 // CHECK17-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2062,7 +2062,7 @@ void foo() {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 123
 // CHECK17-NEXT:    [[TMP38:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD]], 0
 // CHECK17-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK17-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP40]], align 4
 // CHECK17-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -2212,7 +2212,7 @@ void foo() {
 // CHECK19-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A2]], align 4
 // CHECK19-NEXT:    [[TMP11:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP10]], 0
 // CHECK19-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK19-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP13]], align 4
 // CHECK19-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2268,7 +2268,7 @@ void foo() {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 123
 // CHECK19-NEXT:    [[TMP38:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD]], 0
 // CHECK19-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK19-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP40]], align 4
 // CHECK19-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_codegen.cpp b/clang/test/OpenMP/teams_distribute_codegen.cpp
index 0bfadcf70d9bc..11b5c9a212985 100644
--- a/clang/test/OpenMP/teams_distribute_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_codegen.cpp
@@ -254,7 +254,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -314,7 +314,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK1-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -645,7 +645,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -705,7 +705,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK3-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1016,7 +1016,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1226,7 +1226,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1403,7 +1403,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1548,7 +1548,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1730,7 +1730,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK25-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK25-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK25-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK25-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1929,7 +1929,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK25-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK25-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK25-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK25-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2116,7 +2116,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK27-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK27-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK27-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK27-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2314,7 +2314,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK27-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK27-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK27-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK27-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp
index 178af730a8d4a..42b355f39e6b9 100644
--- a/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp
@@ -127,7 +127,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -288,7 +288,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -517,7 +517,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -747,7 +747,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -972,7 +972,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1200,7 +1200,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp
index f21c7e9be9205..39c260996fcbd 100644
--- a/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp
@@ -170,7 +170,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -212,7 +212,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -254,7 +254,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -582,7 +582,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -624,7 +624,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -666,7 +666,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1037,7 +1037,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1108,7 +1108,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1179,7 +1179,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP91]], 1
 // CHECK9-NEXT:    [[TMP92:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK9-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP93]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP93]], align 4
 // CHECK9-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CHECK9-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -1607,7 +1607,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1648,7 +1648,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1689,7 +1689,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK9-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK9-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2057,7 +2057,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2129,7 +2129,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2201,7 +2201,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP93]], 1
 // CHECK11-NEXT:    [[TMP94:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK11-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK11-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP96]], align 4
 // CHECK11-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2626,7 +2626,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2667,7 +2667,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2708,7 +2708,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
index 57175e72b79f8..8b3e657428ec6 100644
--- a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
@@ -306,7 +306,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -589,7 +589,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1110,7 +1110,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1391,7 +1391,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
index 5ee4a5ce9e29d..66c952f3281fb 100644
--- a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
@@ -518,7 +518,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -829,7 +829,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1214,7 +1214,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1523,7 +1523,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp
index a13b565cb38d8..85b20e9b21cfa 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp
@@ -254,7 +254,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -314,7 +314,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK1-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -856,7 +856,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -916,7 +916,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK3-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1430,7 +1430,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1741,7 +1741,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2015,7 +2015,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2233,7 +2233,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2484,7 +2484,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK25-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK25-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK25-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK25-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2784,7 +2784,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK25-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK25-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK25-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK25-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3044,7 +3044,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK27-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK27-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK27-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK27-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3339,7 +3339,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK27-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK27-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK27-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK27-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp
index 64ee660b706cc..f1c74bd6a0d79 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp
@@ -132,7 +132,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -368,7 +368,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -668,7 +668,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1018,7 +1018,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1318,7 +1318,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1670,7 +1670,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
index 37d19f86a9509..2fda35f3e0009 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
@@ -123,7 +123,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -347,7 +347,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -578,7 +578,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -797,7 +797,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp
index eb3d2fc84568a..070bb1ce3fed2 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp
@@ -182,7 +182,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -224,7 +224,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -266,7 +266,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -816,7 +816,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -858,7 +858,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -900,7 +900,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1484,7 +1484,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1555,7 +1555,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1635,7 +1635,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK9-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK9-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2384,7 +2384,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2425,7 +2425,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2475,7 +2475,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK9-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -3086,7 +3086,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3158,7 +3158,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3239,7 +3239,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK11-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK11-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -3973,7 +3973,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4014,7 +4014,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4064,7 +4064,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK11-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
index 904e6a1c1ce7b..0726fb659e7d1 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -340,7 +340,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -753,7 +753,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1404,7 +1404,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1811,7 +1811,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp
index 49ff6a5d08bee..aed27c47fa1d3 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp
@@ -121,7 +121,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -154,7 +154,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -486,7 +486,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -519,7 +519,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -571,7 +571,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1073,7 +1073,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1106,7 +1106,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1158,7 +1158,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
index 7b8df7367b828..06fc87d7da4c6 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -766,7 +766,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1225,7 +1225,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1752,7 +1752,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2205,7 +2205,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp
index a7b1f6eb309ee..6b200403637b6 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp
@@ -97,7 +97,7 @@ int main() {
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -155,7 +155,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i32
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP27]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -556,7 +556,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -589,7 +589,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -634,7 +634,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -676,7 +676,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = zext i8 [[TMP15]] to i32
 // CHECK1-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1384,7 +1384,7 @@ int main() {
 // CHECK5:       invoke.cont:
 // CHECK5-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1442,7 +1442,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i32
 // CHECK5-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP27]], 0
 // CHECK5-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK5-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK5-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1843,7 +1843,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1876,7 +1876,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1921,7 +1921,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1963,7 +1963,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP16:%.*]] = zext i8 [[TMP15]] to i32
 // CHECK5-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp
index dc430fc55787c..4d10d16f47ed8 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp
@@ -263,7 +263,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -533,7 +533,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -996,7 +996,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1260,7 +1260,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp
index 386e772b78970..5e3962d96121c 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp
@@ -60,7 +60,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -93,7 +93,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -411,7 +411,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp
index 7ac42579e91b9..2e800745baf2f 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp
@@ -116,7 +116,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -410,7 +410,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -701,7 +701,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -991,7 +991,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp
index 7518179f47765..2dd2302c0f2cf 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp
@@ -257,7 +257,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -299,7 +299,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -341,7 +341,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -383,7 +383,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -425,7 +425,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1284,7 +1284,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1326,7 +1326,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1368,7 +1368,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1410,7 +1410,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -1452,7 +1452,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2284,7 +2284,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2326,7 +2326,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2368,7 +2368,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK5-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK5-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2410,7 +2410,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK5-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK5-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2452,7 +2452,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK5-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK5-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3311,7 +3311,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK7-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK7-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3353,7 +3353,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK7-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK7-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3395,7 +3395,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK7-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK7-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3437,7 +3437,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK7-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK7-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3479,7 +3479,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK7-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK7-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4369,7 +4369,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK13-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4440,7 +4440,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK13-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK13-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK13-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK13-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -4520,7 +4520,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK13-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK13-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK13-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -4591,7 +4591,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP131]], 1
 // CHECK13-NEXT:    [[TMP132:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK13-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK13-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK13-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -4671,7 +4671,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP171]], 1
 // CHECK13-NEXT:    [[TMP172:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK13-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP173]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP173]], align 4
 // CHECK13-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP174]], align 4
 // CHECK13-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -5863,7 +5863,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5904,7 +5904,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -5954,7 +5954,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK13-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK13-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -5995,7 +5995,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6045,7 +6045,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK13-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -6990,7 +6990,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK15-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7062,7 +7062,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK15-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK15-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK15-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK15-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -7143,7 +7143,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK15-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK15-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK15-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK15-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -7215,7 +7215,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP134]], 1
 // CHECK15-NEXT:    [[TMP135:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK15-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK15-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK15-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -7296,7 +7296,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP175]], 1
 // CHECK15-NEXT:    [[TMP176:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK15-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP177]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP177]], align 4
 // CHECK15-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP178]], align 4
 // CHECK15-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -8463,7 +8463,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK15-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK15-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8504,7 +8504,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK15-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK15-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -8554,7 +8554,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK15-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK15-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -8595,7 +8595,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK15-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK15-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -8645,7 +8645,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK15-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK15-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -9563,7 +9563,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK17-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9634,7 +9634,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK17-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK17-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK17-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK17-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -9714,7 +9714,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK17-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK17-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK17-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -9785,7 +9785,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP131]], 1
 // CHECK17-NEXT:    [[TMP132:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK17-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK17-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK17-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -9865,7 +9865,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP171]], 1
 // CHECK17-NEXT:    [[TMP172:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK17-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP173]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP173]], align 4
 // CHECK17-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP174]], align 4
 // CHECK17-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -11057,7 +11057,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -11098,7 +11098,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK17-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -11148,7 +11148,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK17-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK17-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -11189,7 +11189,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -11239,7 +11239,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK17-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK17-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -12184,7 +12184,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK19-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12256,7 +12256,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK19-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK19-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK19-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK19-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -12337,7 +12337,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK19-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK19-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK19-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK19-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -12409,7 +12409,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP134]], 1
 // CHECK19-NEXT:    [[TMP135:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK19-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK19-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK19-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -12490,7 +12490,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP175]], 1
 // CHECK19-NEXT:    [[TMP176:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK19-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP177]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP177]], align 4
 // CHECK19-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP178]], align 4
 // CHECK19-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -13657,7 +13657,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -13698,7 +13698,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK19-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -13748,7 +13748,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -13789,7 +13789,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK19-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -13839,7 +13839,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK19-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK19-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp
index 854afe3b9bbc8..0a492152e9e01 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp
@@ -262,7 +262,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -331,7 +331,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP61]], 1
 // CHECK1-NEXT:    [[TMP62:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP63]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP63]], align 4
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP64]], align 4
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -933,7 +933,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1002,7 +1002,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP61]], 1
 // CHECK3-NEXT:    [[TMP62:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP63]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP63]], align 4
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP64]], align 4
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1853,7 +1853,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK9-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK9-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK9-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2222,7 +2222,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK11-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK11-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK11-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2718,7 +2718,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK17-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK17-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2986,7 +2986,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK19-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK19-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3412,7 +3412,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK25-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK25-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK25-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK25-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3759,7 +3759,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK25-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK25-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK25-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK25-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4044,7 +4044,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK27-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK27-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK27-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK27-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4386,7 +4386,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK27-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK27-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK27-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK27-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp
index e7b9978dd43cd..910805c310745 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp
@@ -137,7 +137,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -389,7 +389,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -845,7 +845,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1231,7 +1231,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1547,7 +1547,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1935,7 +1935,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
index d3777e81047ca..aa6d6c67c20f1 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
@@ -185,7 +185,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -227,7 +227,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -269,7 +269,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -861,7 +861,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -903,7 +903,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -945,7 +945,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1818,7 +1818,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1889,7 +1889,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1969,7 +1969,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK9-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK9-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2790,7 +2790,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2831,7 +2831,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2881,7 +2881,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK9-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -3534,7 +3534,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3606,7 +3606,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3687,7 +3687,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK11-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK11-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -4493,7 +4493,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4534,7 +4534,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4584,7 +4584,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK11-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
index 4f87d40e58f68..7d9e2ab89676f 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -343,7 +343,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -770,7 +770,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1435,7 +1435,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1856,7 +1856,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp
index 7c86b180ec674..58c1f4155abfb 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp
@@ -118,7 +118,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -151,7 +151,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -511,7 +511,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -544,7 +544,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -596,7 +596,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1140,7 +1140,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1173,7 +1173,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1225,7 +1225,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1758,7 +1758,7 @@ int main() {
 // CHECK3-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1791,7 +1791,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2151,7 +2151,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2184,7 +2184,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2236,7 +2236,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3010,7 +3010,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3043,7 +3043,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3095,7 +3095,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -4219,7 +4219,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4252,7 +4252,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4612,7 +4612,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4645,7 +4645,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4697,7 +4697,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK9-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK9-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK9-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -5241,7 +5241,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5274,7 +5274,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -5326,7 +5326,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK9-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK9-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK9-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -5859,7 +5859,7 @@ int main() {
 // CHECK11-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK11-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5892,7 +5892,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6252,7 +6252,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6285,7 +6285,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6337,7 +6337,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK11-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK11-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK11-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -7111,7 +7111,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7144,7 +7144,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -7196,7 +7196,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK11-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK11-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK11-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
index 268298c14801c..8643cb9bb84a8 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -839,7 +839,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1312,7 +1312,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1853,7 +1853,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2320,7 +2320,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp
index fa8bcf65b8fb6..47d067256fb3a 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp
@@ -99,7 +99,7 @@ int main() {
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -157,7 +157,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i32
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP27]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -586,7 +586,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -619,7 +619,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -664,7 +664,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -706,7 +706,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = zext i8 [[TMP15]] to i32
 // CHECK1-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1819,7 +1819,7 @@ int main() {
 // CHECK5:       invoke.cont:
 // CHECK5-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1877,7 +1877,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i32
 // CHECK5-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP27]], 0
 // CHECK5-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK5-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK5-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2306,7 +2306,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2339,7 +2339,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2384,7 +2384,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2426,7 +2426,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP16:%.*]] = zext i8 [[TMP15]] to i32
 // CHECK5-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp
index 03bee1dbb63c0..f40acb42f9dae 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp
@@ -265,7 +265,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -549,7 +549,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1026,7 +1026,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1304,7 +1304,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
index 7d35ea305f92b..c473e8532189b 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -62,7 +62,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -95,7 +95,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -441,7 +441,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp
index 2dd1db48f0306..669ca4a6b76ec 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp
@@ -120,7 +120,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -428,7 +428,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -733,7 +733,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1037,7 +1037,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp
index ff70e71e01267..3d40ac12e5ec6 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp
@@ -267,7 +267,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -309,7 +309,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -351,7 +351,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -393,7 +393,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -435,7 +435,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1364,7 +1364,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1406,7 +1406,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1448,7 +1448,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1490,7 +1490,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK2-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -1532,7 +1532,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK2-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK2-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2461,7 +2461,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2503,7 +2503,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2545,7 +2545,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK5-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK5-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2587,7 +2587,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK5-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK5-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2629,7 +2629,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK5-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK5-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3531,7 +3531,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK6-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3573,7 +3573,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK6-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK6-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3615,7 +3615,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK6-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK6-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3657,7 +3657,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK6-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK6-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3699,7 +3699,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK6-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK6-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -5044,7 +5044,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK13-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5115,7 +5115,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK13-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK13-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK13-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK13-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -5195,7 +5195,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK13-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK13-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK13-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -5266,7 +5266,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP131]], 1
 // CHECK13-NEXT:    [[TMP132:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK13-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK13-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK13-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -5346,7 +5346,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP171]], 1
 // CHECK13-NEXT:    [[TMP172:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK13-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP173]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP173]], align 4
 // CHECK13-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP174]], align 4
 // CHECK13-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -6658,7 +6658,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6699,7 +6699,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -6749,7 +6749,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK13-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK13-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -6790,7 +6790,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6840,7 +6840,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK13-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -7855,7 +7855,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK14-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK14-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK14-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK14-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7926,7 +7926,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK14-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK14-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK14-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK14-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -8006,7 +8006,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK14-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK14-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK14-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK14-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -8077,7 +8077,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP131]], 1
 // CHECK14-NEXT:    [[TMP132:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK14-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK14-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK14-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -8157,7 +8157,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP171]], 1
 // CHECK14-NEXT:    [[TMP172:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK14-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP173]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP173]], align 4
 // CHECK14-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 4, ptr [[TMP174]], align 4
 // CHECK14-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -9469,7 +9469,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK14-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK14-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9510,7 +9510,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK14-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK14-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -9560,7 +9560,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK14-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK14-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -9601,7 +9601,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK14-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK14-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -9651,7 +9651,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK14-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK14-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -10666,7 +10666,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK17-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -10738,7 +10738,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK17-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK17-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK17-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK17-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -10819,7 +10819,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK17-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK17-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK17-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK17-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -10891,7 +10891,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP134]], 1
 // CHECK17-NEXT:    [[TMP135:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK17-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK17-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK17-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -10972,7 +10972,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP175]], 1
 // CHECK17-NEXT:    [[TMP176:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK17-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP177]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP177]], align 4
 // CHECK17-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP178]], align 4
 // CHECK17-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -12259,7 +12259,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12300,7 +12300,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK17-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -12350,7 +12350,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK17-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK17-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -12391,7 +12391,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -12441,7 +12441,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK17-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK17-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -13429,7 +13429,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK19-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -13501,7 +13501,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK19-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK19-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK19-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK19-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -13582,7 +13582,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK19-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK19-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK19-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK19-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -13654,7 +13654,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP134]], 1
 // CHECK19-NEXT:    [[TMP135:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK19-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK19-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK19-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -13735,7 +13735,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP175]], 1
 // CHECK19-NEXT:    [[TMP176:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK19-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP177]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP177]], align 4
 // CHECK19-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP178]], align 4
 // CHECK19-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -15022,7 +15022,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -15063,7 +15063,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK19-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -15113,7 +15113,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -15154,7 +15154,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK19-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -15204,7 +15204,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK19-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK19-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_private_codegen.cpp
index bdc001fe5d5d8..78b42e3194c79 100644
--- a/clang/test/OpenMP/teams_distribute_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_private_codegen.cpp
@@ -243,7 +243,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -412,7 +412,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -771,7 +771,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -938,7 +938,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp
index 309cbffcf6162..ff2defa849efb 100644
--- a/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp
@@ -105,7 +105,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -285,7 +285,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -462,7 +462,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -642,7 +642,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
index ee5d5cad72fe2..2fab0cff55373 100644
--- a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
@@ -302,7 +302,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP26]], 0
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP29]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -362,7 +362,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP63]], 1
 // CHECK1-NEXT:    [[TMP64:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -737,7 +737,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP26]], 0
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP29]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -797,7 +797,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP63]], 1
 // CHECK3-NEXT:    [[TMP64:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1407,7 +1407,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1629,7 +1629,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1993,7 +1993,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK17-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK17-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2171,7 +2171,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK19-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK19-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2348,7 +2348,7 @@ int main (int argc, char **argv) {
 // CHECK21-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK21-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK21-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK21-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK21-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK21-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK21-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK21-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2576,7 +2576,7 @@ int main (int argc, char **argv) {
 // CHECK23-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK23-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK23-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK23-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK23-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK23-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK23-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK23-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3130,7 +3130,7 @@ int main (int argc, char **argv) {
 // CHECK33-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK33-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK33-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK33-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK33-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK33-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK33-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK33-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3343,7 +3343,7 @@ int main (int argc, char **argv) {
 // CHECK33-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK33-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK33-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK33-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK33-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK33-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK33-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK33-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3537,7 +3537,7 @@ int main (int argc, char **argv) {
 // CHECK35-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK35-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK35-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK35-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK35-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK35-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK35-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK35-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3749,7 +3749,7 @@ int main (int argc, char **argv) {
 // CHECK35-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK35-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK35-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK35-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK35-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK35-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK35-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK35-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3952,7 +3952,7 @@ int main (int argc, char **argv) {
 // CHECK37-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK37-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK37-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK37-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK37-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK37-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK37-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK37-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4212,7 +4212,7 @@ int main (int argc, char **argv) {
 // CHECK37-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK37-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK37-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK37-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK37-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK37-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK37-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK37-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4416,7 +4416,7 @@ int main (int argc, char **argv) {
 // CHECK39-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK39-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK39-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK39-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK39-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK39-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK39-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK39-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4674,7 +4674,7 @@ int main (int argc, char **argv) {
 // CHECK39-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK39-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK39-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK39-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK39-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK39-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK39-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK39-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp
index 292bfb2a296c6..da0e7703e30cd 100644
--- a/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp
@@ -128,7 +128,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -297,7 +297,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -674,7 +674,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -922,7 +922,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1155,7 +1155,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1401,7 +1401,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp
index a18b4d2feeec6..1266534b890f8 100644
--- a/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp
@@ -170,7 +170,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -212,7 +212,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -254,7 +254,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -603,7 +603,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -645,7 +645,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -687,7 +687,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1326,7 +1326,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1397,7 +1397,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1468,7 +1468,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP91]], 1
 // CHECK9-NEXT:    [[TMP92:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK9-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP93]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP93]], align 4
 // CHECK9-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CHECK9-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -1932,7 +1932,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1973,7 +1973,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2014,7 +2014,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK9-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK9-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2403,7 +2403,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2475,7 +2475,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2547,7 +2547,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP93]], 1
 // CHECK11-NEXT:    [[TMP94:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK11-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK11-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP96]], align 4
 // CHECK11-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -3008,7 +3008,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3049,7 +3049,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -3090,7 +3090,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
index dd3daf865512a..2d3fccd90c0a5 100644
--- a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
@@ -306,7 +306,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -596,7 +596,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1124,7 +1124,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1412,7 +1412,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
index 073960cc5a460..ec95ab55b1552 100644
--- a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
@@ -568,7 +568,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -886,7 +886,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1278,7 +1278,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1594,7 +1594,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp
index 52b213c345915..c839268ada753 100644
--- a/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp
@@ -244,7 +244,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -420,7 +420,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -786,7 +786,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -960,7 +960,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp
index e660009f1cd7f..27c47f50547ea 100644
--- a/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp
@@ -105,7 +105,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -292,7 +292,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -476,7 +476,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -663,7 +663,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_firstprivate_codegen.cpp
index c023de8cf010d..649fae99b21ac 100644
--- a/clang/test/OpenMP/teams_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_firstprivate_codegen.cpp
@@ -327,7 +327,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -371,7 +371,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK9-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP44]], align 4
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -663,7 +663,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK9-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP17]], align 4
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -707,7 +707,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK9-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP39]], align 4
 // CHECK9-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1121,7 +1121,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1165,7 +1165,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK11-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP44]], align 4
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1457,7 +1457,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK11-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP17]], align 4
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1501,7 +1501,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK11-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP39]], align 4
 // CHECK11-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1942,7 +1942,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK17-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [8 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [8 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK17-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 8, ptr [[TMP44]], align 4
 // CHECK17-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2180,7 +2180,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK17-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP55]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP55]], align 4
 // CHECK17-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 10, ptr [[TMP56]], align 4
 // CHECK17-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2405,7 +2405,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK19-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [8 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [8 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK19-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 8, ptr [[TMP42]], align 4
 // CHECK19-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2641,7 +2641,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP53]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP53]], align 4
 // CHECK19-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 10, ptr [[TMP54]], align 4
 // CHECK19-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp b/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp
index 85a4dfea91a21..d4a98f07fe24d 100644
--- a/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp
+++ b/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp
@@ -253,7 +253,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -313,7 +313,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK1-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -840,7 +840,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -900,7 +900,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK3-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1399,7 +1399,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1710,7 +1710,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1984,7 +1984,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2202,7 +2202,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2453,7 +2453,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK25-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK25-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK25-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK25-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2753,7 +2753,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK25-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK25-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK25-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK25-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3013,7 +3013,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK27-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK27-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK27-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK27-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3308,7 +3308,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK27-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK27-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK27-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK27-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp
index 49df83c9c765c..c0c04986f147e 100644
--- a/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp
@@ -132,7 +132,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -368,7 +368,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -668,7 +668,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1018,7 +1018,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1318,7 +1318,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1670,7 +1670,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
index 5ef729f044e09..303bce8c648c2 100644
--- a/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
@@ -263,7 +263,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -533,7 +533,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -996,7 +996,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1260,7 +1260,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp
index 4da49eed32efd..d80c003c01099 100644
--- a/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp
@@ -116,7 +116,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -410,7 +410,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -701,7 +701,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -991,7 +991,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_private_codegen.cpp b/clang/test/OpenMP/teams_private_codegen.cpp
index 0126545c5915b..1d0b2435ff005 100644
--- a/clang/test/OpenMP/teams_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_private_codegen.cpp
@@ -219,7 +219,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -405,7 +405,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -557,7 +557,7 @@ int main() {
 // CHECK9-NEXT:    call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
 // CHECK9-NEXT:    call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -724,7 +724,7 @@ int main() {
 // CHECK9-NEXT:    call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
 // CHECK9-NEXT:    call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef signext 3)
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -804,7 +804,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK9-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1037,7 +1037,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1147,7 +1147,7 @@ int main() {
 // CHECK11-NEXT:    call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
 // CHECK11-NEXT:    call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1314,7 +1314,7 @@ int main() {
 // CHECK11-NEXT:    call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
 // CHECK11-NEXT:    call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1394,7 +1394,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK11-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1627,7 +1627,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/Options/HV.hlsl b/clang/test/Options/HV.hlsl
new file mode 100644
index 0000000000000..9f7e1ebc02f25
--- /dev/null
+++ b/clang/test/Options/HV.hlsl
@@ -0,0 +1,20 @@
+// RUN: %clang_dxc -T lib_6_4 -HV 2016 %s 2>&1 -###   | FileCheck -check-prefix=2016 %s
+// RUN: %clang_dxc -T lib_6_4 -HV 2017 %s 2>&1 -###   | FileCheck -check-prefix=2017 %s
+// RUN: %clang_dxc -T lib_6_4 /HV 2018 %s 2>&1 -###   | FileCheck -check-prefix=2018 %s
+// RUN: %clang_dxc -T lib_6_4 /HV 2021 %s 2>&1 -###   | FileCheck -check-prefix=2021 %s
+// RUN: %clang_dxc -T lib_6_4 /HV 202x %s 2>&1 -###   | FileCheck -check-prefix=202x %s
+// RUN: %clang_dxc -T lib_6_4 %s 2>&1 -###   | FileCheck -check-prefix=NO_HV %s
+// RUN: not %clang_dxc -T lib_6_4 /HV gibberish -### %s 2>&1 | FileCheck -check-prefix=CHECK-ERR %s
+
+// 2016: "-std=hlsl2016"
+// 2017: "-std=hlsl2017"
+// 2018: "-std=hlsl2018"
+// 2021: "-std=hlsl2021"
+// 202x: "-std=hlsl202x"
+// NO_HV-NOT: "-std="
+// CHECK-ERR: error: invalid value 'gibberish' in 'HV'
+float4 main(float4 a : A) : SV_TARGET
+{
+  return -a.yxxx;
+}
+
diff --git a/clang/test/PCH/Inputs/bounds-safety-attributed-type.h b/clang/test/PCH/Inputs/bounds-safety-attributed-type.h
new file mode 100644
index 0000000000000..9bfd9615c2bfc
--- /dev/null
+++ b/clang/test/PCH/Inputs/bounds-safety-attributed-type.h
@@ -0,0 +1,4 @@
+struct Test {
+  int count;
+  int fam[] __attribute__((counted_by(count)));
+};
diff --git a/clang/test/PCH/bounds-safety-attributed-type.c b/clang/test/PCH/bounds-safety-attributed-type.c
new file mode 100644
index 0000000000000..c2dc0272bbd60
--- /dev/null
+++ b/clang/test/PCH/bounds-safety-attributed-type.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -include %S/Inputs/bounds-safety-attributed-type.h -fsyntax-only -verify %s
+
+// Test with pch.
+// RUN: %clang_cc1 -emit-pch -o %t %S/Inputs/bounds-safety-attributed-type.h
+// RUN: %clang_cc1 -include-pch %t -fsyntax-only -verify %s
+// RUN: %clang_cc1 -include-pch %t -ast-print %s | FileCheck %s --check-prefix PRINT
+// RUN: %clang_cc1 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix DUMP
+// expected-no-diagnostics
+
+// PRINT: 	   struct Test {
+// PRINT-NEXT:   int count;
+// PRINT-NEXT:   int fam[] __counted_by(count);
+// PRINT-NEXT: };
+
+// DUMP:        RecordDecl {{.*}} imported <undeserialized declarations> struct Test definition
+// DUMP-NEXT:   |-FieldDecl {{.*}} imported referenced count 'int'
+// DUMP-NEXT:   `-FieldDecl {{.*}} imported fam 'int[] __counted_by(count)':'int[]'
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 6ec4dcd60cf60..9f8a8bdeeb9cb 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -342,15 +342,15 @@
 
 // RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s
 // RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s
-// RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s
-// RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto+nofp+nosimd+nocrc+nocrypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s
+// RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto -mabi=aapcs-soft -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s
+// RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto+nofp+nosimd+nocrc+nocrypto -mabi=aapcs-soft -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s
 // RUN: %clang -target aarch64 -march=armv8-a+nosimd -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-3 %s
 // CHECK-MARCH-1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon"
 // CHECK-MARCH-2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "-fp-armv8"{{.*}} "-target-feature" "-neon"
 // CHECK-MARCH-3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "-neon"
 
 // While we're checking +nofp, also make sure it stops defining __ARM_FP
-// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-r+nofp -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-NOFP %s
+// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-r+nofp -mabi=aapcs-soft -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-NOFP %s
 // CHECK-NOFP-NOT: #define __ARM_FP{{ }}
 
 // Check +sm4:
diff --git a/clang/test/Preprocessor/bpf-predefined-macros.c b/clang/test/Preprocessor/bpf-predefined-macros.c
index fea24d1ea0ff7..246cbfa2d6ab7 100644
--- a/clang/test/Preprocessor/bpf-predefined-macros.c
+++ b/clang/test/Preprocessor/bpf-predefined-macros.c
@@ -61,7 +61,7 @@ int r;
 #ifdef __BPF_FEATURE_ST
 int s;
 #endif
-#ifdef __BPF_FEATURE_ARENA_CAST
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
 int t;
 #endif
 
diff --git a/clang/test/Preprocessor/ptrauth_feature.c b/clang/test/Preprocessor/ptrauth_feature.c
new file mode 100644
index 0000000000000..e45c6ea90fd11
--- /dev/null
+++ b/clang/test/Preprocessor/ptrauth_feature.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -E -triple=arm64-- | FileCheck %s --check-prefixes=NOINTRIN
+// RUN: %clang_cc1 %s -E -triple=arm64-- -fptrauth-intrinsics | FileCheck %s --check-prefixes=INTRIN
+
+#if __has_feature(ptrauth_intrinsics)
+// INTRIN: has_ptrauth_intrinsics
+void has_ptrauth_intrinsics() {}
+#else
+// NOINTRIN: no_ptrauth_intrinsics
+void no_ptrauth_intrinsics() {}
+#endif
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index 1a15be1c6e4dc..dfc6d18dee504 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -56,11 +56,14 @@
 // CHECK-NOT: __riscv_xcvmac {{.*$}}
 // CHECK-NOT: __riscv_xcvmem {{.*$}}
 // CHECK-NOT: __riscv_xcvsimd {{.*$}}
+// CHECK-NOT: __riscv_xsfcease {{.*$}}
 // CHECK-NOT: __riscv_xsfvcp {{.*$}}
 // CHECK-NOT: __riscv_xsfvfnrclipxfqf {{.*$}}
 // CHECK-NOT: __riscv_xsfvfwmaccqqq {{.*$}}
 // CHECK-NOT: __riscv_xsfqmaccdod {{.*$}}
 // CHECK-NOT: __riscv_xsfvqmaccqoq {{.*$}}
+// CHECK-NOT: __riscv_xsifivecdiscarddlone {{.*$}}
+// CHECK-NOT: __riscv_xsifivecflushdlone {{.*$}}
 // CHECK-NOT: __riscv_xtheadba {{.*$}}
 // CHECK-NOT: __riscv_xtheadbb {{.*$}}
 // CHECK-NOT: __riscv_xtheadbs {{.*$}}
@@ -517,6 +520,14 @@
 // RUN:   -o - | FileCheck --check-prefix=CHECK-XCVSIMD-EXT %s
 // CHECK-XCVSIMD-EXT: __riscv_xcvsimd 1000000{{$}}
 
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN:   -march=rv32ixsfcease -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XSFCEASE-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN:   -march=rv64ixsfcease -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XSFCEASE-EXT %s
+// CHECK-XSFCEASE-EXT: __riscv_xsfcease 1000000{{$}}
+
 // RUN: %clang --target=riscv32-unknown-linux-gnu \
 // RUN:   -march=rv32ixsfvcp -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-XSFVCP-EXT %s
@@ -557,6 +568,22 @@
 // RUN:   -o - | FileCheck --check-prefix=CHECK-XSFVQMACCQOQ-EXT %s
 // CHECK-XSFVQMACCQOQ-EXT: __riscv_xsfvqmaccqoq 1000000{{$}}
 
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN:   -march=rv32ixsifivecdiscarddlone -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XSIFIVECDISCARDDLONE-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN:   -march=rv64ixsifivecdiscarddlone -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XSIFIVECDISCARDDLONE-EXT %s
+// CHECK-XSIFIVECDISCARDDLONE-EXT: __riscv_xsifivecdiscarddlone 1000000{{$}}
+
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN:   -march=rv32ixsifivecflushdlone -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XSIFIVECFLUSHDLONE-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN:   -march=rv64ixsifivecflushdlone -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XSIFIVECFLUSHDLONE-EXT %s
+// CHECK-XSIFIVECFLUSHDLONE-EXT: __riscv_xsifivecflushdlone 1000000{{$}}
+
 // RUN: %clang --target=riscv32-unknown-linux-gnu \
 // RUN:   -march=rv32ixtheadba -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-XTHEADBA-EXT %s
@@ -1639,9 +1666,9 @@
 // CHECK-MISALIGNED-AVOID: __riscv_misaligned_avoid 1
 
 // RUN: %clang --target=riscv32-unknown-linux-gnu -march=rv32i -E -dM %s \
-// RUN:   -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
+// RUN:   -mno-strict-align -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
 // RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64i -E -dM %s \
-// RUN:   -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
+// RUN:   -mno-strict-align -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
 // RUN: %clang --target=riscv64-unknown-linux-gnu -mcpu=sifive-p450 -E -dM %s \
 // RUN:  -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
 // CHECK-MISALIGNED-FAST: __riscv_misaligned_fast 1
diff --git a/clang/test/Preprocessor/sysroot-prefix.c b/clang/test/Preprocessor/sysroot-prefix.c
index 08c72f53b44e9..eff71f5e3d5a3 100644
--- a/clang/test/Preprocessor/sysroot-prefix.c
+++ b/clang/test/Preprocessor/sysroot-prefix.c
@@ -4,6 +4,16 @@
 // RUN: %clang_cc1 -v -isysroot /var/empty -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_SYSROOT_NULL %s
 // RUN: %clang_cc1 -v -isysroot /var/empty -isysroot /var/empty/root -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSROOT_SYSROOT_NULL %s
 // RUN: %clang_cc1 -v -isysroot /var/empty/root -isysroot /var/empty -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL %s
+// RUN: %clang_cc1 -v -isystem=/usr/include -isysroot /var/empty -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSTEM_SYSROOT %s
+// RUN: %clang_cc1 -v -isystem=/usr/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSTEM_NO_SYSROOT %s
+// RUN: %clang_cc1 -v -iquote=/usr/include -isysroot /var/empty  -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_IQUOTE_SYSROOT %s
+// RUN: %clang_cc1 -v -iquote=/usr/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_IQUOTE_NO_SYSROOT %s
+// RUN: %clang_cc1 -v -idirafter=/usr/include -isysroot /var/empty  -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_IDIRAFTER_SYSROOT %s
+// RUN: %clang_cc1 -v -idirafter=/usr/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_IDIRAFTER_NO_SYSROOT %s
+// RUN: %clang_cc1 -v -isysroot /var/empty -isystem=/usr/include -iwithsysroot /opt/include -isystem=/usr/local/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-INTERLEAVE_I_PATHS %s
+
+// RUN: not %clang_cc1 -v -isysroot /var/empty -E %s -o /dev/null -I 2>&1 | FileCheck -check-prefix CHECK-EMPTY_I_PATH %s
+// RUN: %clang_cc1 -v -isysroot /var/empty/usr -E %s -o /dev/null -I= 2>&1 | FileCheck -check-prefix CHECK-SYSROOT_I_PATH %s
 
 // CHECK-ISYSROOT_NO_SYSROOT: ignoring nonexistent directory "/var/empty/include"
 // CHECK-ISYSROOT_NO_SYSROOT-NOT: ignoring nonexistent directory "/var/empty/var/empty/include"
@@ -23,3 +33,18 @@
 // CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL: ignoring nonexistent directory "/var/empty{{.}}null"
 // CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL-NOT: ignoring nonexistent directory "=null"
 
+// CHECK-ISYSROOT_ISYSTEM_SYSROOT: ignoring nonexistent directory "/var/empty/usr/include"
+// CHECK-ISYSROOT_ISYSTEM_NO_SYSROOT: ignoring nonexistent directory "=/usr/include"
+
+// CHECK-ISYSROOT_IQUOTE_SYSROOT: ignoring nonexistent directory "/var/empty/usr/include"
+// CHECK-ISYSROOT_IQUOTE_NO_SYSROOT: ignoring nonexistent directory "=/usr/include"
+
+// CHECK-ISYSROOT_IDIRAFTER_SYSROOT: ignoring nonexistent directory "/var/empty/usr/include"
+// CHECK-ISYSROOT_IDIRAFTER_NO_SYSROOT: ignoring nonexistent directory "=/usr/include"
+
+// CHECK-INTERLEAVE_I_PATHS: ignoring nonexistent directory "/var/empty/usr/include"
+// CHECK-INTERLEAVE_I_PATHS: ignoring nonexistent directory "/var/empty/opt/include"
+// CHECK-INTERLEAVE_I_PATHS: ignoring nonexistent directory "/var/empty/usr/local/include"
+
+// CHECK-EMPTY_I_PATH: argument to '-I' is missing
+// CHECK-SYSROOT_I_PATH: ignoring nonexistent directory "/var/empty/usr{{/|\\}}"
diff --git a/clang/test/Sema/PR84368.cpp b/clang/test/Sema/PR84368.cpp
new file mode 100644
index 0000000000000..6551df2935892
--- /dev/null
+++ b/clang/test/Sema/PR84368.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -std=c++20 -verify %s
+// RUN: %clang_cc1 -std=c++23 -verify %s
+// expected-no-diagnostics
+
+template<class T> concept IsOk = requires() { typename T::Float; };
+
+template<IsOk T> struct Thing;
+
+template<IsOk T> struct Foobar {
+  template<int> struct Inner {
+    template<IsOk T2> friend struct Thing;
+  };
+};
+
+struct MyType { using Float=float; };
+Foobar<MyType>::Inner<0> foobar;
diff --git a/clang/test/Sema/PR85343.cpp b/clang/test/Sema/PR85343.cpp
new file mode 100644
index 0000000000000..d90ef19d42345
--- /dev/null
+++ b/clang/test/Sema/PR85343.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -std=c++14 -verify %s
+// expected-no-diagnostics
+
+template <typename c> auto ab() -> c ;
+
+template <typename> struct e {};
+
+template <typename f> struct ac {
+  template <typename h> static e<decltype(ab<h>()(ab<int>))> i;
+  decltype(i<f>) j;
+};
+
+struct d {
+  template <typename f>
+  d(f) { 
+    ac<f> a;
+  }
+};
+struct a {
+  d b = [=](auto) { (void)[this] {}; };
+};
+void b() { new a; }
diff --git a/clang/test/Sema/arm-vector-types-support.c b/clang/test/Sema/arm-vector-types-support.c
index 83a83ddfe7801..ed5f5ba175a94 100644
--- a/clang/test/Sema/arm-vector-types-support.c
+++ b/clang/test/Sema/arm-vector-types-support.c
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 %s -triple armv7 -fsyntax-only -verify
+// RUN: %clang_cc1 %s -triple aarch64 -fsyntax-only -verify
+// RUN: %clang_cc1 %s -triple aarch64 -target-feature -fp-armv8 -target-abi aapcs-soft -fsyntax-only -verify
 
 typedef __attribute__((neon_vector_type(2))) int int32x2_t; // expected-error{{'neon_vector_type' attribute is not supported on targets missing 'neon', 'mve', 'sve' or 'sme'; specify an appropriate -march= or -mcpu=}}
 typedef __attribute__((neon_polyvector_type(16))) short poly8x16_t; // expected-error{{'neon_polyvector_type' attribute is not supported on targets missing 'neon' or 'mve'; specify an appropriate -march= or -mcpu=}}
diff --git a/clang/test/Sema/attr-counted-by.c b/clang/test/Sema/attr-counted-by.c
index f14da9c77fa8b..d5d4ebf557392 100644
--- a/clang/test/Sema/attr-counted-by.c
+++ b/clang/test/Sema/attr-counted-by.c
@@ -11,16 +11,64 @@ struct not_found {
 
 struct no_found_count_not_in_substruct {
   unsigned long flags;
-  unsigned char count; // expected-note {{field 'count' declared here}}
+  unsigned char count; // expected-note {{'count' declared here}}
   struct A {
     int dummy;
     int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
   } a;
 };
 
+struct not_found_count_not_in_unnamed_substruct {
+  unsigned char count; // expected-note {{'count' declared here}}
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct_2 {
+  struct {
+    unsigned char count; // expected-note {{'count' declared here}}
+  };
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_in_other_unnamed_substruct {
+  struct {
+    unsigned char count;
+  } a1;
+
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+  };
+};
+
+struct not_found_count_in_other_substruct {
+  struct _a1 {
+    unsigned char count;
+  } a1;
+
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+  };
+};
+
+struct not_found_count_in_other_substruct_2 {
+  struct _a2 {
+    unsigned char count;
+  } a2;
+
+  int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+};
+
 struct not_found_suggest {
-  int bork; // expected-note {{'bork' declared here}}
-  struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'; did you mean 'bork'?}}
+  int bork;
+  struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
 };
 
 int global; // expected-note {{'global' declared here}}
@@ -32,17 +80,17 @@ struct found_outside_of_struct {
 
 struct self_referrential {
   int bork;
-  struct bar *self[] __counted_by(self); // expected-error {{'counted_by' cannot refer to the flexible array 'self'}}
+  struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
 };
 
 struct non_int_count {
-  double dbl_count; // expected-note {{field 'dbl_count' declared here}}
-  struct bar *fam[] __counted_by(dbl_count); // expected-error {{field 'dbl_count' in 'counted_by' must be a non-boolean integer type}}
+  double dbl_count;
+  struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
 };
 
 struct array_of_ints_count {
-  int integers[2]; // expected-note {{field 'integers' declared here}}
-  struct bar *fam[] __counted_by(integers); // expected-error {{field 'integers' in 'counted_by' must be a non-boolean integer type}}
+  int integers[2];
+  struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
 };
 
 struct not_a_fam {
@@ -58,7 +106,7 @@ struct not_a_c99_fam {
 struct annotated_with_anon_struct {
   unsigned long flags;
   struct {
-    unsigned char count; // expected-note {{'count' declared here}}
-    int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'; did you mean 'count'?}}
+    unsigned char count;
+    int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
   };
 };
diff --git a/clang/test/Sema/builtins-arm64.c b/clang/test/Sema/builtins-arm64.c
index e711121f7260f..f094162b3aadc 100644
--- a/clang/test/Sema/builtins-arm64.c
+++ b/clang/test/Sema/builtins-arm64.c
@@ -29,3 +29,12 @@ void test_prefetch(void) {
   __builtin_arm_prefetch(0, 0, 0, 2, 0); // expected-error-re {{argument value {{.*}} is outside the valid range}}
   __builtin_arm_prefetch(0, 0, 0, 0, 2); // expected-error-re {{argument value {{.*}} is outside the valid range}}
 }
+
+void test_trap(short s, unsigned short us) {
+  __builtin_arm_trap(42);
+  __builtin_arm_trap(65535);
+  __builtin_arm_trap(-1);
+  __builtin_arm_trap(65536); // expected-warning {{implicit conversion from 'int' to 'unsigned short' changes value from 65536 to 0}}
+  __builtin_arm_trap(s); // expected-error {{argument to '__builtin_arm_trap' must be a constant integer}}
+  __builtin_arm_trap(us); // expected-error {{argument to '__builtin_arm_trap' must be a constant integer}}
+}
\ No newline at end of file
diff --git a/clang/test/Sema/complex-arithmetic.c b/clang/test/Sema/complex-arithmetic.c
new file mode 100644
index 0000000000000..c9e84da6daa9d
--- /dev/null
+++ b/clang/test/Sema/complex-arithmetic.c
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -verify %s
+// expected-no-diagnostics
+
+// This tests evaluation of _Complex arithmetic at compile time.
+
+#define APPROX_EQ(a, b) (                             \
+  __builtin_fabs(__real (a) - __real (b)) < 0.0001 && \
+  __builtin_fabs(__imag (a) - __imag (b)) < 0.0001    \
+)
+
+#define EVAL(a, b) _Static_assert(a == b, "")
+#define EVALF(a, b) _Static_assert(APPROX_EQ(a, b), "")
+
+// _Complex float + _Complex float
+void a() {
+  EVALF((2.f + 3i) + (4.f + 5i), 6.f + 8i);
+  EVALF((2.f + 3i) - (4.f + 5i), -2.f - 2i);
+  EVALF((2.f + 3i) * (4.f + 5i), -7.f + 22i);
+  EVALF((2.f + 3i) / (4.f + 5i), 0.5609f + 0.0487i);
+
+  EVALF((2. + 3i) + (4. + 5i), 6. + 8i);
+  EVALF((2. + 3i) - (4. + 5i), -2. - 2i);
+  EVALF((2. + 3i) * (4. + 5i), -7. + 22i);
+  EVALF((2. + 3i) / (4. + 5i), .5609 + .0487i);
+}
+
+// _Complex int + _Complex int
+void b() {
+  EVAL((2 + 3i) + (4 + 5i), 6 + 8i);
+  EVAL((2 + 3i) - (4 + 5i), -2 - 2i);
+  EVAL((2 + 3i) * (4 + 5i), -7 + 22i);
+  EVAL((8 + 30i) / (4 + 5i), 4 + 1i);
+}
+
+// _Complex float + float
+void c() {
+  EVALF((2.f + 4i) + 3.f, 5.f + 4i);
+  EVALF((2.f + 4i) - 3.f, -1.f + 4i);
+  EVALF((2.f + 4i) * 3.f, 6.f + 12i);
+  EVALF((2.f + 4i) / 2.f, 1.f + 2i);
+
+  EVALF(3.f + (2.f + 4i), 5.f + 4i);
+  EVALF(3.f - (2.f + 4i), 1.f - 4i);
+  EVALF(3.f * (2.f + 4i), 6.f + 12i);
+  EVALF(3.f / (2.f + 4i), .3f - 0.6i);
+
+  EVALF((2. + 4i) + 3., 5. + 4i);
+  EVALF((2. + 4i) - 3., -1. + 4i);
+  EVALF((2. + 4i) * 3., 6. + 12i);
+  EVALF((2. + 4i) / 2., 1. + 2i);
+
+  EVALF(3. + (2. + 4i), 5. + 4i);
+  EVALF(3. - (2. + 4i), 1. - 4i);
+  EVALF(3. * (2. + 4i), 6. + 12i);
+  EVALF(3. / (2. + 4i), .3 - 0.6i);
+}
+
+// _Complex int + int
+void d() {
+  EVAL((2 + 4i) + 3, 5 + 4i);
+  EVAL((2 + 4i) - 3, -1 + 4i);
+  EVAL((2 + 4i) * 3, 6 + 12i);
+  EVAL((2 + 4i) / 2, 1 + 2i);
+
+  EVAL(3 + (2 + 4i), 5 + 4i);
+  EVAL(3 - (2 + 4i), 1 - 4i);
+  EVAL(3 * (2 + 4i), 6 + 12i);
+  EVAL(20 / (2 + 4i), 2 - 4i);
+}
+
+// _Complex float + int
+void e() {
+  EVALF((2.f + 4i) + 3, 5.f + 4i);
+  EVALF((2.f + 4i) - 3, -1.f + 4i);
+  EVALF((2.f + 4i) * 3, 6.f + 12i);
+  EVALF((2.f + 4i) / 2, 1.f + 2i);
+
+  EVALF(3 + (2.f + 4i), 5.f + 4i);
+  EVALF(3 - (2.f + 4i), 1.f - 4i);
+  EVALF(3 * (2.f + 4i), 6.f + 12i);
+  EVALF(3 / (2.f + 4i), .3f - 0.6i);
+
+  EVALF((2. + 4i) + 3, 5. + 4i);
+  EVALF((2. + 4i) - 3, -1. + 4i);
+  EVALF((2. + 4i) * 3, 6. + 12i);
+  EVALF((2. + 4i) / 2, 1. + 2i);
+
+  EVALF(3 + (2. + 4i), 5. + 4i);
+  EVALF(3 - (2. + 4i), 1. - 4i);
+  EVALF(3 * (2. + 4i), 6. + 12i);
+  EVALF(3 / (2. + 4i), .3 - 0.6i);
+}
+
+// _Complex int + float
+void f() {
+  EVALF((2 + 4i) + 3.f, 5.f + 4i);
+  EVALF((2 + 4i) - 3.f, -1.f + 4i);
+  EVALF((2 + 4i) * 3.f, 6.f + 12i);
+  EVALF((2 + 4i) / 2.f, 1.f + 2i);
+
+  EVALF(3.f + (2 + 4i), 5.f + 4i);
+  EVALF(3.f - (2 + 4i), 1.f - 4i);
+  EVALF(3.f * (2 + 4i), 6.f + 12i);
+  EVALF(3.f / (2 + 4i), .3f - 0.6i);
+
+  EVALF((2 + 4i) + 3., 5. + 4i);
+  EVALF((2 + 4i) - 3., -1. + 4i);
+  EVALF((2 + 4i) * 3., 6. + 12i);
+  EVALF((2 + 4i) / 2., 1. + 2i);
+
+  EVALF(3. + (2 + 4i), 5. + 4i);
+  EVALF(3. - (2 + 4i), 1. - 4i);
+  EVALF(3. * (2 + 4i), 6. + 12i);
+  EVALF(3. / (2 + 4i), .3 - 0.6i);
+}
diff --git a/clang/test/Sema/const-eval.c b/clang/test/Sema/const-eval.c
index 2e38d5e23c208..e358aceaad5a4 100644
--- a/clang/test/Sema/const-eval.c
+++ b/clang/test/Sema/const-eval.c
@@ -134,8 +134,7 @@ void PR21945(void) { int i = (({}), 0l); }
 
 void PR24622(void);
 struct PR24622 {} pr24622;
-EVAL_EXPR(52, &pr24622 == (void *)&PR24622); // expected-error {{not an integer constant expression}}
-                                             // expected-note@-1 {{past the end}}
+EVAL_EXPR(52, &pr24622 == (void *)&PR24622);
 
 // We evaluate these by providing 2s' complement semantics in constant
 // expressions, like we do for integers.
diff --git a/clang/test/Sema/constexpr-void-cast.c b/clang/test/Sema/constexpr-void-cast.c
new file mode 100644
index 0000000000000..91e4027f67fe3
--- /dev/null
+++ b/clang/test/Sema/constexpr-void-cast.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -x c -fsyntax-only %s -verify=c -std=c11
+// RUN: %clang_cc1 -x c -fsyntax-only %s -pedantic -verify=c-pedantic -std=c11
+//
+// RUN: %clang_cc1 -x c++ -fsyntax-only %s -verify=cxx
+// RUN: %clang_cc1 -x c++ -fsyntax-only %s -pedantic -verify=cxx-pedantic
+
+// c-no-diagnostics
+// cxx-no-diagnostics
+
+void f(void);
+struct S {char c;} s;
+_Static_assert(&s != (void *)&f, ""); // c-pedantic-warning {{not an integer constant expression}} \
+                                      // c-pedantic-note {{this conversion is not allowed in a constant expression}} \
+                                      // cxx-pedantic-warning {{'_Static_assert' is a C11 extension}}
diff --git a/clang/test/Sema/ptrauth-intrinsics-macro.c b/clang/test/Sema/ptrauth-intrinsics-macro.c
new file mode 100644
index 0000000000000..07d6374045145
--- /dev/null
+++ b/clang/test/Sema/ptrauth-intrinsics-macro.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -Wall -fsyntax-only -verify -fptrauth-intrinsics %s
+// RUN: %clang_cc1 -triple arm64-apple-ios -Wall -fsyntax-only -verify %s
+
+// expected-no-diagnostics
+
+#include <ptrauth.h>
+
+#define VALID_CODE_KEY 0
+#define VALID_DATA_KEY 2
+
+extern int dv;
+
+void test(int *dp, int value) {
+  dp = ptrauth_strip(dp, VALID_DATA_KEY);
+  ptrauth_extra_data_t t0 = ptrauth_blend_discriminator(dp, value);
+  (void)t0;
+  dp = ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, 0);
+  dp = ptrauth_auth_and_resign(dp, VALID_DATA_KEY, dp, VALID_DATA_KEY, dp);
+  dp = ptrauth_auth_data(dp, VALID_DATA_KEY, 0);
+  int pu0 = 0, pu1 = 0, pu2 = 0, pu3 = 0, pu4 = 0, pu5 = 0, pu6 = 0, pu7 = 0;
+  ptrauth_blend_discriminator(&pu0, value);
+  ptrauth_auth_and_resign(&pu1, VALID_DATA_KEY, dp, VALID_DATA_KEY, dp);
+  ptrauth_auth_and_resign(dp, VALID_DATA_KEY, &pu2, VALID_DATA_KEY, dp);
+  ptrauth_auth_and_resign(dp, VALID_DATA_KEY, dp, VALID_DATA_KEY, &pu3);
+  ptrauth_sign_generic_data(pu4, dp);
+  ptrauth_sign_generic_data(dp, pu5);
+  ptrauth_auth_data(&pu6, VALID_DATA_KEY, value);
+  ptrauth_auth_data(dp, VALID_DATA_KEY, pu7);
+
+
+
+  int t2 = ptrauth_sign_generic_data(dp, 0);
+  (void)t2;
+}
diff --git a/clang/test/Sema/ptrauth.c b/clang/test/Sema/ptrauth.c
new file mode 100644
index 0000000000000..3ad3d70c24e41
--- /dev/null
+++ b/clang/test/Sema/ptrauth.c
@@ -0,0 +1,126 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify -fptrauth-intrinsics %s
+
+#if __has_feature(ptrauth_intrinsics)
+#warning Pointer authentication enabled!
+// expected-warning@-1 {{Pointer authentication enabled!}}
+#endif
+
+#if __aarch64__
+#define VALID_CODE_KEY 0
+#define VALID_DATA_KEY 2
+#define INVALID_KEY 200
+#else
+#error Provide these constants if you port this test
+#endif
+
+#define NULL ((void*) 0)
+struct A { int x; } mismatched_type;
+
+extern int dv;
+extern int fv(int);
+
+void test_strip(int *dp, int (*fp)(int)) {
+  __builtin_ptrauth_strip(dp); // expected-error {{too few arguments}}
+  __builtin_ptrauth_strip(dp, VALID_DATA_KEY, dp); // expected-error {{too many arguments}}
+  (void) __builtin_ptrauth_strip(NULL, VALID_DATA_KEY); // no warning
+
+  __builtin_ptrauth_strip(mismatched_type, VALID_DATA_KEY); // expected-error {{signed value must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_strip(dp, mismatched_type); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+
+  int *dr = __builtin_ptrauth_strip(dp, VALID_DATA_KEY);
+  dr = __builtin_ptrauth_strip(dp, INVALID_KEY); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  int (*fr)(int) = __builtin_ptrauth_strip(fp, VALID_CODE_KEY);
+  fr = __builtin_ptrauth_strip(fp, INVALID_KEY); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  float *mismatch = __builtin_ptrauth_strip(dp, VALID_DATA_KEY); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}}
+}
+
+void test_blend_discriminator(int *dp, int (*fp)(int), int value) {
+  __builtin_ptrauth_blend_discriminator(dp); // expected-error {{too few arguments}}
+  __builtin_ptrauth_blend_discriminator(dp, dp, dp); // expected-error {{too many arguments}}
+  (void) __builtin_ptrauth_blend_discriminator(dp, value); // no warning
+
+  __builtin_ptrauth_blend_discriminator(mismatched_type, value); // expected-error {{blended pointer must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_blend_discriminator(dp, mismatched_type); // expected-error {{blended integer must have integer type; type here is 'struct A'}}
+
+  float *mismatch = __builtin_ptrauth_blend_discriminator(dp, value); // expected-error {{incompatible integer to pointer conversion initializing 'float *' with an expression of type}}
+}
+
+void test_sign_unauthenticated(int *dp, int (*fp)(int)) {
+  __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY); // expected-error {{too few arguments}}
+  __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, dp, dp); // expected-error {{too many arguments}}
+
+  __builtin_ptrauth_sign_unauthenticated(mismatched_type, VALID_DATA_KEY, 0); // expected-error {{signed value must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_sign_unauthenticated(dp, mismatched_type, 0); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+  __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+
+  (void) __builtin_ptrauth_sign_unauthenticated(NULL, VALID_DATA_KEY, 0); // expected-warning {{signing a null pointer will yield a non-null pointer}}
+
+  int *dr = __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, 0);
+  dr = __builtin_ptrauth_sign_unauthenticated(dp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  int (*fr)(int) = __builtin_ptrauth_sign_unauthenticated(fp, VALID_CODE_KEY, 0);
+  fr = __builtin_ptrauth_sign_unauthenticated(fp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  float *mismatch = __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, 0); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}}
+}
+
+void test_auth(int *dp, int (*fp)(int)) {
+  __builtin_ptrauth_auth(dp, VALID_DATA_KEY); // expected-error {{too few arguments}}
+  __builtin_ptrauth_auth(dp, VALID_DATA_KEY, dp, dp); // expected-error {{too many arguments}}
+
+  __builtin_ptrauth_auth(mismatched_type, VALID_DATA_KEY, 0); // expected-error {{signed value must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_auth(dp, mismatched_type, 0); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+  __builtin_ptrauth_auth(dp, VALID_DATA_KEY, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+
+  (void) __builtin_ptrauth_auth(NULL, VALID_DATA_KEY, 0); // expected-warning {{authenticating a null pointer will almost certainly trap}}
+
+  int *dr = __builtin_ptrauth_auth(dp, VALID_DATA_KEY, 0);
+  dr = __builtin_ptrauth_auth(dp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  int (*fr)(int) = __builtin_ptrauth_auth(fp, VALID_CODE_KEY, 0);
+  fr = __builtin_ptrauth_auth(fp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  float *mismatch = __builtin_ptrauth_auth(dp, VALID_DATA_KEY, 0); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}}
+}
+
+void test_auth_and_resign(int *dp, int (*fp)(int)) {
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY); // expected-error {{too few arguments}}
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, dp, VALID_DATA_KEY, dp, 0); // expected-error {{too many arguments}}
+
+  __builtin_ptrauth_auth_and_resign(mismatched_type, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp); // expected-error {{signed value must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_auth_and_resign(dp, mismatched_type, 0, VALID_DATA_KEY, dp); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, mismatched_type, VALID_DATA_KEY, dp); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, mismatched_type, dp); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+
+  (void) __builtin_ptrauth_auth_and_resign(NULL, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp); // expected-warning {{authenticating a null pointer will almost certainly trap}}
+
+  int *dr = __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp);
+  dr = __builtin_ptrauth_auth_and_resign(dp, INVALID_KEY, 0, VALID_DATA_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+  dr = __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, INVALID_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  int (*fr)(int) = __builtin_ptrauth_auth_and_resign(fp, VALID_CODE_KEY, 0, VALID_CODE_KEY, dp);
+  fr = __builtin_ptrauth_auth_and_resign(fp, INVALID_KEY, 0, VALID_CODE_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+  fr = __builtin_ptrauth_auth_and_resign(fp, VALID_CODE_KEY, 0, INVALID_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  float *mismatch = __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}}
+}
+
+void test_sign_generic_data(int *dp) {
+  __builtin_ptrauth_sign_generic_data(dp); // expected-error {{too few arguments}}
+  __builtin_ptrauth_sign_generic_data(dp, 0, 0); // expected-error {{too many arguments}}
+
+  __builtin_ptrauth_sign_generic_data(mismatched_type, 0); // expected-error {{signed value must have pointer or integer type; type here is 'struct A'}}
+  __builtin_ptrauth_sign_generic_data(dp, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+
+  (void) __builtin_ptrauth_sign_generic_data(NULL, 0); // no warning
+
+  unsigned long dr = __builtin_ptrauth_sign_generic_data(dp, 0);
+  dr = __builtin_ptrauth_sign_generic_data(dp, &dv);
+  dr = __builtin_ptrauth_sign_generic_data(12314, 0);
+  dr = __builtin_ptrauth_sign_generic_data(12314, &dv);
+
+  int *mismatch = __builtin_ptrauth_sign_generic_data(dp, 0); // expected-error {{incompatible integer to pointer conversion initializing 'int *' with an expression of type}}
+}
diff --git a/clang/test/Sema/warn-bitwise-and-bool.c b/clang/test/Sema/warn-bitwise-and-bool.c
index 6bec1be1abdef..c30498e2bd8d1 100644
--- a/clang/test/Sema/warn-bitwise-and-bool.c
+++ b/clang/test/Sema/warn-bitwise-and-bool.c
@@ -19,6 +19,8 @@ boolean baz(void) __attribute__((const));
 void sink(boolean);
 
 #define FOO foo()
+#define MY_AND &
+#define My_BITAND bitand
 
 void test(boolean a, boolean b, int *p, volatile int *q, int i) {
   b = a & b;
@@ -44,9 +46,12 @@ void test(boolean a, boolean b, int *p, volatile int *q, int i) {
   b = b & foo();
   b = bar() & (i > 4);
   b = (i == 7) & foo();
+  b = b MY_AND foo();     // OK, no warning expected
+
 #ifdef __cplusplus
-  b = foo() bitand bar(); // expected-warning {{use of bitwise '&' with boolean operands}}
-                          // expected-note@-1 {{cast one or both operands to int to silence this warning}}
+  b = foo() bitand bar(); // Ok, no warning expected
+  b = foo() My_BITAND bar(); // Ok, no warning expected
+                          
 #endif
 
   if (foo() & bar())      // expected-warning {{use of bitwise '&' with boolean operands}}
@@ -60,4 +65,5 @@ void test(boolean a, boolean b, int *p, volatile int *q, int i) {
 
   int n = i + 10;
   b = (n & (n - 1));
+
 }
diff --git a/clang/test/Sema/warn-bitwise-or-bool.c b/clang/test/Sema/warn-bitwise-or-bool.c
index ae86790901aac..e45ef28da0a70 100644
--- a/clang/test/Sema/warn-bitwise-or-bool.c
+++ b/clang/test/Sema/warn-bitwise-or-bool.c
@@ -19,6 +19,8 @@ boolean baz(void) __attribute__((const));
 void sink(boolean);
 
 #define FOO foo()
+#define MY_OR |
+#define My_BITOR bitor
 
 void test(boolean a, boolean b, int *p, volatile int *q, int i) {
   b = a | b;
@@ -44,9 +46,11 @@ void test(boolean a, boolean b, int *p, volatile int *q, int i) {
   b = b | foo();
   b = bar() | (i > 4);
   b = (i == 7) | foo();
+  b = b MY_OR foo();     // OK, no warning expected
 #ifdef __cplusplus
-  b = foo() bitor bar();  // expected-warning {{use of bitwise '|' with boolean operands}}
-                          // expected-note@-1 {{cast one or both operands to int to silence this warning}}
+  b = foo() bitor bar();  //Ok, no warning expected
+  b = foo() My_BITOR bar(); // Ok, no warning expected
+                          
 #endif
 
   if (foo() | bar())      // expected-warning {{use of bitwise '|' with boolean operands}}
diff --git a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
index c5b8032f40b13..7520b43a194ab 100644
--- a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
+++ b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
@@ -90,7 +90,8 @@ void test_record() {
   struct tuple4 {
     unsigned x, y, z, doublez;
 
-    constexpr bool operator==(tuple4 const &other) const {
+    bool operator==(tuple4 const &other) const = default;
+    constexpr bool operator==(bases const &other) const {
       return x == other.x && y == other.y &&
              z == other.z && doublez == other.doublez;
     }
@@ -99,6 +100,9 @@ void test_record() {
   constexpr tuple4 t4 = bit_cast<tuple4>(b);
   static_assert(t4 == tuple4{1, 2, 3, 4});
   static_assert(round_trip<tuple4>(b));
+
+  constexpr auto b2 = bit_cast<bases>(t4);
+  static_assert(t4 == b2);
 }
 
 void test_partially_initialized() {
diff --git a/clang/test/SemaCXX/constexpr-explicit-object-lambda.cpp b/clang/test/SemaCXX/constexpr-explicit-object-lambda.cpp
new file mode 100644
index 0000000000000..4e8e94d428d07
--- /dev/null
+++ b/clang/test/SemaCXX/constexpr-explicit-object-lambda.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -std=c++23 -verify %s
+// expected-no-diagnostics
+
+struct S {
+  int i = 42;
+  constexpr auto f1() {
+    return [this](this auto) {
+      return this->i;
+    }();
+  };
+
+  constexpr auto f2() {
+    return [this](this auto&&) {
+      return this->i;
+    }();
+  };
+
+  constexpr auto f3() {
+    return [i = this->i](this auto) {
+      return i;
+    }();
+  };
+
+  constexpr auto f4() {
+    return [i = this->i](this auto&&) {
+      return i;
+    }();
+  };
+};
+
+static_assert(S().f1() == 42);
+static_assert(S().f2() == 42);
+static_assert(S().f3() == 42);
+static_assert(S().f4() == 42);
diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
index 2d7c9b174d901..478da092471af 100644
--- a/clang/test/SemaCXX/cxx23-assume.cpp
+++ b/clang/test/SemaCXX/cxx23-assume.cpp
@@ -126,3 +126,13 @@ static_assert(f5<D>() == 1); // expected-note 3 {{while checking constraint sati
 static_assert(f5<double>() == 2);
 static_assert(f5<E>() == 1); // expected-note {{while checking constraint satisfaction}} expected-note {{in instantiation of}}
 static_assert(f5<F>() == 2); // expected-note {{while checking constraint satisfaction}} expected-note {{in instantiation of}}
+
+// Do not validate assumptions whose evaluation would have side-effects.
+constexpr int foo() {
+  int a = 0;
+  [[assume(a++)]] [[assume(++a)]]; // expected-warning 2 {{has side effects that will be discarded}} ext-warning 2 {{C++23 extension}}
+  [[assume((a+=1))]]; // expected-warning {{has side effects that will be discarded}} ext-warning {{C++23 extension}}
+  return a;
+}
+
+static_assert(foo() == 0);
diff --git a/clang/test/SemaCXX/datasizeof.cpp b/clang/test/SemaCXX/datasizeof.cpp
index 5baf2ecb24ed7..43135c0049663 100644
--- a/clang/test/SemaCXX/datasizeof.cpp
+++ b/clang/test/SemaCXX/datasizeof.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64-linux-gnu -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple x86_64-linux-gnu -verify %s -fexperimental-new-constant-interpreter
 
 #if !__has_extension(datasizeof)
 #  error "Expected datasizeof extension"
diff --git a/clang/test/SemaCXX/decomposed-condition.cpp b/clang/test/SemaCXX/decomposed-condition.cpp
index ab011f6ae4ba4..e55bbee3134ca 100644
--- a/clang/test/SemaCXX/decomposed-condition.cpp
+++ b/clang/test/SemaCXX/decomposed-condition.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c++1z -Wno-binding-in-condition -verify %s
+// RUN: %clang_cc1 -std=c++1z -Wno-binding-in-condition -verify %s -fexperimental-new-constant-interpreter
 
 struct X {
   bool flag;
diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp
index 0516a5da31ae9..389002ab0e349 100644
--- a/clang/test/SemaCXX/lambda-expressions.cpp
+++ b/clang/test/SemaCXX/lambda-expressions.cpp
@@ -1,3 +1,4 @@
+// RUN: %clang_cc1 -std=c++11 -Wno-unused-value -fsyntax-only -verify=expected,expected-cxx14,cxx11 -fblocks %s
 // RUN: %clang_cc1 -std=c++14 -Wno-unused-value -fsyntax-only -verify -verify=expected-cxx14 -fblocks %s
 // RUN: %clang_cc1 -std=c++17 -Wno-unused-value -verify -ast-dump -fblocks %s | FileCheck %s
 
@@ -558,8 +559,8 @@ struct B {
   int x;
   A a = [&] { int y = x; };
   A b = [&] { [&] { [&] { int y = x; }; }; };
-  A d = [&](auto param) { int y = x; };
-  A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; };
+  A d = [&](auto param) { int y = x; }; // cxx11-error {{'auto' not allowed in lambda parameter}}
+  A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx11-error 2 {{'auto' not allowed in lambda parameter}}
 };
 
 B<int> b;
@@ -589,6 +590,7 @@ struct S1 {
 void foo1() {
   auto s0 = S1{[name=]() {}}; // expected-error 2 {{expected expression}}
   auto s1 = S1{[name=name]() {}}; // expected-error {{use of undeclared identifier 'name'; did you mean 'name1'?}}
+                                  // cxx11-warning@-1 {{initialized lambda captures are a C++14 extension}}
 }
 }
 
@@ -604,7 +606,7 @@ namespace PR25627_dont_odr_use_local_consts {
 
 namespace ConversionOperatorDoesNotHaveDeducedReturnType {
   auto x = [](int){};
-  auto y = [](auto &v) -> void { v.n = 0; };
+  auto y = [](auto &v) -> void { v.n = 0; }; // cxx11-error {{'auto' not allowed in lambda parameter}} cxx11-note {{candidate function not viable}} cxx11-note {{conversion candidate}}
   using T = decltype(x);
   using U = decltype(y);
   using ExpectedTypeT = void (*)(int);
@@ -624,22 +626,22 @@ namespace ConversionOperatorDoesNotHaveDeducedReturnType {
     template<typename T>
       friend constexpr U::operator ExpectedTypeU<T>() const noexcept;
 #else
-    friend auto T::operator()(int) const;
+    friend auto T::operator()(int) const; // cxx11-error {{'auto' return without trailing return type; deduced return types are a C++14 extension}}
     friend T::operator ExpectedTypeT() const;
 
     template<typename T>
-      friend void U::operator()(T&) const;
+      friend void U::operator()(T&) const; // cxx11-error {{friend declaration of 'operator()' does not match any declaration}}
     // FIXME: This should not match, as above.
     template<typename T>
-      friend U::operator ExpectedTypeU<T>() const;
+      friend U::operator ExpectedTypeU<T>() const; // cxx11-error {{friend declaration of 'operator void (*)(type-parameter-0-0 &)' does not match any declaration}}
 #endif
 
   private:
     int n;
   };
 
-  // Should be OK: lambda's call operator is a friend.
-  void use(X &x) { y(x); }
+  // Should be OK in C++14 and later: lambda's call operator is a friend.
+  void use(X &x) { y(x); } // cxx11-error {{no matching function for call to object}}
 
   // This used to crash in return type deduction for the conversion opreator.
   struct A { int n; void f() { +[](decltype(n)) {}; } };
@@ -733,6 +735,8 @@ void GH67492() {
   auto lambda = (test, []() noexcept(true) {});
 }
 
+// FIXME: This currently causes clang to crash in C++11 mode.
+#if __cplusplus >= 201402L
 namespace GH83267 {
 auto l = [](auto a) { return 1; };
 using type = decltype(l);
@@ -747,3 +751,4 @@ using t = decltype(ll);
 template auto t::operator()<int>(int a) const; // expected-note {{in instantiation}}
 
 }
+#endif
diff --git a/clang/test/SemaCXX/source_location.cpp b/clang/test/SemaCXX/source_location.cpp
index 1699f54e4830b..e87d95577ebea 100644
--- a/clang/test/SemaCXX/source_location.cpp
+++ b/clang/test/SemaCXX/source_location.cpp
@@ -1,14 +1,14 @@
 // RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fexceptions -verify %s
 // RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -DUSE_CONSTEVAL -fexceptions -verify %s
 // RUN: %clang_cc1 -std=c++2b -fcxx-exceptions -DUSE_CONSTEVAL -DPAREN_INIT -fexceptions -verify %s
-// RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fms-extensions -DMS -fexceptions -verify %s
-// RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -fms-extensions -DMS -DUSE_CONSTEVAL -fexceptions -verify %s
+// RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fms-extensions -DMS -fexceptions -fms-compatibility -verify %s
+// RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -fms-extensions -DMS -DUSE_CONSTEVAL -fexceptions -fms-compatibility -verify %s
 //
 // RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
 // RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -DUSE_CONSTEVAL -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
 // RUN: %clang_cc1 -std=c++2b -fcxx-exceptions -DUSE_CONSTEVAL -DPAREN_INIT -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
-// RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fms-extensions -DMS -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
-// RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -fms-extensions -DMS -DUSE_CONSTEVAL -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
+// RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fms-extensions -DMS -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -fms-compatibility -verify %s
+// RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -fms-extensions -DMS -DUSE_CONSTEVAL -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify -fms-compatibility %s
 // expected-no-diagnostics
 
 #define assert(...) ((__VA_ARGS__) ? ((void)0) : throw 42)
@@ -463,8 +463,70 @@ void ctor_tests() {
 constexpr SL global_sl = SL::current();
 static_assert(is_equal(global_sl.function(), ""));
 
+template <class T>
+class TestBI {
+public:
+   TestBI() {
+#ifdef MS
+     static_assert(is_equal(__FUNCTION__, "test_func::TestBI<int>::TestBI"));
+#else
+     static_assert(is_equal(__FUNCTION__, "TestBI"));
+#endif
+     static_assert(is_equal(__func__, "TestBI"));
+   }
+};
+
+template <class T>
+class TestClass {
+public:
+   TestClass() {
+#ifdef MS
+      static_assert(is_equal(__FUNCTION__, "test_func::TestClass<class test_func::C>::TestClass"));
+#else
+      static_assert(is_equal(__FUNCTION__, "TestClass"));
+#endif
+      static_assert(is_equal(__func__, "TestClass"));
+   }
+};
+
+template <class T>
+class TestStruct {
+public:
+   TestStruct() {
+#ifdef MS
+      static_assert(is_equal(__FUNCTION__, "test_func::TestStruct<struct test_func::S>::TestStruct"));
+#else
+      static_assert(is_equal(__FUNCTION__, "TestStruct"));
+#endif
+      static_assert(is_equal(__func__, "TestStruct"));
+   }
+};
+
+template <class T>
+class TestEnum {
+public:
+   TestEnum() {
+#ifdef MS
+      static_assert(is_equal(__FUNCTION__, "test_func::TestEnum<enum test_func::E>::TestEnum"));
+#else
+      static_assert(is_equal(__FUNCTION__, "TestEnum"));
+#endif
+      static_assert(is_equal(__func__, "TestEnum"));
+   }
+};
+
+class C {};
+struct S {};
+enum E {};
+
+TestBI<int> t1;
+TestClass<test_func::C> t2;
+TestStruct<test_func::S> t3;
+TestEnum<test_func::E> t4;
+
 } // namespace test_func
 
+
 //===----------------------------------------------------------------------===//
 //                            __builtin_FUNCSIG()
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/SemaCXX/warn-constant-evaluated-constexpr.cpp b/clang/test/SemaCXX/warn-constant-evaluated-constexpr.cpp
index 35dc69cccb3a2..fa40c13971118 100644
--- a/clang/test/SemaCXX/warn-constant-evaluated-constexpr.cpp
+++ b/clang/test/SemaCXX/warn-constant-evaluated-constexpr.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c++2a -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++2a -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
 
 namespace std {
 constexpr bool is_constant_evaluated() noexcept {
diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
new file mode 100644
index 0000000000000..4c0e5315ce532
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float2 test_no_second_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_clamp(p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+float2 test_no_third_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+float2 test_clamp_no_second_arg(float2 p0) {
+  return clamp(p0);
+  // expected-error@-1 {{no matching function for call to 'clamp'}}
+}
+
+float2 test_clamp_vector_size_mismatch(float3 p0, float2 p1) {
+  return clamp(p0, p0, p1);
+  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
+}
+
+float2 test_clamp_builtin_vector_size_mismatch(float3 p0, float2 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must have the same type}}
+}
+
+float test_clamp_scalar_mismatch(float p0, half p1) {
+  return clamp(p1, p0, p1);
+  // expected-error@-1 {{call to 'clamp' is ambiguous}}
+}
+
+float2 test_clamp_element_type_mismatch(half2 p0, float2 p1) {
+  return clamp(p1, p0, p1);
+  // expected-error@-1 {{call to 'clamp' is ambiguous}}
+}
+
+float2 test_builtin_clamp_float2_splat(float p0, float2 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float3 test_builtin_clamp_float3_splat(float p0, float3 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float4 test_builtin_clamp_float4_splat(float p0, float4 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float2 test_clamp_float2_int_splat(float2 p0, int p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float3 test_clamp_float3_int_splat(float3 p0, int p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float2 test_builtin_clamp_int_vect_to_float_vec_promotion(int2 p0, float p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float test_builtin_clamp_bool_type_promotion(bool p0) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p0);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion(float p0, bool p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
+  return __builtin_hlsl_elementwise_clamp(p1, p0, p1);
+  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_clamp_int_to_float_promotion(float p0, int p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
index 59eb9482b9ef9..ba7ffc20484ae 100644
--- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
@@ -108,3 +108,12 @@ int test_builtin_dot_bool_type_promotion(bool p0, bool p1) {
   return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
 }
+
+double test_dot_double(double2 p0, double2 p1) {
+  return dot(p0, p1);
+  // expected-error@-1 {{call to 'dot' is ambiguous}}
+}
+double test_dot_double_builtin(double2 p0, double2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
index 06dbdf0a68dfc..f82b5942fd46b 100644
--- a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
@@ -13,7 +13,7 @@ float2 test_too_many_arg(float2 p0) {
 
 float builtin_bool_to_float_type_promotion(bool p1) {
   return __builtin_hlsl_elementwise_frac(p1);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
 }
 
 float builtin_frac_int_to_float_promotion(int p1) {
@@ -25,3 +25,15 @@ float2 builtin_frac_int2_to_float2_promotion(int2 p1) {
   return __builtin_hlsl_elementwise_frac(p1);
   // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
 }
+
+// builtins are variadic functions and so are subject to DefaultVariadicArgumentPromotion
+half builtin_frac_half_scalar (half p0) {
+  return __builtin_hlsl_elementwise_frac (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+float builtin_frac_float_scalar ( float p0) {
+  return __builtin_hlsl_elementwise_frac (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
diff --git a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
new file mode 100644
index 0000000000000..7ddfd56638273
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
@@ -0,0 +1,38 @@
+
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
+
+bool test_too_few_arg() {
+  return __builtin_hlsl_elementwise_isinf();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+bool2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_isinf(p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+bool builtin_bool_to_float_type_promotion(bool p1) {
+  return __builtin_hlsl_elementwise_isinf(p1);
+  // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}}
+}
+
+bool builtin_isinf_int_to_float_promotion(int p1) {
+  return __builtin_hlsl_elementwise_isinf(p1);
+  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+}
+
+bool2 builtin_isinf_int2_to_float2_promotion(int2 p1) {
+  return __builtin_hlsl_elementwise_isinf(p1);
+  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
+
+// builtins are variadic functions and so are subject to DefaultVariadicArgumentPromotion
+half builtin_isinf_half_scalar (half p0) {
+  return __builtin_hlsl_elementwise_isinf (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+float builtin_isinf_float_scalar ( float p0) {
+  return __builtin_hlsl_elementwise_isinf (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
index f6ce87e7c33e3..83751f68357ed 100644
--- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
@@ -92,5 +92,18 @@ float builtin_lerp_int_to_float_promotion(float p0, int p1) {
 
 float4 test_lerp_int4(int4 p0, int4 p1, int4 p2) {
   return __builtin_hlsl_lerp(p0, p1, p2);
-   // expected-error@-1 {{1st argument must be a floating point type (was 'int4' (aka 'vector<int, 4>'))}}
-}
\ No newline at end of file
+  // expected-error@-1 {{1st argument must be a floating point type (was 'int4' (aka 'vector<int, 4>'))}}
+}
+
+// note: DefaultVariadicArgumentPromotion --> DefaultArgumentPromotion has already promoted to double
+// we don't know anymore that the input was half when __builtin_hlsl_lerp is called so we default to float
+// for expected type
+half builtin_lerp_half_scalar (half p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+float builtin_lerp_float_scalar ( float p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
index 0b6843591455b..97ce931bf1b5b 100644
--- a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
@@ -72,15 +72,15 @@ float2 test_builtin_mad_int_vect_to_float_vec_promotion(int2 p0, float p1) {
 
 float builtin_bool_to_float_type_promotion(float p0, bool p1) {
   return __builtin_hlsl_mad(p0, p0, p1);
-  // expected-error@-1 {{3rd argument must be a vector, integer or floating point type (was 'bool')}}
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
 }
 
 float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
   return __builtin_hlsl_mad(p1, p0, p1);
-  // expected-error@-1 {{2nd argument must be a vector, integer or floating point type (was 'bool')}}
+  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
 }
 
 float builtin_mad_int_to_float_promotion(float p0, int p1) {
   return __builtin_hlsl_mad(p0, p0, p1);
-  // expected-error@-1 {{arguments are of different types ('double' vs 'int')}}
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
 }
diff --git a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
index dc4501dbd6d15..fa6fd813f19e6 100644
--- a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
@@ -13,7 +13,7 @@ float2 test_too_many_arg(float2 p0) {
 
 float builtin_bool_to_float_type_promotion(bool p1) {
   return __builtin_hlsl_elementwise_rcp(p1);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+  // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}}
 }
 
 float builtin_rcp_int_to_float_promotion(int p1) {
diff --git a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
new file mode 100644
index 0000000000000..c027a698c5e58
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
@@ -0,0 +1,38 @@
+
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float test_too_few_arg() {
+  return __builtin_hlsl_elementwise_rsqrt();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_rsqrt(p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+float builtin_bool_to_float_type_promotion(bool p1) {
+  return __builtin_hlsl_elementwise_rsqrt(p1);
+  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
+}
+
+float builtin_rsqrt_int_to_float_promotion(int p1) {
+  return __builtin_hlsl_elementwise_rsqrt(p1);
+  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+}
+
+float2 builtin_rsqrt_int2_to_float2_promotion(int2 p1) {
+  return __builtin_hlsl_elementwise_rsqrt(p1);
+  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
+
+// builtins are variadic functions and so are subject to DefaultVariadicArgumentPromotion
+half builtin_rsqrt_half_scalar (half p0) {
+  return __builtin_hlsl_elementwise_rsqrt (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+float builtin_rsqrt_float_scalar ( float p0) {
+  return __builtin_hlsl_elementwise_rsqrt (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index bac209a28da91..b7ea0d003a52d 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1085,3 +1085,32 @@ template void Struct<void>::bar<>();
 template int Struct<void>::field<1, 2>;
 
 }
+
+namespace GH64808 {
+
+template <class T> struct basic_sender {
+  T func;
+  basic_sender(T) : func(T()) {}
+};
+
+auto a = basic_sender{[](auto... __captures) {
+  return []() // #note-a-1
+    requires((__captures, ...), false) // #note-a-2
+  {};
+}()};
+
+auto b = basic_sender{[](auto... __captures) {
+  return []()
+    requires([](int, double) { return true; }(decltype(__captures)()...))
+  {};
+}(1, 2.33)};
+
+void foo() {
+  a.func();
+  // expected-error@-1{{no matching function for call}}
+  // expected-note@#note-a-1{{constraints not satisfied}}
+  // expected-note@#note-a-2{{evaluated to false}}
+  b.func();
+}
+
+} // namespace GH64808
diff --git a/clang/tools/clang-format/.clang-format b/clang/tools/clang-format/.clang-format
index f95602cab0f7f..d7331b3c8cf02 100644
--- a/clang/tools/clang-format/.clang-format
+++ b/clang/tools/clang-format/.clang-format
@@ -1,6 +1 @@
-BasedOnStyle: LLVM
-InsertBraces: true
-InsertNewlineAtEOF: true
-LineEnding: LF
-RemoveBracesLLVM: true
-RemoveParentheses: ReturnStatement
+BasedOnStyle: clang-format
diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt
index e05f4eac3ad19..b90ffc847b155 100644
--- a/clang/tools/clang-installapi/CMakeLists.txt
+++ b/clang/tools/clang-installapi/CMakeLists.txt
@@ -2,13 +2,20 @@ set(LLVM_LINK_COMPONENTS
   Support
   TargetParser
   TextAPI
+  TextAPIBinaryReader
   Option
   )
 
+set(LLVM_TARGET_DEFINITIONS InstallAPIOpts.td)
+tablegen(LLVM InstallAPIOpts.inc -gen-opt-parser-defs)
+add_public_tablegen_target(InstallAPIDriverOptions)
+
 add_clang_tool(clang-installapi
   ClangInstallAPI.cpp
   Options.cpp
 
+  DEPENDS
+  InstallAPIDriverOptions
   GENERATE_DRIVER
   )
 
@@ -22,3 +29,4 @@ clang_target_link_libraries(clang-installapi
   clangTooling
   clangSerialization
   )
+
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
index 15b0baee88bc3..54e82d78d4d22 100644
--- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
+++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
@@ -14,12 +14,12 @@
 #include "Options.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticFrontend.h"
-#include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/InstallAPI/Frontend.h"
 #include "clang/InstallAPI/FrontendRecords.h"
+#include "clang/InstallAPI/InstallAPIDiagnostic.h"
 #include "clang/InstallAPI/MachO.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -92,22 +92,8 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
   IntrusiveRefCntPtr<clang::FileManager> FM(
       new FileManager(clang::FileSystemOptions(), OverlayFileSystem));
 
-  // Set up driver to parse input arguments.
-  auto DriverArgs = llvm::ArrayRef(Args).slice(1);
-  clang::driver::Driver Driver(ProgName, llvm::sys::getDefaultTargetTriple(),
-                               *Diag, "clang installapi tool");
-  auto TargetAndMode =
-      clang::driver::ToolChain::getTargetAndModeFromProgramName(ProgName);
-  Driver.setTargetAndMode(TargetAndMode);
-  bool HasError = false;
-  llvm::opt::InputArgList ArgList =
-      Driver.ParseArgStrings(DriverArgs, /*UseDriverMode=*/true, HasError);
-  if (HasError)
-    return EXIT_FAILURE;
-  Driver.setCheckInputsExist(false);
-
-  // Capture InstallAPI specific options and diagnose any option errors.
-  Options Opts(*Diag, FM.get(), ArgList);
+  // Capture all options and diagnose any errors.
+  Options Opts(*Diag, FM.get(), Args, ProgName);
   if (Diag->hasErrorOccurred())
     return EXIT_FAILURE;
 
@@ -125,19 +111,21 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
   // Execute and gather AST results.
   // An invocation is ran for each unique target triple and for each header
   // access level.
-  Records FrontendResults;
   for (const auto &[Targ, Trip] : Opts.DriverOpts.Targets) {
+    Ctx.Verifier->setTarget(Targ);
+    Ctx.Slice = std::make_shared<FrontendRecordsSlice>(Trip);
     for (const HeaderType Type :
          {HeaderType::Public, HeaderType::Private, HeaderType::Project}) {
-      Ctx.Slice = std::make_shared<FrontendRecordsSlice>(Trip);
       Ctx.Type = Type;
       if (!runFrontend(ProgName, Opts.DriverOpts.Verbose, Ctx,
                        InMemoryFileSystem.get(), Opts.getClangFrontendArgs()))
         return EXIT_FAILURE;
-      FrontendResults.emplace_back(std::move(Ctx.Slice));
     }
   }
 
+  if (Ctx.Verifier->getState() == DylibVerifier::Result::Invalid)
+    return EXIT_FAILURE;
+
   // After symbols have been collected, prepare to write output.
   auto Out = CI->createOutputFile(Ctx.OutputLoc, /*Binary=*/false,
                                   /*RemoveFileOnSignal=*/false,
@@ -147,13 +135,7 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
     return EXIT_FAILURE;
 
   // Assign attributes for serialization.
-  auto Symbols = std::make_unique<SymbolSet>();
-  for (const auto &FR : FrontendResults) {
-    SymbolConverter Converter(Symbols.get(), FR->getTarget());
-    FR->visit(Converter);
-  }
-
-  InterfaceFile IF(std::move(Symbols));
+  InterfaceFile IF(Ctx.Verifier->getExports());
   for (const auto &TargetInfo : Opts.DriverOpts.Targets) {
     IF.addTarget(TargetInfo.first);
     IF.setFromBinaryAttrs(Ctx.BA, TargetInfo.first);
@@ -161,7 +143,8 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
 
   // Write output file and perform CI cleanup.
   if (auto Err = TextAPIWriter::writeToStream(*Out, IF, Ctx.FT)) {
-    Diag->Report(diag::err_cannot_open_file) << Ctx.OutputLoc;
+    Diag->Report(diag::err_cannot_write_file)
+        << Ctx.OutputLoc << std::move(Err);
     CI->clearOutputFiles(/*EraseFiles=*/true);
     return EXIT_FAILURE;
   }
diff --git a/clang/tools/clang-installapi/InstallAPIOpts.td b/clang/tools/clang-installapi/InstallAPIOpts.td
new file mode 100644
index 0000000000000..87f4c3327e840
--- /dev/null
+++ b/clang/tools/clang-installapi/InstallAPIOpts.td
@@ -0,0 +1,31 @@
+//===--- InstallAPIOpts.td ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the specific options for InstallAPI.
+//
+//===----------------------------------------------------------------------===//
+
+// Include the common option parsing interfaces.
+include "llvm/Option/OptParser.td"
+
+
+/////////
+// Options
+
+// TextAPI options.
+def filetype : Joined<["--"], "filetype=">,
+  HelpText<"Specify the output file type (tbd-v4 or tbd-v5)">;
+
+// Verification options.
+def verify_against : Separate<["-"], "verify-against">,
+  HelpText<"Verify the specified dynamic library/framework against the headers">;
+def verify_against_EQ : Joined<["--"], "verify-against=">, Alias<verify_against>;
+def verify_mode_EQ : Joined<["--"], "verify-mode=">,
+  HelpText<"Specify the severity and extend of the validation. Valid modes are ErrorsOnly, ErrorsAndWarnings, and Pedantic.">;
+def demangle : Flag<["--", "-"], "demangle">,
+  HelpText<"Demangle symbols when printing warnings and errors">;
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index 701ab81c57c3d..b8696bb7896d8 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -10,35 +10,85 @@
 #include "clang/Driver/Driver.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/InstallAPI/FileList.h"
+#include "clang/InstallAPI/InstallAPIDiagnostic.h"
 #include "llvm/Support/Program.h"
 #include "llvm/TargetParser/Host.h"
+#include "llvm/TextAPI/DylibReader.h"
+#include "llvm/TextAPI/TextAPIWriter.h"
 
-using namespace clang::driver;
-using namespace clang::driver::options;
+using namespace llvm;
 using namespace llvm::opt;
 using namespace llvm::MachO;
 
+namespace drv = clang::driver::options;
+
 namespace clang {
 namespace installapi {
 
+/// Create prefix string literals used in InstallAPIOpts.td.
+#define PREFIX(NAME, VALUE)                                                    \
+  static constexpr llvm::StringLiteral NAME##_init[] = VALUE;                  \
+  static constexpr llvm::ArrayRef<llvm::StringLiteral> NAME(                   \
+      NAME##_init, std::size(NAME##_init) - 1);
+#include "InstallAPIOpts.inc"
+#undef PREFIX
+
+static constexpr const llvm::StringLiteral PrefixTable_init[] =
+#define PREFIX_UNION(VALUES) VALUES
+#include "InstallAPIOpts.inc"
+#undef PREFIX_UNION
+    ;
+static constexpr const ArrayRef<StringLiteral>
+    PrefixTable(PrefixTable_init, std::size(PrefixTable_init) - 1);
+
+/// Create table mapping all options defined in InstallAPIOpts.td.
+static constexpr OptTable::Info InfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS,         \
+               VISIBILITY, PARAM, HELPTEXT, METAVAR, VALUES)                   \
+  {PREFIX, NAME,  HELPTEXT,   METAVAR,     OPT_##ID,    Option::KIND##Class,   \
+   PARAM,  FLAGS, VISIBILITY, OPT_##GROUP, OPT_##ALIAS, ALIASARGS,             \
+   VALUES},
+#include "InstallAPIOpts.inc"
+#undef OPTION
+};
+
+namespace {
+
+/// \brief Create OptTable class for parsing actual command line arguments.
+class DriverOptTable : public opt::PrecomputedOptTable {
+public:
+  DriverOptTable() : PrecomputedOptTable(InfoTable, PrefixTable) {}
+};
+
+} // end anonymous namespace.
+
+static llvm::opt::OptTable *createDriverOptTable() {
+  return new DriverOptTable();
+}
+
 bool Options::processDriverOptions(InputArgList &Args) {
   // Handle inputs.
-  llvm::append_range(DriverOpts.FileLists, Args.getAllArgValues(OPT_INPUT));
+  llvm::append_range(DriverOpts.FileLists,
+                     Args.getAllArgValues(drv::OPT_INPUT));
 
   // Handle output.
   SmallString<PATH_MAX> OutputPath;
-  if (auto *Arg = Args.getLastArg(OPT_o)) {
+  if (auto *Arg = Args.getLastArg(drv::OPT_o)) {
     OutputPath = Arg->getValue();
     if (OutputPath != "-")
       FM->makeAbsolutePath(OutputPath);
     DriverOpts.OutputPath = std::string(OutputPath);
   }
+  if (DriverOpts.OutputPath.empty()) {
+    Diags->Report(diag::err_no_output_file);
+    return false;
+  }
 
   // Do basic error checking first for mixing -target and -arch options.
-  auto *ArgArch = Args.getLastArgNoClaim(OPT_arch);
-  auto *ArgTarget = Args.getLastArgNoClaim(OPT_target);
+  auto *ArgArch = Args.getLastArgNoClaim(drv::OPT_arch);
+  auto *ArgTarget = Args.getLastArgNoClaim(drv::OPT_target);
   auto *ArgTargetVariant =
-      Args.getLastArgNoClaim(OPT_darwin_target_variant_triple);
+      Args.getLastArgNoClaim(drv::OPT_darwin_target_variant_triple);
   if (ArgArch && (ArgTarget || ArgTargetVariant)) {
     Diags->Report(clang::diag::err_drv_argument_not_allowed_with)
         << ArgArch->getAsString(Args)
@@ -46,7 +96,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     return false;
   }
 
-  auto *ArgMinTargetOS = Args.getLastArgNoClaim(OPT_mtargetos_EQ);
+  auto *ArgMinTargetOS = Args.getLastArgNoClaim(drv::OPT_mtargetos_EQ);
   if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) {
     Diags->Report(clang::diag::err_drv_cannot_mix_options)
         << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args);
@@ -55,7 +105,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
 
   // Capture target triples first.
   if (ArgTarget) {
-    for (const Arg *A : Args.filtered(OPT_target)) {
+    for (const Arg *A : Args.filtered(drv::OPT_target)) {
       A->claim();
       llvm::Triple TargetTriple(A->getValue());
       Target TAPITarget = Target(TargetTriple);
@@ -69,27 +119,32 @@ bool Options::processDriverOptions(InputArgList &Args) {
     }
   }
 
-  DriverOpts.Verbose = Args.hasArgNoClaim(OPT_v);
+  DriverOpts.Verbose = Args.hasArgNoClaim(drv::OPT_v);
 
   return true;
 }
 
 bool Options::processLinkerOptions(InputArgList &Args) {
-  // TODO: add error handling.
-
-  // Required arguments.
-  if (const Arg *A = Args.getLastArg(options::OPT_install__name))
+  // Handle required arguments.
+  if (const Arg *A = Args.getLastArg(drv::OPT_install__name))
     LinkerOpts.InstallName = A->getValue();
+  if (LinkerOpts.InstallName.empty()) {
+    Diags->Report(diag::err_no_install_name);
+    return false;
+  }
 
   // Defaulted or optional arguments.
-  if (auto *Arg = Args.getLastArg(OPT_current__version))
+  if (auto *Arg = Args.getLastArg(drv::OPT_current__version))
     LinkerOpts.CurrentVersion.parse64(Arg->getValue());
 
-  LinkerOpts.IsDylib = Args.hasArg(OPT_dynamiclib);
+  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
+    LinkerOpts.CompatVersion.parse64(Arg->getValue());
+
+  LinkerOpts.IsDylib = Args.hasArg(drv::OPT_dynamiclib);
 
-  LinkerOpts.AppExtensionSafe =
-      Args.hasFlag(OPT_fapplication_extension, OPT_fno_application_extension,
-                   /*Default=*/LinkerOpts.AppExtensionSafe);
+  LinkerOpts.AppExtensionSafe = Args.hasFlag(
+      drv::OPT_fapplication_extension, drv::OPT_fno_application_extension,
+      /*Default=*/LinkerOpts.AppExtensionSafe);
 
   if (::getenv("LD_NO_ENCRYPT") != nullptr)
     LinkerOpts.AppExtensionSafe = true;
@@ -102,7 +157,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
 bool Options::processFrontendOptions(InputArgList &Args) {
   // Do not claim any arguments, as they will be passed along for CC1
   // invocations.
-  if (auto *A = Args.getLastArgNoClaim(OPT_x)) {
+  if (auto *A = Args.getLastArgNoClaim(drv::OPT_x)) {
     FEOpts.LangMode = llvm::StringSwitch<clang::Language>(A->getValue())
                           .Case("c", clang::Language::C)
                           .Case("c++", clang::Language::CXX)
@@ -116,8 +171,8 @@ bool Options::processFrontendOptions(InputArgList &Args) {
       return false;
     }
   }
-  for (auto *A : Args.filtered(OPT_ObjC, OPT_ObjCXX)) {
-    if (A->getOption().matches(OPT_ObjC))
+  for (auto *A : Args.filtered(drv::OPT_ObjC, drv::OPT_ObjCXX)) {
+    if (A->getOption().matches(drv::OPT_ObjC))
       FEOpts.LangMode = clang::Language::ObjC;
     else
       FEOpts.LangMode = clang::Language::ObjCXX;
@@ -126,9 +181,77 @@ bool Options::processFrontendOptions(InputArgList &Args) {
   return true;
 }
 
+std::vector<const char *>
+Options::processAndFilterOutInstallAPIOptions(ArrayRef<const char *> Args) {
+  std::unique_ptr<llvm::opt::OptTable> Table;
+  Table.reset(createDriverOptTable());
+
+  unsigned MissingArgIndex, MissingArgCount;
+  auto ParsedArgs = Table->ParseArgs(Args.slice(1), MissingArgIndex,
+                                     MissingArgCount, Visibility());
+
+  // Capture InstallAPI only driver options.
+  DriverOpts.Demangle = ParsedArgs.hasArg(OPT_demangle);
+
+  if (auto *A = ParsedArgs.getLastArg(OPT_filetype)) {
+    DriverOpts.OutFT = TextAPIWriter::parseFileType(A->getValue());
+    if (DriverOpts.OutFT == FileType::Invalid) {
+      Diags->Report(clang::diag::err_drv_invalid_value)
+          << A->getAsString(ParsedArgs) << A->getValue();
+      return {};
+    }
+  }
+
+  if (const Arg *A = ParsedArgs.getLastArg(OPT_verify_mode_EQ)) {
+    DriverOpts.VerifyMode =
+        StringSwitch<VerificationMode>(A->getValue())
+            .Case("ErrorsOnly", VerificationMode::ErrorsOnly)
+            .Case("ErrorsAndWarnings", VerificationMode::ErrorsAndWarnings)
+            .Case("Pedantic", VerificationMode::Pedantic)
+            .Default(VerificationMode::Invalid);
+
+    if (DriverOpts.VerifyMode == VerificationMode::Invalid) {
+      Diags->Report(clang::diag::err_drv_invalid_value)
+          << A->getAsString(ParsedArgs) << A->getValue();
+      return {};
+    }
+  }
+
+  if (const Arg *A = ParsedArgs.getLastArg(OPT_verify_against))
+    DriverOpts.DylibToVerify = A->getValue();
+
+  /// Any unclaimed arguments should be forwarded to the clang driver.
+  std::vector<const char *> ClangDriverArgs(ParsedArgs.size());
+  for (const Arg *A : ParsedArgs) {
+    if (A->isClaimed())
+      continue;
+    llvm::copy(A->getValues(), std::back_inserter(ClangDriverArgs));
+  }
+  return ClangDriverArgs;
+}
+
 Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
-                 InputArgList &ArgList)
+                 ArrayRef<const char *> Args, const StringRef ProgName)
     : Diags(&Diag), FM(FM) {
+
+  // First process InstallAPI specific options.
+  auto DriverArgs = processAndFilterOutInstallAPIOptions(Args);
+  if (Diags->hasErrorOccurred())
+    return;
+
+  // Set up driver to parse remaining input arguments.
+  clang::driver::Driver Driver(ProgName, llvm::sys::getDefaultTargetTriple(),
+                               *Diags, "clang installapi tool");
+  auto TargetAndMode =
+      clang::driver::ToolChain::getTargetAndModeFromProgramName(ProgName);
+  Driver.setTargetAndMode(TargetAndMode);
+  bool HasError = false;
+  llvm::opt::InputArgList ArgList =
+      Driver.ParseArgStrings(DriverArgs, /*UseDriverMode=*/true, HasError);
+  if (HasError)
+    return;
+  Driver.setCheckInputsExist(false);
+
   if (!processDriverOptions(ArgList))
     return;
 
@@ -138,15 +261,16 @@ Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
   if (!processFrontendOptions(ArgList))
     return;
 
+  /// Force cc1 options that should always be on.
+  FrontendArgs = {"-fsyntax-only", "-Wprivate-extern"};
+
   /// Any unclaimed arguments should be handled by invoking the clang frontend.
   for (const Arg *A : ArgList) {
     if (A->isClaimed())
       continue;
-
     FrontendArgs.emplace_back(A->getSpelling());
     llvm::copy(A->getValues(), std::back_inserter(FrontendArgs));
   }
-  FrontendArgs.push_back("-fsyntax-only");
 }
 
 InstallAPIContext Options::createContext() {
@@ -159,6 +283,7 @@ InstallAPIContext Options::createContext() {
 
   Ctx.BA.InstallName = LinkerOpts.InstallName;
   Ctx.BA.CurrentVersion = LinkerOpts.CurrentVersion;
+  Ctx.BA.CompatVersion = LinkerOpts.CompatVersion;
   Ctx.BA.AppExtensionSafe = LinkerOpts.AppExtensionSafe;
   Ctx.FT = DriverOpts.OutFT;
   Ctx.OutputLoc = DriverOpts.OutputPath;
@@ -168,16 +293,40 @@ InstallAPIContext Options::createContext() {
   for (const std::string &ListPath : DriverOpts.FileLists) {
     auto Buffer = FM->getBufferForFile(ListPath);
     if (auto Err = Buffer.getError()) {
-      Diags->Report(diag::err_cannot_open_file) << ListPath;
+      Diags->Report(diag::err_cannot_open_file) << ListPath << Err.message();
       return Ctx;
     }
     if (auto Err = FileListReader::loadHeaders(std::move(Buffer.get()),
                                                Ctx.InputHeaders)) {
-      Diags->Report(diag::err_cannot_open_file) << ListPath;
+      Diags->Report(diag::err_cannot_open_file) << ListPath << std::move(Err);
       return Ctx;
     }
   }
 
+  // Parse binary dylib and initialize verifier.
+  if (DriverOpts.DylibToVerify.empty()) {
+    Ctx.Verifier = std::make_unique<DylibVerifier>();
+    return Ctx;
+  }
+
+  auto Buffer = FM->getBufferForFile(DriverOpts.DylibToVerify);
+  if (auto Err = Buffer.getError()) {
+    Diags->Report(diag::err_cannot_open_file)
+        << DriverOpts.DylibToVerify << Err.message();
+    return Ctx;
+  }
+
+  DylibReader::ParseOption PO;
+  PO.Undefineds = false;
+  Expected<Records> Slices =
+      DylibReader::readFile((*Buffer)->getMemBufferRef(), PO);
+  if (auto Err = Slices.takeError()) {
+    Diags->Report(diag::err_cannot_open_file) << DriverOpts.DylibToVerify;
+    return Ctx;
+  }
+
+  Ctx.Verifier = std::make_unique<DylibVerifier>(
+      std::move(*Slices), Diags, DriverOpts.VerifyMode, DriverOpts.Demangle);
   return Ctx;
 }
 
diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h
index 06f79b62c531e..2beeafc86bb08 100644
--- a/clang/tools/clang-installapi/Options.h
+++ b/clang/tools/clang-installapi/Options.h
@@ -11,8 +11,10 @@
 
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/FileManager.h"
+#include "clang/Driver/Driver.h"
 #include "clang/Frontend/FrontendOptions.h"
 #include "clang/InstallAPI/Context.h"
+#include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/MachO.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/Option.h"
@@ -32,12 +34,21 @@ struct DriverOptions {
   /// \brief Mappings of target triples & tapi targets to build for.
   std::map<llvm::MachO::Target, llvm::Triple> Targets;
 
+  /// \brief Path to binary dylib for comparing.
+  std::string DylibToVerify;
+
   /// \brief Output path.
   std::string OutputPath;
 
   /// \brief File encoding to print.
   FileType OutFT = FileType::TBD_V5;
 
+  /// \brief Verification mode for comparing symbols.
+  VerificationMode VerifyMode = VerificationMode::Pedantic;
+
+  /// \brief Print demangled symbols when reporting errors.
+  bool Demangle = false;
+
   /// \brief Print verbose output.
   bool Verbose = false;
 };
@@ -49,6 +60,9 @@ struct LinkerOptions {
   /// \brief The current version to use for the dynamic library.
   PackedVersion CurrentVersion;
 
+  /// \brief The compatibility version to use for the dynamic library.
+  PackedVersion CompatVersion;
+
   /// \brief Is application extension safe.
   bool AppExtensionSafe = false;
 
@@ -66,6 +80,8 @@ class Options {
   bool processDriverOptions(llvm::opt::InputArgList &Args);
   bool processLinkerOptions(llvm::opt::InputArgList &Args);
   bool processFrontendOptions(llvm::opt::InputArgList &Args);
+  std::vector<const char *>
+  processAndFilterOutInstallAPIOptions(ArrayRef<const char *> Args);
 
 public:
   /// The various options grouped together.
@@ -80,7 +96,7 @@ class Options {
 
   /// \brief Constructor for options.
   Options(clang::DiagnosticsEngine &Diag, FileManager *FM,
-          llvm::opt::InputArgList &Args);
+          ArrayRef<const char *> Args, const StringRef ProgName);
 
   /// \brief Get CC1 arguments after extracting out the irrelevant
   /// ones.
@@ -92,6 +108,16 @@ class Options {
   std::vector<std::string> FrontendArgs;
 };
 
+enum ID {
+  OPT_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS,         \
+               VISIBILITY, PARAM, HELPTEXT, METAVAR, VALUES)                   \
+  OPT_##ID,
+#include "InstallAPIOpts.inc"
+  LastOption
+#undef OPTION
+};
+
 } // namespace installapi
 } // namespace clang
 #endif
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 722d59cf2846a..8aeb5086638e9 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -1130,6 +1130,7 @@ Expected<StringRef> linkDevice(ArrayRef<StringRef> InputFiles,
   case Triple::aarch64_be:
   case Triple::ppc64:
   case Triple::ppc64le:
+  case Triple::systemz:
     return generic::clang(InputFiles, Args);
   case Triple::spirv32:
   case Triple::spirv64:
diff --git a/clang/tools/diagtool/DiagnosticNames.cpp b/clang/tools/diagtool/DiagnosticNames.cpp
index 852b8c226e885..eb90f082437b3 100644
--- a/clang/tools/diagtool/DiagnosticNames.cpp
+++ b/clang/tools/diagtool/DiagnosticNames.cpp
@@ -42,6 +42,7 @@ static const DiagnosticRecord BuiltinDiagnosticsByID[] = {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
 #undef DIAG
 };
 
diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
index 376025e3605be..83b5bbb71f521 100644
--- a/clang/tools/driver/driver.cpp
+++ b/clang/tools/driver/driver.cpp
@@ -28,6 +28,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
@@ -41,7 +42,6 @@
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
-#include "llvm/Support/Regex.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/StringSaver.h"
 #include "llvm/Support/TargetSelect.h"
@@ -73,136 +73,8 @@ std::string GetExecutablePath(const char *Argv0, bool CanonicalPrefixes) {
   return llvm::sys::fs::getMainExecutable(Argv0, P);
 }
 
-static const char *GetStableCStr(std::set<std::string> &SavedStrings,
-                                 StringRef S) {
-  return SavedStrings.insert(std::string(S)).first->c_str();
-}
-
-/// ApplyOneQAOverride - Apply a list of edits to the input argument lists.
-///
-/// The input string is a space separated list of edits to perform,
-/// they are applied in order to the input argument lists. Edits
-/// should be one of the following forms:
-///
-///  '#': Silence information about the changes to the command line arguments.
-///
-///  '^': Add FOO as a new argument at the beginning of the command line.
-///
-///  '+': Add FOO as a new argument at the end of the command line.
-///
-///  's/XXX/YYY/': Substitute the regular expression XXX with YYY in the command
-///  line.
-///
-///  'xOPTION': Removes all instances of the literal argument OPTION.
-///
-///  'XOPTION': Removes all instances of the literal argument OPTION,
-///  and the following argument.
-///
-///  'Ox': Removes all flags matching 'O' or 'O[sz0-9]' and adds 'Ox'
-///  at the end of the command line.
-///
-/// \param OS - The stream to write edit information to.
-/// \param Args - The vector of command line arguments.
-/// \param Edit - The override command to perform.
-/// \param SavedStrings - Set to use for storing string representations.
-static void ApplyOneQAOverride(raw_ostream &OS,
-                               SmallVectorImpl<const char*> &Args,
-                               StringRef Edit,
-                               std::set<std::string> &SavedStrings) {
-  // This does not need to be efficient.
-
-  if (Edit[0] == '^') {
-    const char *Str =
-      GetStableCStr(SavedStrings, Edit.substr(1));
-    OS << "### Adding argument " << Str << " at beginning\n";
-    Args.insert(Args.begin() + 1, Str);
-  } else if (Edit[0] == '+') {
-    const char *Str =
-      GetStableCStr(SavedStrings, Edit.substr(1));
-    OS << "### Adding argument " << Str << " at end\n";
-    Args.push_back(Str);
-  } else if (Edit[0] == 's' && Edit[1] == '/' && Edit.ends_with("/") &&
-             Edit.slice(2, Edit.size() - 1).contains('/')) {
-    StringRef MatchPattern = Edit.substr(2).split('/').first;
-    StringRef ReplPattern = Edit.substr(2).split('/').second;
-    ReplPattern = ReplPattern.slice(0, ReplPattern.size()-1);
-
-    for (unsigned i = 1, e = Args.size(); i != e; ++i) {
-      // Ignore end-of-line response file markers
-      if (Args[i] == nullptr)
-        continue;
-      std::string Repl = llvm::Regex(MatchPattern).sub(ReplPattern, Args[i]);
-
-      if (Repl != Args[i]) {
-        OS << "### Replacing '" << Args[i] << "' with '" << Repl << "'\n";
-        Args[i] = GetStableCStr(SavedStrings, Repl);
-      }
-    }
-  } else if (Edit[0] == 'x' || Edit[0] == 'X') {
-    auto Option = Edit.substr(1);
-    for (unsigned i = 1; i < Args.size();) {
-      if (Option == Args[i]) {
-        OS << "### Deleting argument " << Args[i] << '\n';
-        Args.erase(Args.begin() + i);
-        if (Edit[0] == 'X') {
-          if (i < Args.size()) {
-            OS << "### Deleting argument " << Args[i] << '\n';
-            Args.erase(Args.begin() + i);
-          } else
-            OS << "### Invalid X edit, end of command line!\n";
-        }
-      } else
-        ++i;
-    }
-  } else if (Edit[0] == 'O') {
-    for (unsigned i = 1; i < Args.size();) {
-      const char *A = Args[i];
-      // Ignore end-of-line response file markers
-      if (A == nullptr)
-        continue;
-      if (A[0] == '-' && A[1] == 'O' &&
-          (A[2] == '\0' ||
-           (A[3] == '\0' && (A[2] == 's' || A[2] == 'z' ||
-                             ('0' <= A[2] && A[2] <= '9'))))) {
-        OS << "### Deleting argument " << Args[i] << '\n';
-        Args.erase(Args.begin() + i);
-      } else
-        ++i;
-    }
-    OS << "### Adding argument " << Edit << " at end\n";
-    Args.push_back(GetStableCStr(SavedStrings, '-' + Edit.str()));
-  } else {
-    OS << "### Unrecognized edit: " << Edit << "\n";
-  }
-}
-
-/// ApplyQAOverride - Apply a space separated list of edits to the
-/// input argument lists. See ApplyOneQAOverride.
-static void ApplyQAOverride(SmallVectorImpl<const char*> &Args,
-                            const char *OverrideStr,
-                            std::set<std::string> &SavedStrings) {
-  raw_ostream *OS = &llvm::errs();
-
-  if (OverrideStr[0] == '#') {
-    ++OverrideStr;
-    OS = &llvm::nulls();
-  }
-
-  *OS << "### CCC_OVERRIDE_OPTIONS: " << OverrideStr << "\n";
-
-  // This does not need to be efficient.
-
-  const char *S = OverrideStr;
-  while (*S) {
-    const char *End = ::strchr(S, ' ');
-    if (!End)
-      End = S + strlen(S);
-    if (End != S)
-      ApplyOneQAOverride(*OS, Args, std::string(S, End), SavedStrings);
-    S = End;
-    if (*S != '\0')
-      ++S;
-  }
+static const char *GetStableCStr(llvm::StringSet<> &SavedStrings, StringRef S) {
+  return SavedStrings.insert(S).first->getKeyData();
 }
 
 extern int cc1_main(ArrayRef<const char *> Argv, const char *Argv0,
@@ -215,7 +87,7 @@ extern int cc1gen_reproducer_main(ArrayRef<const char *> Argv,
 
 static void insertTargetAndModeArgs(const ParsedClangName &NameParts,
                                     SmallVectorImpl<const char *> &ArgVector,
-                                    std::set<std::string> &SavedStrings) {
+                                    llvm::StringSet<> &SavedStrings) {
   // Put target and mode arguments at the start of argument list so that
   // arguments specified in command line could override them. Avoid putting
   // them at index 0, as an option like '-cc1' must remain the first.
@@ -419,12 +291,13 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
     }
   }
 
-  std::set<std::string> SavedStrings;
+  llvm::StringSet<> SavedStrings;
   // Handle CCC_OVERRIDE_OPTIONS, used for editing a command line behind the
   // scenes.
   if (const char *OverrideStr = ::getenv("CCC_OVERRIDE_OPTIONS")) {
     // FIXME: Driver shouldn't take extra initial argument.
-    ApplyQAOverride(Args, OverrideStr, SavedStrings);
+    driver::applyOverrideOptions(Args, OverrideStr, SavedStrings,
+                                 &llvm::errs());
   }
 
   std::string Path = GetExecutablePath(ToolContext.Path, CanonicalPrefixes);
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 41f827f319450..b84ca28db7e58 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -1783,6 +1783,10 @@ bool CursorVisitor::VisitAttributedTypeLoc(AttributedTypeLoc TL) {
   return Visit(TL.getModifiedLoc());
 }
 
+bool CursorVisitor::VisitCountAttributedTypeLoc(CountAttributedTypeLoc TL) {
+  return Visit(TL.getInnerLoc());
+}
+
 bool CursorVisitor::VisitBTFTagAttributedTypeLoc(BTFTagAttributedTypeLoc TL) {
   return Visit(TL.getWrappedLoc());
 }
diff --git a/clang/unittests/AST/DeclPrinterTest.cpp b/clang/unittests/AST/DeclPrinterTest.cpp
index 0e09ab2a7bba8..07fa02bd96e25 100644
--- a/clang/unittests/AST/DeclPrinterTest.cpp
+++ b/clang/unittests/AST/DeclPrinterTest.cpp
@@ -358,6 +358,59 @@ TEST(DeclPrinter, TestCXXRecordDecl11) {
     "class A : virtual public Z, private Y {}"));
 }
 
+TEST(DeclPrinter, TestCXXRecordDecl12) {
+  ASSERT_TRUE(
+      PrintedDeclCXX98Matches("struct S { int x; };"
+                              "namespace NS { class C {};}"
+                              "void foo() {using namespace NS; C c;}",
+                              "foo",
+                              "void foo() {\nusing namespace NS;\nclass "
+                              "NS::C c;\n}\n",
+                              [](PrintingPolicy &Policy) {
+                                Policy.SuppressTagKeyword = false;
+                                Policy.SuppressScope = true;
+                                Policy.TerseOutput = false;
+                              }));
+}
+
+TEST(DeclPrinter, TestCXXRecordDecl13) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches(
+      "struct S { int x; };"
+      "S s1;"
+      "S foo() {return s1;}",
+      "foo", "struct S foo() {\nreturn s1;\n}\n", [](PrintingPolicy &Policy) {
+        Policy.SuppressTagKeyword = false;
+        Policy.SuppressScope = true;
+        Policy.TerseOutput = false;
+      }));
+}
+
+TEST(DeclPrinter, TestCXXRecordDecl14) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches(
+      "struct S { int x; };"
+      "S foo(S s1) {return s1;}",
+      "foo", "struct S foo(struct S s1) {\nreturn s1;\n}\n",
+      [](PrintingPolicy &Policy) {
+        Policy.SuppressTagKeyword = false;
+        Policy.SuppressScope = true;
+        Policy.TerseOutput = false;
+      }));
+}
+TEST(DeclPrinter, TestCXXRecordDecl15) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches(
+      "struct S { int x; };"
+      "namespace NS { class C {};}"
+      "S foo(S s1, NS::C c1) {using namespace NS; C c; return s1;}",
+      "foo",
+      "struct S foo(struct S s1, class NS::C c1) {\nusing namespace NS;\nclass "
+      "NS::C c;\nreturn s1;\n}\n",
+      [](PrintingPolicy &Policy) {
+        Policy.SuppressTagKeyword = false;
+        Policy.SuppressScope = true;
+        Policy.TerseOutput = false;
+      }));
+}
+
 TEST(DeclPrinter, TestFunctionDecl1) {
   ASSERT_TRUE(PrintedDeclCXX98Matches(
     "void A();",
@@ -1333,6 +1386,75 @@ TEST(DeclPrinter, TestTemplateArgumentList16) {
   ASSERT_TRUE(PrintedDeclCXX11Matches(Code, "NT2", "int NT2 = 5"));
 }
 
+TEST(DeclPrinter, TestCXXRecordDecl17) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches("template<typename T> struct Z {};"
+                                      "struct X {};"
+                                      "Z<X> A;",
+                                      "A", "Z<X> A"));
+  [](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = false; };
+}
+
+TEST(DeclPrinter, TestCXXRecordDecl18) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches("template<typename T> struct Z {};"
+                                      "struct X {};"
+                                      "Z<X> A;"
+                                      "template <typename T1, int>"
+                                      "struct Y{};"
+                                      "Y<Z<X>, 2> B;",
+                                      "B", "Y<Z<X>, 2> B"));
+  [](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = false; };
+}
+
+TEST(DeclPrinter, TestCXXRecordDecl19) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches("template<typename T> struct Z {};"
+                                      "struct X {};"
+                                      "Z<X> A;"
+                                      "template <typename T1, int>"
+                                      "struct Y{};"
+                                      "Y<Z<X>, 2> B;",
+                                      "B", "Y<Z<X>, 2> B"));
+  [](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = true; };
+}
+TEST(DeclPrinter, TestCXXRecordDecl20) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches(
+      "template <typename T, int N> class Inner;"
+      "template <typename T, int N>"
+      "class Inner{Inner(T val){}};"
+      "template <class InnerClass, int N> class Outer {"
+      "public:"
+      "struct NestedStruct {"
+      "int nestedValue;"
+      "NestedStruct(int val) : nestedValue(val) {}"
+      "};"
+      "InnerClass innerInstance;"
+      "Outer(const InnerClass &inner) : innerInstance(inner) {}"
+      "};"
+      "Outer<Inner<int, 10>, 5>::NestedStruct nestedInstance(100);",
+      "nestedInstance",
+      "Outer<Inner<int, 10>, 5>::NestedStruct nestedInstance(100)"));
+  [](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = false; };
+}
+
+TEST(DeclPrinter, TestCXXRecordDecl21) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches(
+      "template <typename T, int N> class Inner;"
+      "template <typename T, int N>"
+      "class Inner{Inner(T val){}};"
+      "template <class InnerClass, int N> class Outer {"
+      "public:"
+      "struct NestedStruct {"
+      "int nestedValue;"
+      "NestedStruct(int val) : nestedValue(val) {}"
+      "};"
+      "InnerClass innerInstance;"
+      "Outer(const InnerClass &inner) : innerInstance(inner) {}"
+      "};"
+      "Outer<Inner<int, 10>, 5>::NestedStruct nestedInstance(100);",
+      "nestedInstance",
+      "Outer<Inner<int, 10>, 5>::NestedStruct nestedInstance(100)"));
+  [](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = true; };
+}
+
 TEST(DeclPrinter, TestFunctionParamUglified) {
   llvm::StringLiteral Code = R"cpp(
     class __c;
diff --git a/clang/unittests/AST/TypePrinterTest.cpp b/clang/unittests/AST/TypePrinterTest.cpp
index f0a6eb7e9fd8c..494085a2ebca6 100644
--- a/clang/unittests/AST/TypePrinterTest.cpp
+++ b/clang/unittests/AST/TypePrinterTest.cpp
@@ -155,6 +155,22 @@ TEST(TypePrinter, TemplateIdWithNTTP) {
       }));
 }
 
+TEST(TypePrinter, TemplateArgumentsSubstitution) {
+  constexpr char Code[] = R"cpp(
+       template <typename Y> class X {};
+       typedef X<int> A;
+       int foo() {
+          return sizeof(A);
+       }
+  )cpp";
+  auto Matcher = typedefNameDecl(hasName("A"), hasType(qualType().bind("id")));
+  ASSERT_TRUE(PrintedTypeMatches(Code, {}, Matcher, "X<int>",
+                                 [](PrintingPolicy &Policy) {
+                                   Policy.SuppressTagKeyword = false;
+                                   Policy.SuppressScope = true;
+                                 }));
+}
+
 TEST(TypePrinter, TemplateArgumentsSubstitution_Expressions) {
   /// Tests clang::isSubstitutedDefaultArgument on TemplateArguments
   /// that are of kind TemplateArgument::Expression
diff --git a/clang/unittests/Analysis/FlowSensitive/DeterminismTest.cpp b/clang/unittests/Analysis/FlowSensitive/DeterminismTest.cpp
index 5ba26a0832036..e794bd4943f23 100644
--- a/clang/unittests/Analysis/FlowSensitive/DeterminismTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/DeterminismTest.cpp
@@ -8,7 +8,7 @@
 
 #include "TestingSupport.h"
 #include "clang/AST/Decl.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysis.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
@@ -34,14 +34,14 @@ std::string analyzeAndPrintExitCondition(llvm::StringRef Code) {
   const auto *Target =
       cast<FunctionDecl>(test::findValueDecl(AST.context(), "target"));
   Environment InitEnv(DACtx, *Target);
-  auto CFCtx = cantFail(ControlFlowContext::build(*Target));
+  auto ACFG = cantFail(AdornedCFG::build(*Target));
 
   NoopAnalysis Analysis(AST.context(), DataflowAnalysisOptions{});
 
-  auto Result = runDataflowAnalysis(CFCtx, Analysis, InitEnv);
+  auto Result = runDataflowAnalysis(ACFG, Analysis, InitEnv);
   EXPECT_FALSE(!Result) << Result.takeError();
 
-  Atom FinalFC = (*Result)[CFCtx.getCFG().getExit().getBlockID()]
+  Atom FinalFC = (*Result)[ACFG.getCFG().getExit().getBlockID()]
                      ->Env.getFlowConditionToken();
   std::string Textual;
   llvm::raw_string_ostream OS(Textual);
diff --git a/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp b/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp
index 57920c49a7d3d..88630119ba8a1 100644
--- a/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp
@@ -57,7 +57,7 @@ class TestLogger : public Logger {
 private:
   llvm::raw_string_ostream OS;
 
-  void beginAnalysis(const ControlFlowContext &,
+  void beginAnalysis(const AdornedCFG &,
                      TypeErasedDataflowAnalysis &) override {
     logText("beginAnalysis()");
   }
diff --git a/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp b/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp
index cd6a37d370e85..55baa4e2a5377 100644
--- a/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp
@@ -198,7 +198,7 @@ TEST(RecordOpsTest, RecordsEqual) {
       });
 }
 
-TEST(TransferTest, CopyRecordFromDerivedToBase) {
+TEST(TransferTest, CopyRecordBetweenDerivedAndBase) {
   std::string Code = R"(
     struct A {
       int i;
@@ -212,8 +212,23 @@ TEST(TransferTest, CopyRecordFromDerivedToBase) {
       // [[p]]
     }
   )";
+  auto SyntheticFieldCallback = [](QualType Ty) -> llvm::StringMap<QualType> {
+    CXXRecordDecl *ADecl = nullptr;
+    if (Ty.getAsString() == "A")
+      ADecl = Ty->getAsCXXRecordDecl();
+    else if (Ty.getAsString() == "B")
+      ADecl = Ty->getAsCXXRecordDecl()
+                  ->bases_begin()
+                  ->getType()
+                  ->getAsCXXRecordDecl();
+    else
+      return {};
+    QualType IntTy = getFieldNamed(ADecl, "i")->getType();
+    return {{"synth_int", IntTy}};
+  };
+  // Test copying derived to base class.
   runDataflow(
-      Code, /*SyntheticFieldCallback=*/{},
+      Code, SyntheticFieldCallback,
       [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
          ASTContext &ASTCtx) {
         Environment Env = getEnvironmentAtAnnotation(Results, "p").fork();
@@ -224,11 +239,38 @@ TEST(TransferTest, CopyRecordFromDerivedToBase) {
 
         EXPECT_NE(Env.getValue(*A.getChild(*IDecl)),
                   Env.getValue(*B.getChild(*IDecl)));
+        EXPECT_NE(Env.getValue(A.getSyntheticField("synth_int")),
+                  Env.getValue(B.getSyntheticField("synth_int")));
 
         copyRecord(B, A, Env);
 
         EXPECT_EQ(Env.getValue(*A.getChild(*IDecl)),
                   Env.getValue(*B.getChild(*IDecl)));
+        EXPECT_EQ(Env.getValue(A.getSyntheticField("synth_int")),
+                  Env.getValue(B.getSyntheticField("synth_int")));
+      });
+  // Test copying base to derived class.
+  runDataflow(
+      Code, SyntheticFieldCallback,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        Environment Env = getEnvironmentAtAnnotation(Results, "p").fork();
+
+        const ValueDecl *IDecl = findValueDecl(ASTCtx, "i");
+        auto &A = getLocForDecl<RecordStorageLocation>(ASTCtx, Env, "a");
+        auto &B = getLocForDecl<RecordStorageLocation>(ASTCtx, Env, "b");
+
+        EXPECT_NE(Env.getValue(*A.getChild(*IDecl)),
+                  Env.getValue(*B.getChild(*IDecl)));
+        EXPECT_NE(Env.getValue(A.getSyntheticField("synth_int")),
+                  Env.getValue(B.getSyntheticField("synth_int")));
+
+        copyRecord(A, B, Env);
+
+        EXPECT_EQ(Env.getValue(*A.getChild(*IDecl)),
+                  Env.getValue(*B.getChild(*IDecl)));
+        EXPECT_EQ(Env.getValue(A.getSyntheticField("synth_int")),
+                  Env.getValue(B.getSyntheticField("synth_int")));
       });
 }
 
diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
index b7cf6cc966edb..e3c7ff685f572 100644
--- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
+++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
@@ -28,7 +28,7 @@
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/ASTMatchers/ASTMatchersInternal.h"
 #include "clang/Analysis/CFG.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysis.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
@@ -100,7 +100,7 @@ struct AnalysisOutputs {
   const FunctionDecl *Target;
   /// Contains the control flow graph built from the body of the `Target`
   /// function and is analyzed.
-  const ControlFlowContext &CFCtx;
+  const AdornedCFG &ACFG;
   /// The analysis to be run.
   TypeErasedDataflowAnalysis &Analysis;
   /// Initial state to start the analysis.
@@ -261,9 +261,10 @@ checkDataflow(AnalysisInputs<AnalysisT> AI,
           llvm::errc::invalid_argument, "Could not find the target function.");
 
     // Build the control flow graph for the target function.
-    auto MaybeCFCtx = ControlFlowContext::build(*Target);
-    if (!MaybeCFCtx) return MaybeCFCtx.takeError();
-    auto &CFCtx = *MaybeCFCtx;
+    auto MaybeACFG = AdornedCFG::build(*Target);
+    if (!MaybeACFG)
+      return MaybeACFG.takeError();
+    auto &ACFG = *MaybeACFG;
 
     // Initialize states for running dataflow analysis.
     DataflowAnalysisContext DACtx(AI.SolverFactory(),
@@ -271,7 +272,7 @@ checkDataflow(AnalysisInputs<AnalysisT> AI,
     Environment InitEnv(DACtx, *Target);
     auto Analysis = AI.MakeAnalysis(Context, InitEnv);
 
-    AnalysisOutputs AO{AnnotatedCode, Context, Target, CFCtx,
+    AnalysisOutputs AO{AnnotatedCode, Context, Target, ACFG,
                        Analysis,      InitEnv, {}};
 
     // Additional test setup.
@@ -283,7 +284,7 @@ checkDataflow(AnalysisInputs<AnalysisT> AI,
     // the post-analysis states for the CFG blocks that have been evaluated.
     llvm::Expected<std::vector<std::optional<TypeErasedDataflowAnalysisState>>>
         MaybeBlockStates =
-            runTypeErasedDataflowAnalysis(CFCtx, Analysis, InitEnv,
+            runTypeErasedDataflowAnalysis(ACFG, Analysis, InitEnv,
                                           TypeErasedPostVisitCFG,
                                           MaxBlockVisitsInAnalysis);
     if (!MaybeBlockStates) return MaybeBlockStates.takeError();
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index a8c282f140b4c..a243535d38725 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -2313,8 +2313,6 @@ TEST(TransferTest, AssignmentOperator_ArgByValue) {
 }
 
 TEST(TransferTest, AssignmentOperatorFromBase) {
-  // This is a crash repro. We don't model the copy this case, so no
-  // expectations on the copied field of the base class are checked.
   std::string Code = R"(
     struct Base {
       int base;
@@ -2326,14 +2324,33 @@ TEST(TransferTest, AssignmentOperatorFromBase) {
     void target(Base B, Derived D) {
       D.base = 1;
       D.derived = 1;
+      // [[before]]
       D = B;
-      // [[p]]
+      // [[after]]
     }
   )";
   runDataflow(
       Code,
       [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
-         ASTContext &ASTCtx) {});
+         ASTContext &ASTCtx) {
+        const Environment &EnvBefore =
+            getEnvironmentAtAnnotation(Results, "before");
+        const Environment &EnvAfter =
+            getEnvironmentAtAnnotation(Results, "after");
+
+        auto &BLoc =
+            getLocForDecl<RecordStorageLocation>(ASTCtx, EnvBefore, "B");
+        auto &DLoc =
+            getLocForDecl<RecordStorageLocation>(ASTCtx, EnvBefore, "D");
+
+        EXPECT_NE(getFieldValue(&BLoc, "base", ASTCtx, EnvBefore),
+                  getFieldValue(&DLoc, "base", ASTCtx, EnvBefore));
+        EXPECT_EQ(getFieldValue(&BLoc, "base", ASTCtx, EnvAfter),
+                  getFieldValue(&DLoc, "base", ASTCtx, EnvAfter));
+
+        EXPECT_EQ(getFieldValue(&DLoc, "derived", ASTCtx, EnvBefore),
+                  getFieldValue(&DLoc, "derived", ASTCtx, EnvAfter));
+      });
 }
 
 TEST(TransferTest, AssignmentOperatorFromCallResult) {
@@ -2924,6 +2941,36 @@ TEST(TransferTest, ResultObjectLocation) {
       });
 }
 
+TEST(TransferTest, ResultObjectLocationForDefaultArgExpr) {
+  std::string Code = R"(
+    struct S {};
+    void funcWithDefaultArg(S s = S());
+    void target() {
+      funcWithDefaultArg();
+      // [[p]]
+    }
+  )";
+
+  using ast_matchers::cxxDefaultArgExpr;
+  using ast_matchers::match;
+  using ast_matchers::selectFirst;
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        auto *DefaultArg = selectFirst<CXXDefaultArgExpr>(
+            "default_arg",
+            match(cxxDefaultArgExpr().bind("default_arg"), ASTCtx));
+        ASSERT_NE(DefaultArg, nullptr);
+
+        // The values for default arguments aren't modeled; we merely verify
+        // that we can get a result object location for a default arg.
+        Env.getResultObjectLocation(*DefaultArg);
+      });
+}
+
 TEST(TransferTest, ResultObjectLocationForDefaultInitExpr) {
   std::string Code = R"(
     struct S {};
diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
index 9d05a0d6ca401..bea00ab1a1f06 100644
--- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
@@ -66,20 +66,20 @@ class DataflowAnalysisTest : public Test {
               AST->getASTContext()));
     assert(Func != nullptr);
 
-    CFCtx = std::make_unique<ControlFlowContext>(
-        llvm::cantFail(ControlFlowContext::build(*Func)));
+    ACFG =
+        std::make_unique<AdornedCFG>(llvm::cantFail(AdornedCFG::build(*Func)));
 
     AnalysisT Analysis = MakeAnalysis(AST->getASTContext());
     DACtx = std::make_unique<DataflowAnalysisContext>(
         std::make_unique<WatchedLiteralsSolver>());
     Environment Env(*DACtx, *Func);
 
-    return runDataflowAnalysis(*CFCtx, Analysis, Env);
+    return runDataflowAnalysis(*ACFG, Analysis, Env);
   }
 
   /// Returns the `CFGBlock` containing `S` (and asserts that it exists).
   const CFGBlock *blockForStmt(const Stmt &S) {
-    const CFGBlock *Block = CFCtx->getStmtToBlock().lookup(&S);
+    const CFGBlock *Block = ACFG->getStmtToBlock().lookup(&S);
     assert(Block != nullptr);
     return Block;
   }
@@ -105,7 +105,7 @@ class DataflowAnalysisTest : public Test {
   }
 
   std::unique_ptr<ASTUnit> AST;
-  std::unique_ptr<ControlFlowContext> CFCtx;
+  std::unique_ptr<AdornedCFG> ACFG;
   std::unique_ptr<DataflowAnalysisContext> DACtx;
 };
 
diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
index b6e4973fd7cb2..9430730004dbd 100644
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
@@ -3383,6 +3383,66 @@ TEST_P(UncheckedOptionalAccessTest, LambdaCaptureStateNotPropagated) {
     }
   )");
 }
+
+TEST_P(UncheckedOptionalAccessTest, ClassDerivedFromOptional) {
+  ExpectDiagnosticsFor(R"(
+    #include "unchecked_optional_access_test.h"
+
+    struct Derived : public $ns::$optional<int> {};
+
+    void target(Derived opt) {
+      *opt;  // [[unsafe]]
+      if (opt.has_value())
+        *opt;
+
+      // The same thing, but with a pointer receiver.
+      Derived *popt = &opt;
+      **popt;  // [[unsafe]]
+      if (popt->has_value())
+        **popt;
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, ClassTemplateDerivedFromOptional) {
+  ExpectDiagnosticsFor(R"(
+    #include "unchecked_optional_access_test.h"
+
+    template <class T>
+    struct Derived : public $ns::$optional<T> {};
+
+    void target(Derived<int> opt) {
+      *opt;  // [[unsafe]]
+      if (opt.has_value())
+        *opt;
+
+      // The same thing, but with a pointer receiver.
+      Derived<int> *popt = &opt;
+      **popt;  // [[unsafe]]
+      if (popt->has_value())
+        **popt;
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, ClassDerivedPrivatelyFromOptional) {
+  // Classes that derive privately from optional can themselves still call
+  // member functions of optional. Check that we model the optional correctly
+  // in this situation.
+  ExpectDiagnosticsFor(R"(
+    #include "unchecked_optional_access_test.h"
+
+    struct Derived : private $ns::$optional<int> {
+      void Method() {
+        **this;  // [[unsafe]]
+        if (this->has_value())
+          **this;
+      }
+    };
+  )",
+                       ast_matchers::hasName("Method"));
+}
+
 // FIXME: Add support for:
 // - constructors (copy, move)
 // - assignment operators (default, copy, move)
diff --git a/clang/unittests/Format/.clang-format b/clang/unittests/Format/.clang-format
index f95602cab0f7f..d7331b3c8cf02 100644
--- a/clang/unittests/Format/.clang-format
+++ b/clang/unittests/Format/.clang-format
@@ -1,6 +1 @@
-BasedOnStyle: LLVM
-InsertBraces: true
-InsertNewlineAtEOF: true
-LineEnding: LF
-RemoveBracesLLVM: true
-RemoveParentheses: ReturnStatement
+BasedOnStyle: clang-format
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index fc367a7a5a898..bea989c8c306d 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -3802,6 +3802,27 @@ TEST_F(FormatTest, FormatsEnum) {
                "  // Comment 2\n"
                "  TWO,\n"
                "};");
+  verifyFormat("enum [[clang::enum_extensibility(open)]] E {\n"
+               "  // Comment 1\n"
+               "  ONE,\n"
+               "  // Comment 2\n"
+               "  TWO\n"
+               "};");
+  verifyFormat("enum [[nodiscard]] [[clang::enum_extensibility(open)]] E {\n"
+               "  // Comment 1\n"
+               "  ONE,\n"
+               "  // Comment 2\n"
+               "  TWO\n"
+               "};");
+  verifyFormat("enum [[clang::enum_extensibility(open)]] E { // foo\n"
+               "  A,\n"
+               "  // bar\n"
+               "  B\n"
+               "};",
+               "enum [[clang::enum_extensibility(open)]] E{// foo\n"
+               "                                           A,\n"
+               "                                           // bar\n"
+               "                                           B};");
 
   // Not enums.
   verifyFormat("enum X f() {\n"
@@ -6938,6 +6959,10 @@ TEST_F(FormatTest, PutEmptyBlocksIntoOneLine) {
                "else if (b) { }\n"
                "else { }",
                Style);
+
+  Style = getLLVMStyle(FormatStyle::LK_CSharp);
+  Style.SpaceInEmptyBlock = true;
+  verifyFormat("Event += () => { };", Style);
 }
 
 TEST_F(FormatTest, FormatBeginBlockEndMacros) {
@@ -11450,6 +11475,11 @@ TEST_F(FormatTest, UnderstandsNewAndDelete) {
                "void new (link p);\n"
                "void delete (link p);");
 
+  verifyFormat("{ p->new(); }\n"
+               "{ p->delete(); }",
+               "{ p->new (); }\n"
+               "{ p->delete (); }");
+
   FormatStyle AfterPlacementOperator = getLLVMStyle();
   AfterPlacementOperator.SpaceBeforeParens = FormatStyle::SBPO_Custom;
   EXPECT_TRUE(
diff --git a/clang/unittests/Format/FormatTestTableGen.cpp b/clang/unittests/Format/FormatTestTableGen.cpp
index 76b871e2e1a52..c96866f0840f0 100644
--- a/clang/unittests/Format/FormatTestTableGen.cpp
+++ b/clang/unittests/Format/FormatTestTableGen.cpp
@@ -332,6 +332,85 @@ TEST_F(FormatTestTableGen, Assert) {
   verifyFormat("assert !le(DefVar1, 0), \"Assert1\";\n");
 }
 
+TEST_F(FormatTestTableGen, DAGArgBreakElements) {
+  FormatStyle Style = getGoogleStyle(FormatStyle::LK_TableGen);
+  Style.ColumnLimit = 60;
+  // By default, the DAGArg does not have a break inside.
+  ASSERT_EQ(Style.TableGenBreakInsideDAGArg, FormatStyle::DAS_DontBreak);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins a:$src1, aa:$src2, aaa:$src3)\n"
+               "}\n",
+               Style);
+  // This option forces to break inside the DAGArg.
+  Style.TableGenBreakInsideDAGArg = FormatStyle::DAS_BreakElements;
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins a:$src1,\n"
+               "                    aa:$src2,\n"
+               "                    aaa:$src3);\n"
+               "}\n",
+               Style);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (other a:$src1,\n"
+               "                      aa:$src2,\n"
+               "                      aaa:$src3);\n"
+               "}\n",
+               Style);
+  // Then, limit the DAGArg operator only to "ins".
+  Style.TableGenBreakingDAGArgOperators = {"ins"};
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins a:$src1,\n"
+               "                    aa:$src2,\n"
+               "                    aaa:$src3);\n"
+               "}\n",
+               Style);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (other a:$src1, aa:$src2, aaa:$src3)\n"
+               "}\n",
+               Style);
+}
+
+TEST_F(FormatTestTableGen, DAGArgBreakAll) {
+  FormatStyle Style = getGoogleStyle(FormatStyle::LK_TableGen);
+  Style.ColumnLimit = 60;
+  // By default, the DAGArg does not have a break inside.
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins a:$src1, aa:$src2, aaa:$src3)\n"
+               "}\n",
+               Style);
+  // This option forces to break inside the DAGArg.
+  Style.TableGenBreakInsideDAGArg = FormatStyle::DAS_BreakAll;
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins\n"
+               "      a:$src1,\n"
+               "      aa:$src2,\n"
+               "      aaa:$src3\n"
+               "  );\n"
+               "}\n",
+               Style);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (other\n"
+               "      a:$src1,\n"
+               "      aa:$src2,\n"
+               "      aaa:$src3\n"
+               "  );\n"
+               "}\n",
+               Style);
+  // Then, limit the DAGArg operator only to "ins".
+  Style.TableGenBreakingDAGArgOperators = {"ins"};
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins\n"
+               "      a:$src1,\n"
+               "      aa:$src2,\n"
+               "      aaa:$src3\n"
+               "  );\n"
+               "}\n",
+               Style);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (other a:$src1, aa:$src2, aaa:$src3);\n"
+               "}\n",
+               Style);
+}
+
 TEST_F(FormatTestTableGen, CondOperatorAlignment) {
   FormatStyle Style = getGoogleStyle(FormatStyle::LK_TableGen);
   Style.ColumnLimit = 60;
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index fcba0e63c7825..1aa855b341987 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2354,6 +2354,76 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
   EXPECT_TOKEN(Tokens[4], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::greater, TT_TemplateCloser);
   EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_FunctionLBrace);
+
+  // DAGArg breaking options. They use different token types depending on what
+  // is specified.
+  Style.TableGenBreakInsideDAGArg = FormatStyle::DAS_BreakElements;
+
+  // When TableGenBreakInsideDAGArg is DAS_BreakElements and
+  // TableGenBreakingDAGArgOperators is not specified, it makes all the DAGArg
+  // elements to have line break.
+  Tokens = AnnotateValue("(ins type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorID); // ins
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  Tokens = AnnotateValue("(other type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorID); // other
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  // For non-identifier operators, breaks after the operator.
+  Tokens = AnnotateValue("(!cast<Type>(\"Name\") type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 16u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[7], tok::r_paren, TT_TableGenDAGArgOperatorToBreak);
+  EXPECT_TOKEN(Tokens[11], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[15], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  Style.TableGenBreakInsideDAGArg = FormatStyle::DAS_BreakAll;
+
+  // When TableGenBreakInsideDAGArg is DAS_BreakAll and
+  // TableGenBreakingDAGArgOperators is not specified, it makes all the DAGArg
+  // to have line break inside it.
+  Tokens = AnnotateValue("(ins type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorToBreak); // ins
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  Tokens = AnnotateValue("(other type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorToBreak); // other
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  // If TableGenBreakingDAGArgOperators is specified, it is limited to the
+  // specified operators.
+  Style.TableGenBreakingDAGArgOperators = {"ins", "outs"};
+  Tokens = AnnotateValue("(ins type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorToBreak); // ins
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  Tokens = AnnotateValue("(other type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpener);
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown); // other
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListComma);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandConstructors) {
@@ -2632,6 +2702,11 @@ TEST_F(TokenAnnotatorTest, StartOfName) {
   EXPECT_TOKEN(Tokens[2], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[4], tok::identifier, TT_StartOfName);
+
+  Tokens = annotate("@interface NSCoder (TestCoder)");
+  ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::at, TT_ObjCDecl);
+  EXPECT_TOKEN(Tokens[2], tok::identifier, TT_StartOfName);
 }
 
 TEST_F(TokenAnnotatorTest, BraceKind) {
diff --git a/clang/unittests/Tooling/StandardLibraryTest.cpp b/clang/unittests/Tooling/StandardLibraryTest.cpp
index edca31649accf..e4c109f3d580d 100644
--- a/clang/unittests/Tooling/StandardLibraryTest.cpp
+++ b/clang/unittests/Tooling/StandardLibraryTest.cpp
@@ -185,6 +185,19 @@ TEST(StdlibTest, RecognizerForC99) {
             stdlib::Symbol::named("", "uint8_t", stdlib::Lang::C));
 }
 
+TEST(StdlibTest, SpecialCMappings) {
+  TestInputs Input("typedef char size_t;");
+  Input.Language = TestLanguage::Lang_C99;
+  TestAST AST(Input);
+
+  auto &SizeT = lookup(AST, "size_t");
+  stdlib::Recognizer Recognizer;
+  auto ActualSym = Recognizer(&SizeT);
+  assert(ActualSym);
+  EXPECT_EQ(ActualSym, stdlib::Symbol::named("", "size_t", stdlib::Lang::C));
+  EXPECT_EQ(ActualSym->header()->name(), "<stddef.h>");
+}
+
 } // namespace
 } // namespace tooling
 } // namespace clang
diff --git a/clang/www/get_started.html b/clang/www/get_started.html
index dda914a46904e..8e4d36640be73 100755
--- a/clang/www/get_started.html
+++ b/clang/www/get_started.html
@@ -86,13 +86,16 @@ <h3 id="buildNix">On Unix-like Systems</h3>
   </ul>
   </li>
 
-  <li>If you intend to use Clang's C++ support, you may need to tell it how
-      to find your C++ standard library headers. In general, Clang will detect
-      the best version of libstdc++ headers available and use them - it will
-      look both for system installations of libstdc++ as well as installations
-      adjacent to Clang itself. If your configuration fits neither of these
-      scenarios, you can use the <tt>-DGCC_INSTALL_PREFIX</tt> cmake option
-      to tell Clang where the gcc containing the desired libstdc++ is installed.
+  <li>On Linux, you may need GCC runtime libraries (e.g. <tt>crtbeginS.o,
+    libstdc++.so</tt>) and libstdc++ headers. In general, Clang will detect
+    well-known GCC installation paths matching the target triple (configured at
+    build time (see <tt>clang --version</tt>); overriden by
+    <tt>--target=</tt>) and use the largest version. If your configuration fits
+    none of the standard scenarios, you can set <tt>--gcc-install-dir=</tt> to
+    the GCC installation directory (something like
+    <tt>/usr/lib/gcc/$triple/$major</tt>). If your GCC installation is under
+    <tt>/usr/lib/gcc</tt> but uses a different triple, you can set
+    <tt>--gcc-triple=$triple</tt>.
   </li>
   <li>Try it out (assuming you add llvm/build/bin to your path):
   <ul>
diff --git a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake
index 2bf115973a49b..d5b2e7970f11c 100644
--- a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake
+++ b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake
@@ -65,8 +65,9 @@ function(clang_compile object_file source)
   cmake_parse_arguments(SOURCE "" "" "CFLAGS;DEPS" ${ARGN})
   get_filename_component(source_rpath ${source} REALPATH)
   if(NOT COMPILER_RT_STANDALONE_BUILD)
-    list(APPEND SOURCE_DEPS clang compiler-rt-headers)
+    list(APPEND SOURCE_DEPS clang)
   endif()
+  list(APPEND SOURCE_DEPS compiler-rt-headers)
   if (TARGET CompilerRTUnitTestCheckCxx)
     list(APPEND SOURCE_DEPS CompilerRTUnitTestCheckCxx)
   endif()
diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp
index 9d383b9d37c4a..da47317c0e576 100644
--- a/compiler-rt/lib/asan/asan_interceptors.cpp
+++ b/compiler-rt/lib/asan/asan_interceptors.cpp
@@ -570,6 +570,17 @@ INTERCEPTOR(char *, strcpy, char *to, const char *from) {
   return REAL(strcpy)(to, from);
 }
 
+// Windows doesn't always define the strdup identifier,
+// and when it does it's a macro defined to either _strdup
+// or _strdup_dbg, _strdup_dbg ends up calling _strdup, so
+// we want to intercept that. push/pop_macro are used to avoid problems
+// if this file ends up including <string.h> in the future.
+#  if SANITIZER_WINDOWS
+#    pragma push_macro("strdup")
+#    undef strdup
+#    define strdup _strdup
+#  endif
+
 INTERCEPTOR(char*, strdup, const char *s) {
   void *ctx;
   ASAN_INTERCEPTOR_ENTER(ctx, strdup);
@@ -587,7 +598,7 @@ INTERCEPTOR(char*, strdup, const char *s) {
   return reinterpret_cast<char*>(new_mem);
 }
 
-#if ASAN_INTERCEPT___STRDUP
+#  if ASAN_INTERCEPT___STRDUP
 INTERCEPTOR(char*, __strdup, const char *s) {
   void *ctx;
   ASAN_INTERCEPTOR_ENTER(ctx, strdup);
@@ -770,7 +781,7 @@ void InitializeAsanInterceptors() {
   ASAN_INTERCEPT_FUNC(strncat);
   ASAN_INTERCEPT_FUNC(strncpy);
   ASAN_INTERCEPT_FUNC(strdup);
-#if ASAN_INTERCEPT___STRDUP
+#  if ASAN_INTERCEPT___STRDUP
   ASAN_INTERCEPT_FUNC(__strdup);
 #endif
 #if ASAN_INTERCEPT_INDEX && ASAN_USE_ALIAS_ATTRIBUTE_FOR_INDEX
@@ -866,6 +877,10 @@ void InitializeAsanInterceptors() {
   VReport(1, "AddressSanitizer: libc interceptors initialized\n");
 }
 
+#  if SANITIZER_WINDOWS
+#    pragma pop_macro("strdup")
+#  endif
+
 } // namespace __asan
 
 #endif  // !SANITIZER_FUCHSIA
diff --git a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
index 0fa636bec0d00..35871a942a7a1 100644
--- a/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
+++ b/compiler-rt/lib/asan/asan_win_dll_thunk.cpp
@@ -80,7 +80,7 @@ INTERCEPT_LIBRARY_FUNCTION(strchr);
 INTERCEPT_LIBRARY_FUNCTION(strcmp);
 INTERCEPT_LIBRARY_FUNCTION(strcpy);
 INTERCEPT_LIBRARY_FUNCTION(strcspn);
-INTERCEPT_LIBRARY_FUNCTION(strdup);
+INTERCEPT_LIBRARY_FUNCTION(_strdup);
 INTERCEPT_LIBRARY_FUNCTION(strlen);
 INTERCEPT_LIBRARY_FUNCTION(strncat);
 INTERCEPT_LIBRARY_FUNCTION(strncmp);
diff --git a/compiler-rt/lib/asan/tests/CMakeLists.txt b/compiler-rt/lib/asan/tests/CMakeLists.txt
index 6ee2fb01c0df4..bda47bd7fd6a2 100644
--- a/compiler-rt/lib/asan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/asan/tests/CMakeLists.txt
@@ -172,7 +172,7 @@ function(add_asan_tests arch test_runtime)
   function(generate_asan_tests test_objects test_suite testname)
     generate_compiler_rt_tests(${test_objects} ${test_suite} ${testname} ${arch}
       COMPILE_DEPS ${ASAN_UNITTEST_HEADERS} ${ASAN_IGNORELIST_FILE}
-      DEPS llvm_gtest asan
+      DEPS asan
       KIND ${TEST_KIND}
       ${ARGN}
       )
diff --git a/compiler-rt/lib/fuzzer/tests/CMakeLists.txt b/compiler-rt/lib/fuzzer/tests/CMakeLists.txt
index dd82c492e83a4..8f5707c687ac5 100644
--- a/compiler-rt/lib/fuzzer/tests/CMakeLists.txt
+++ b/compiler-rt/lib/fuzzer/tests/CMakeLists.txt
@@ -74,7 +74,7 @@ if(COMPILER_RT_DEFAULT_TARGET_ARCH IN_LIST FUZZER_SUPPORTED_ARCH)
     FuzzerUnitTests "Fuzzer-${arch}-Test" ${arch}
     SOURCES FuzzerUnittest.cpp ${COMPILER_RT_GTEST_SOURCE}
     RUNTIME ${LIBFUZZER_TEST_RUNTIME}
-    DEPS llvm_gtest ${LIBFUZZER_TEST_RUNTIME_DEPS}
+    DEPS ${LIBFUZZER_TEST_RUNTIME_DEPS}
     CFLAGS ${LIBFUZZER_UNITTEST_CFLAGS} ${LIBFUZZER_TEST_RUNTIME_CFLAGS}
     LINK_FLAGS ${LIBFUZZER_UNITTEST_LINK_FLAGS} ${LIBFUZZER_TEST_RUNTIME_LINK_FLAGS})
   set_target_properties(FuzzerUnitTests PROPERTIES
@@ -84,7 +84,7 @@ if(COMPILER_RT_DEFAULT_TARGET_ARCH IN_LIST FUZZER_SUPPORTED_ARCH)
   generate_compiler_rt_tests(FuzzedDataProviderTestObjects
     FuzzedDataProviderUnitTests "FuzzerUtils-${arch}-Test" ${arch}
     SOURCES FuzzedDataProviderUnittest.cpp ${COMPILER_RT_GTEST_SOURCE}
-    DEPS llvm_gtest ${LIBFUZZER_TEST_RUNTIME_DEPS} ${COMPILER_RT_SOURCE_DIR}/include/fuzzer/FuzzedDataProvider.h
+    DEPS ${LIBFUZZER_TEST_RUNTIME_DEPS} ${COMPILER_RT_SOURCE_DIR}/include/fuzzer/FuzzedDataProvider.h
     CFLAGS ${LIBFUZZER_UNITTEST_CFLAGS} ${LIBFUZZER_TEST_RUNTIME_CFLAGS}
     LINK_FLAGS ${LIBFUZZER_UNITTEST_LINK_FLAGS} ${LIBFUZZER_TEST_RUNTIME_LINK_FLAGS})
   set_target_properties(FuzzedDataProviderUnitTests PROPERTIES
diff --git a/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt b/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt
index 4915c83d49ca9..2ec332ea74c13 100644
--- a/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt
@@ -74,7 +74,7 @@ if(COMPILER_RT_DEFAULT_TARGET_ARCH IN_LIST GWP_ASAN_SUPPORTED_ARCH)
     GwpAsanUnitTests "GwpAsan-${arch}-Test" ${arch}
     SOURCES ${GWP_ASAN_UNITTESTS} ${COMPILER_RT_GTEST_SOURCE}
     RUNTIME ${GWP_ASAN_TEST_RUNTIME}
-    DEPS llvm_gtest ${GWP_ASAN_UNIT_TEST_HEADERS}
+    DEPS ${GWP_ASAN_UNIT_TEST_HEADERS}
     CFLAGS ${GWP_ASAN_UNITTEST_CFLAGS}
     LINK_FLAGS ${GWP_ASAN_UNITTEST_LINK_FLAGS})
   set_target_properties(GwpAsanUnitTests PROPERTIES
diff --git a/compiler-rt/lib/hwasan/hwasan_checks.h b/compiler-rt/lib/hwasan/hwasan_checks.h
index 0911af30dcb8f..735d21a5db774 100644
--- a/compiler-rt/lib/hwasan/hwasan_checks.h
+++ b/compiler-rt/lib/hwasan/hwasan_checks.h
@@ -140,7 +140,6 @@ __attribute__((always_inline, nodebug)) static inline uptr ShortTagSize(
 
 __attribute__((always_inline, nodebug)) static inline bool
 PossiblyShortTagMatches(tag_t mem_tag, uptr ptr, uptr sz) {
-  DCHECK(IsAligned(ptr, kShadowAlignment));
   tag_t ptr_tag = GetTagFromPointer(ptr);
   if (ptr_tag == mem_tag)
     return true;
diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 1829358705fe8..a04175ba1e4b5 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -339,7 +339,7 @@ struct TrampolineMemoryRegion {
   uptr max_size;
 };
 
-UNUSED static const uptr kTrampolineScanLimitRange = 1 << 31;  // 2 gig
+UNUSED static const uptr kTrampolineScanLimitRange = 1ull << 31;  // 2 gig
 static const int kMaxTrampolineRegion = 1024;
 static TrampolineMemoryRegion TrampolineRegions[kMaxTrampolineRegion];
 
diff --git a/compiler-rt/lib/interception/tests/CMakeLists.txt b/compiler-rt/lib/interception/tests/CMakeLists.txt
index 644a57664cc49..0a235c662af3b 100644
--- a/compiler-rt/lib/interception/tests/CMakeLists.txt
+++ b/compiler-rt/lib/interception/tests/CMakeLists.txt
@@ -107,7 +107,6 @@ macro(add_interception_tests_for_arch arch)
     RUNTIME ${INTERCEPTION_COMMON_LIB}
     SOURCES ${INTERCEPTION_UNITTESTS} ${COMPILER_RT_GTEST_SOURCE}
     COMPILE_DEPS ${INTERCEPTION_TEST_HEADERS}
-    DEPS llvm_gtest
     CFLAGS ${INTERCEPTION_TEST_CFLAGS_COMMON}
     LINK_FLAGS ${INTERCEPTION_TEST_LINK_FLAGS_COMMON})
 endmacro()
diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp
index 3cdf10c149902..a2fc27de1901b 100644
--- a/compiler-rt/lib/msan/msan.cpp
+++ b/compiler-rt/lib/msan/msan.cpp
@@ -467,7 +467,7 @@ void __msan_init() {
   __msan_clear_on_return();
   if (__msan_get_track_origins())
     VPrintf(1, "msan_track_origins\n");
-  if (!InitShadow(__msan_get_track_origins())) {
+  if (!InitShadowWithReExec(__msan_get_track_origins())) {
     Printf("FATAL: MemorySanitizer can not mmap the shadow memory.\n");
     Printf("FATAL: Make sure to compile with -fPIE and to link with -pie.\n");
     Printf("FATAL: Disabling ASLR is known to cause this error.\n");
diff --git a/compiler-rt/lib/msan/msan.h b/compiler-rt/lib/msan/msan.h
index 710447a3e1a35..7fb58be67a02c 100644
--- a/compiler-rt/lib/msan/msan.h
+++ b/compiler-rt/lib/msan/msan.h
@@ -33,12 +33,18 @@ struct MappingDesc {
   uptr start;
   uptr end;
   enum Type {
-    INVALID, APP, SHADOW, ORIGIN
+    INVALID = 1,
+    ALLOCATOR = 2,
+    APP = 4,
+    SHADOW = 8,
+    ORIGIN = 16,
   } type;
   const char *name;
 };
 
-
+// Note: MappingDesc::ALLOCATOR entries are only used to check for memory
+// layout compatibility. The actual allocation settings are in
+// msan_allocator.cpp, which need to be kept in sync.
 #if SANITIZER_LINUX && defined(__mips64)
 
 // MIPS64 maps:
@@ -84,7 +90,8 @@ const MappingDesc kMemoryLayout[] = {
     {0X0B00000000000, 0X0C00000000000, MappingDesc::SHADOW, "shadow-10-13"},
     {0X0C00000000000, 0X0D00000000000, MappingDesc::INVALID, "invalid"},
     {0X0D00000000000, 0X0E00000000000, MappingDesc::ORIGIN, "origin-10-13"},
-    {0X0E00000000000, 0X1000000000000, MappingDesc::APP, "app-15"},
+    {0x0E00000000000, 0x0E40000000000, MappingDesc::ALLOCATOR, "allocator"},
+    {0X0E40000000000, 0X1000000000000, MappingDesc::APP, "app-15"},
 };
 # define MEM_TO_SHADOW(mem) ((uptr)mem ^ 0xB00000000000ULL)
 # define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x200000000000ULL)
@@ -106,7 +113,8 @@ const MappingDesc kMemoryLayout[] = {
     {0x510000000000ULL, 0x600000000000ULL, MappingDesc::APP, "app-2"},
     {0x600000000000ULL, 0x610000000000ULL, MappingDesc::ORIGIN, "origin-1"},
     {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"},
-    {0x700000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
+    {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
+    {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
 #  define MEM_TO_SHADOW(mem) (((uptr)(mem)) ^ 0x500000000000ULL)
 #  define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x100000000000ULL)
 
@@ -118,7 +126,8 @@ const MappingDesc kMemoryLayout[] = {
     {0x180200000000ULL, 0x1C0000000000ULL, MappingDesc::INVALID, "invalid"},
     {0x1C0000000000ULL, 0x2C0200000000ULL, MappingDesc::ORIGIN, "origin"},
     {0x2C0200000000ULL, 0x300000000000ULL, MappingDesc::INVALID, "invalid"},
-    {0x300000000000ULL, 0x800000000000ULL, MappingDesc::APP, "high memory"}};
+    {0x300000000000ULL, 0x320000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
+    {0x320000000000ULL, 0x800000000000ULL, MappingDesc::APP, "high memory"}};
 
 // Various kernels use different low end ranges but we can combine them into one
 // big range. They also use different high end ranges but we can map them all to
@@ -141,7 +150,8 @@ const MappingDesc kMemoryLayout[] = {
     {0x180000000000ULL, 0x1C0000000000ULL, MappingDesc::INVALID, "invalid"},
     {0x1C0000000000ULL, 0x2C0000000000ULL, MappingDesc::ORIGIN, "origin"},
     {0x2C0000000000ULL, 0x440000000000ULL, MappingDesc::INVALID, "invalid"},
-    {0x440000000000ULL, 0x500000000000ULL, MappingDesc::APP, "high memory"}};
+    {0x440000000000ULL, 0x460000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
+    {0x460000000000ULL, 0x500000000000ULL, MappingDesc::APP, "high memory"}};
 
 #define MEM_TO_SHADOW(mem) \
   ((((uptr)(mem)) & ~0xC00000000000ULL) + 0x080000000000ULL)
@@ -208,7 +218,8 @@ const MappingDesc kMemoryLayout[] = {
     {0x510000000000ULL, 0x600000000000ULL, MappingDesc::APP, "app-2"},
     {0x600000000000ULL, 0x610000000000ULL, MappingDesc::ORIGIN, "origin-1"},
     {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"},
-    {0x700000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
+    {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
+    {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
 #define MEM_TO_SHADOW(mem) (((uptr)(mem)) ^ 0x500000000000ULL)
 #define SHADOW_TO_ORIGIN(mem) (((uptr)(mem)) + 0x100000000000ULL)
 
@@ -223,20 +234,22 @@ const uptr kMemoryLayoutSize = sizeof(kMemoryLayout) / sizeof(kMemoryLayout[0]);
 #ifndef __clang__
 __attribute__((optimize("unroll-loops")))
 #endif
-inline bool addr_is_type(uptr addr, MappingDesc::Type mapping_type) {
+inline bool
+addr_is_type(uptr addr, int mapping_types) {
 // It is critical for performance that this loop is unrolled (because then it is
 // simplified into just a few constant comparisons).
 #ifdef __clang__
 #pragma unroll
 #endif
   for (unsigned i = 0; i < kMemoryLayoutSize; ++i)
-    if (kMemoryLayout[i].type == mapping_type &&
+    if ((kMemoryLayout[i].type & mapping_types) &&
         addr >= kMemoryLayout[i].start && addr < kMemoryLayout[i].end)
       return true;
   return false;
 }
 
-#define MEM_IS_APP(mem) addr_is_type((uptr)(mem), MappingDesc::APP)
+#define MEM_IS_APP(mem) \
+  (addr_is_type((uptr)(mem), MappingDesc::APP | MappingDesc::ALLOCATOR))
 #define MEM_IS_SHADOW(mem) addr_is_type((uptr)(mem), MappingDesc::SHADOW)
 #define MEM_IS_ORIGIN(mem) addr_is_type((uptr)(mem), MappingDesc::ORIGIN)
 
@@ -250,7 +263,7 @@ extern bool msan_init_is_running;
 extern int msan_report_count;
 
 bool ProtectRange(uptr beg, uptr end);
-bool InitShadow(bool init_origins);
+bool InitShadowWithReExec(bool init_origins);
 char *GetProcSelfMaps();
 void InitializeInterceptors();
 
diff --git a/compiler-rt/lib/msan/msan_allocator.cpp b/compiler-rt/lib/msan/msan_allocator.cpp
index 0b2dd2b2f1883..b1bc5b9390f75 100644
--- a/compiler-rt/lib/msan/msan_allocator.cpp
+++ b/compiler-rt/lib/msan/msan_allocator.cpp
@@ -48,6 +48,9 @@ struct MsanMapUnmapCallback {
   }
 };
 
+// Note: to ensure that the allocator is compatible with the application memory
+// layout (especially with high-entropy ASLR), kSpaceBeg and kSpaceSize must be
+// duplicated as MappingDesc::ALLOCATOR in msan.h.
 #if defined(__mips64)
 static const uptr kMaxAllowedMallocSize = 2UL << 30;
 
diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp
index c7ecb7cad5666..cd2d9f5c720c5 100644
--- a/compiler-rt/lib/msan/msan_linux.cpp
+++ b/compiler-rt/lib/msan/msan_linux.cpp
@@ -20,6 +20,9 @@
 #  include <signal.h>
 #  include <stdio.h>
 #  include <stdlib.h>
+#  if SANITIZER_LINUX
+#    include <sys/personality.h>
+#  endif
 #  include <sys/resource.h>
 #  include <sys/time.h>
 #  include <unistd.h>
@@ -43,11 +46,13 @@ void ReportMapRange(const char *descr, uptr beg, uptr size) {
   }
 }
 
-static bool CheckMemoryRangeAvailability(uptr beg, uptr size) {
+static bool CheckMemoryRangeAvailability(uptr beg, uptr size, bool verbose) {
   if (size > 0) {
     uptr end = beg + size - 1;
     if (!MemoryRangeIsAvailable(beg, end)) {
-      Printf("FATAL: Memory range 0x%zx - 0x%zx is not available.\n", beg, end);
+      if (verbose)
+        Printf("FATAL: Memory range 0x%zx - 0x%zx is not available.\n", beg,
+               end);
       return false;
     }
   }
@@ -86,7 +91,7 @@ static void CheckMemoryLayoutSanity() {
     CHECK(addr_is_type(start, type));
     CHECK(addr_is_type((start + end) / 2, type));
     CHECK(addr_is_type(end - 1, type));
-    if (type == MappingDesc::APP) {
+    if (type == MappingDesc::APP || type == MappingDesc::ALLOCATOR) {
       uptr addr = start;
       CHECK(MEM_IS_SHADOW(MEM_TO_SHADOW(addr)));
       CHECK(MEM_IS_ORIGIN(MEM_TO_ORIGIN(addr)));
@@ -106,7 +111,7 @@ static void CheckMemoryLayoutSanity() {
   }
 }
 
-bool InitShadow(bool init_origins) {
+static bool InitShadow(bool init_origins, bool dry_run) {
   // Let user know mapping parameters first.
   VPrintf(1, "__msan_init %p\n", reinterpret_cast<void *>(&__msan_init));
   for (unsigned i = 0; i < kMemoryLayoutSize; ++i)
@@ -116,8 +121,9 @@ bool InitShadow(bool init_origins) {
   CheckMemoryLayoutSanity();
 
   if (!MEM_IS_APP(&__msan_init)) {
-    Printf("FATAL: Code %p is out of application range. Non-PIE build?\n",
-           reinterpret_cast<void *>(&__msan_init));
+    if (!dry_run)
+      Printf("FATAL: Code %p is out of application range. Non-PIE build?\n",
+             reinterpret_cast<void *>(&__msan_init));
     return false;
   }
 
@@ -138,20 +144,26 @@ bool InitShadow(bool init_origins) {
     bool protect = type == MappingDesc::INVALID ||
                    (!init_origins && type == MappingDesc::ORIGIN);
     CHECK(!(map && protect));
-    if (!map && !protect)
-      CHECK(type == MappingDesc::APP);
+    if (!map && !protect) {
+      CHECK(type == MappingDesc::APP || type == MappingDesc::ALLOCATOR);
+
+      if (dry_run && type == MappingDesc::ALLOCATOR &&
+          !CheckMemoryRangeAvailability(start, size, !dry_run))
+        return false;
+    }
     if (map) {
-      if (!CheckMemoryRangeAvailability(start, size))
+      if (dry_run && !CheckMemoryRangeAvailability(start, size, !dry_run))
         return false;
-      if (!MmapFixedSuperNoReserve(start, size, kMemoryLayout[i].name))
+      if (!dry_run &&
+          !MmapFixedSuperNoReserve(start, size, kMemoryLayout[i].name))
         return false;
-      if (common_flags()->use_madv_dontdump)
+      if (!dry_run && common_flags()->use_madv_dontdump)
         DontDumpShadowMemory(start, size);
     }
     if (protect) {
-      if (!CheckMemoryRangeAvailability(start, size))
+      if (dry_run && !CheckMemoryRangeAvailability(start, size, !dry_run))
         return false;
-      if (!ProtectMemoryRange(start, size, kMemoryLayout[i].name))
+      if (!dry_run && !ProtectMemoryRange(start, size, kMemoryLayout[i].name))
         return false;
     }
   }
@@ -159,6 +171,35 @@ bool InitShadow(bool init_origins) {
   return true;
 }
 
+bool InitShadowWithReExec(bool init_origins) {
+  // Start with dry run: check layout is ok, but don't print warnings because
+  // warning messages will cause tests to fail (even if we successfully re-exec
+  // after the warning).
+  bool success = InitShadow(__msan_get_track_origins(), true);
+  if (!success) {
+#  if SANITIZER_LINUX
+    // Perhaps ASLR entropy is too high. If ASLR is enabled, re-exec without it.
+    int old_personality = personality(0xffffffff);
+    bool aslr_on =
+        (old_personality != -1) && ((old_personality & ADDR_NO_RANDOMIZE) == 0);
+
+    if (aslr_on) {
+      VReport(1,
+              "WARNING: MemorySanitizer: memory layout is incompatible, "
+              "possibly due to high-entropy ASLR.\n"
+              "Re-execing with fixed virtual address space.\n"
+              "N.B. reducing ASLR entropy is preferable.\n");
+      CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1);
+      ReExec();
+    }
+#  endif
+  }
+
+  // The earlier dry run didn't actually map or protect anything. Run again in
+  // non-dry run mode.
+  return success && InitShadow(__msan_get_track_origins(), false);
+}
+
 static void MsanAtExit(void) {
   if (flags()->print_stats && (flags()->atexit || msan_report_count > 0))
     ReportStats();
diff --git a/compiler-rt/lib/msan/tests/CMakeLists.txt b/compiler-rt/lib/msan/tests/CMakeLists.txt
index 412a0f6b3de7f..4f09f1e6a691f 100644
--- a/compiler-rt/lib/msan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/msan/tests/CMakeLists.txt
@@ -70,7 +70,7 @@ macro(msan_compile obj_list source arch kind cflags)
     ${obj_list} ${source} ${arch}
     KIND ${kind}
     COMPILE_DEPS ${MSAN_UNITTEST_HEADERS}
-    DEPS llvm_gtest msan
+    DEPS msan
     CFLAGS -isystem ${CMAKE_CURRENT_BINARY_DIR}/../libcxx_msan_${arch}/include/c++/v1
            ${MSAN_UNITTEST_INSTRUMENTED_CFLAGS} ${cflags}
   )
@@ -120,9 +120,7 @@ macro(add_msan_tests_for_arch arch kind cflags)
   set(MSAN_TEST_DEPS ${MSAN_TEST_OBJECTS} libcxx_msan_${arch}-build
                      ${MSAN_LOADABLE_SO}
 		     "${MSAN_LIBCXX_DIR}/libc++.a" "${MSAN_LIBCXX_DIR}/libc++abi.a")
-  if(NOT COMPILER_RT_STANDALONE_BUILD)
-    list(APPEND MSAN_TEST_DEPS msan)
-  endif()
+  list(APPEND MSAN_TEST_DEPS msan)
   get_target_flags_for_arch(${arch} TARGET_LINK_FLAGS)
   add_compiler_rt_test(MsanUnitTests "Msan-${arch}${kind}-Test" ${arch}
     OBJECTS ${MSAN_TEST_OBJECTS} "${MSAN_LIBCXX_DIR}/libc++.a" "${MSAN_LIBCXX_DIR}/libc++abi.a"
diff --git a/compiler-rt/lib/orc/tests/CMakeLists.txt b/compiler-rt/lib/orc/tests/CMakeLists.txt
index 2f1cb7657c281..e8f4c95b8a657 100644
--- a/compiler-rt/lib/orc/tests/CMakeLists.txt
+++ b/compiler-rt/lib/orc/tests/CMakeLists.txt
@@ -73,7 +73,7 @@ macro(add_orc_unittest testname)
           SOURCES ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}
           RUNTIME "${ORC_RUNTIME_LIBS}"
           COMPILE_DEPS ${TEST_HEADERS} ${ORC_HEADERS}
-          DEPS llvm_gtest ${ORC_DEPS}
+          DEPS ${ORC_DEPS}
           CFLAGS ${ORC_UNITTEST_CFLAGS} ${COMPILER_RT_GTEST_CFLAGS}
           LINK_FLAGS ${ORC_UNITTEST_LINK_FLAGS})
       endif()
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
index ef1fc35497439..ece2d7d63dd61 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
@@ -104,7 +104,27 @@ static void setlim(int res, rlim_t lim) {
 
 void DisableCoreDumperIfNecessary() {
   if (common_flags()->disable_coredump) {
-    setlim(RLIMIT_CORE, 0);
+    rlimit rlim;
+    CHECK_EQ(0, getrlimit(RLIMIT_CORE, &rlim));
+    // On Linux, if the kernel.core_pattern sysctl starts with a '|' (i.e. it
+    // is being piped to a coredump handler such as systemd-coredumpd), the
+    // kernel ignores RLIMIT_CORE (since we aren't creating a file in the file
+    // system) except for the magic value of 1, which disables coredumps when
+    // piping. 1 byte is too small for any kind of valid core dump, so it
+    // also disables coredumps if kernel.core_pattern creates files directly.
+    // While most piped coredump handlers do respect the crashing processes'
+    // RLIMIT_CORE, this is notable not the case for Debian's systemd-coredump
+    // due to a local patch that changes sysctl.d/50-coredump.conf to ignore
+    // the specified limit and instead use RLIM_INFINITY.
+    //
+    // The alternative to using RLIMIT_CORE=1 would be to use prctl() with the
+    // PR_SET_DUMPABLE flag, however that also prevents ptrace(), so makes it
+    // impossible to attach a debugger.
+    //
+    // Note: we use rlim_max in the Min() call here since that is the upper
+    // limit for what can be set without getting an EINVAL error.
+    rlim.rlim_cur = Min<rlim_t>(SANITIZER_LINUX ? 1 : 0, rlim.rlim_max);
+    CHECK_EQ(0, setrlimit(RLIMIT_CORE, &rlim));
   }
 }
 
diff --git a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
index 3c709e411e48b..a3efe68715082 100644
--- a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
+++ b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt
@@ -176,7 +176,6 @@ macro(add_sanitizer_tests_for_arch arch)
     RUNTIME "${SANITIZER_COMMON_LIB}"
     SOURCES ${SANITIZER_UNITTESTS} ${COMPILER_RT_GTEST_SOURCE} ${COMPILER_RT_GMOCK_SOURCE}
     COMPILE_DEPS ${SANITIZER_TEST_HEADERS}
-    DEPS llvm_gtest
     CFLAGS  ${SANITIZER_TEST_CFLAGS_COMMON} ${extra_flags}
     LINK_FLAGS ${SANITIZER_TEST_LINK_FLAGS_COMMON} ${TARGET_LINK_FLAGS} ${extra_flags})
 
diff --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt
index 60092005cc33b..6fb4e88de3155 100644
--- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt
@@ -58,6 +58,7 @@ endif()
 set(SCUDO_HEADERS
   allocator_common.h
   allocator_config.h
+  allocator_config_wrapper.h
   atomic_helpers.h
   bytemap.h
   checksum.h
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def
new file mode 100644
index 0000000000000..92f4e39872d4c
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.def
@@ -0,0 +1,124 @@
+//===-- allocator_config.def ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all the flags and types supported in Scudo. For optional
+// flags and types, only explicitly define them when interested (i.e., unused
+// optional flags or types can be skipped).
+
+#ifndef BASE_REQUIRED_TEMPLATE_TYPE
+#define BASE_REQUIRED_TEMPLATE_TYPE(...)
+#endif
+#ifndef BASE_OPTIONAL
+#define BASE_OPTIONAL(...)
+#endif
+#ifndef PRIMARY_REQUIRED_TYPE
+#define PRIMARY_REQUIRED_TYPE(...)
+#endif
+#ifndef PRIMARY_REQUIRED
+#define PRIMARY_REQUIRED(...)
+#endif
+#ifndef PRIMARY_OPTIONAL
+#define PRIMARY_OPTIONAL(...)
+#endif
+#ifndef PRIMARY_OPTIONAL_TYPE
+#define PRIMARY_OPTIONAL_TYPE(...)
+#endif
+#ifndef SECONDARY_REQUIRED_TEMPLATE_TYPE
+#define SECONDARY_REQUIRED_TEMPLATE_TYPE(...)
+#endif
+#ifndef SECONDARY_CACHE_OPTIONAL
+#define SECONDARY_CACHE_OPTIONAL(...)
+#endif
+
+// BASE_REQUIRED_TEMPLATE_TYPE(NAME)
+//
+// Thread-Specific Data Registry used, shared or exclusive.
+BASE_REQUIRED_TEMPLATE_TYPE(TSDRegistryT)
+
+// Defines the type of Primary allocator to use.
+BASE_REQUIRED_TEMPLATE_TYPE(PrimaryT)
+
+// Defines the type of Secondary allocator to use.
+BASE_REQUIRED_TEMPLATE_TYPE(SecondaryT)
+
+// BASE_OPTIONAL(TYPE, NAME, DEFAULT)
+//
+// Indicates possible support for Memory Tagging.
+BASE_OPTIONAL(const bool, MaySupportMemoryTagging, false)
+
+// PRIMARY_REQUIRED_TYPE(NAME)
+//
+// SizeClassMap to use with the Primary.
+PRIMARY_REQUIRED_TYPE(SizeClassMap)
+
+// Defines the type and scale of a compact pointer. A compact pointer can
+// be understood as the offset of a pointer within the region it belongs
+// to, in increments of a power-of-2 scale. See `CompactPtrScale` also.
+PRIMARY_REQUIRED_TYPE(CompactPtrT)
+
+// PRIMARY_REQUIRED(TYPE, NAME)
+//
+// The scale of a compact pointer. E.g., Ptr = Base + (CompactPtr << Scale).
+PRIMARY_REQUIRED(const uptr, CompactPtrScale)
+
+// Log2 of the size of a size class region, as used by the Primary.
+PRIMARY_REQUIRED(const uptr, RegionSizeLog)
+
+// Conceptually, a region will be divided into groups based on the address
+// range. Each allocation consumes blocks in the same group until exhaustion
+// then it pops out blocks in a new group. Therefore, `GroupSizeLog` is always
+// smaller or equal to `RegionSizeLog`. Note that `GroupSizeLog` needs to be
+// equal to `RegionSizeLog` for SizeClassAllocator32 because of certain
+// constraints.
+PRIMARY_REQUIRED(const uptr, GroupSizeLog)
+
+// Call map for user memory with at least this size. Only used with primary64.
+PRIMARY_REQUIRED(const uptr, MapSizeIncrement)
+
+// Defines the minimal & maximal release interval that can be set.
+PRIMARY_REQUIRED(const s32, MinReleaseToOsIntervalMs)
+PRIMARY_REQUIRED(const s32, MaxReleaseToOsIntervalMs)
+
+// PRIMARY_OPTIONAL(TYPE, NAME, DEFAULT)
+//
+// Indicates support for offsetting the start of a region by a random number of
+// pages. Only used with primary64.
+PRIMARY_OPTIONAL(const bool, EnableRandomOffset, false)
+
+// PRIMARY_OPTIONAL_TYPE(NAME, DEFAULT)
+//
+// Use condition variable to shorten the waiting time of refillment of
+// freelist. Note that this depends on the implementation of condition
+// variable on each platform and the performance may vary so that it does not
+// guarantee a performance benefit.
+PRIMARY_OPTIONAL_TYPE(ConditionVariableT, ConditionVariableDummy)
+
+// SECONDARY_REQUIRED_TEMPLATE_TYPE(NAME)
+//
+// Defines the type of Secondary Cache to use.
+SECONDARY_REQUIRED_TEMPLATE_TYPE(CacheT)
+
+// SECONDARY_CACHE_OPTIONAL(TYPE, NAME, DEFAULT)
+//
+// Defines the type of cache used by the Secondary. Some additional
+// configuration entries can be necessary depending on the Cache.
+SECONDARY_CACHE_OPTIONAL(const u32, EntriesArraySize, 0)
+SECONDARY_CACHE_OPTIONAL(const u32, QuarantineSize, 0)
+SECONDARY_CACHE_OPTIONAL(const u32, DefaultMaxEntriesCount, 0)
+SECONDARY_CACHE_OPTIONAL(const u32, DefaultMaxEntrySize, 0)
+SECONDARY_CACHE_OPTIONAL(const s32, MinReleaseToOsIntervalMs, INT32_MIN)
+SECONDARY_CACHE_OPTIONAL(const s32, MaxReleaseToOsIntervalMs, INT32_MAX)
+
+#undef SECONDARY_CACHE_OPTIONAL
+#undef SECONDARY_REQUIRED_TEMPLATE_TYPE
+#undef PRIMARY_OPTIONAL_TYPE
+#undef PRIMARY_OPTIONAL
+#undef PRIMARY_REQUIRED
+#undef PRIMARY_REQUIRED_TYPE
+#undef BASE_OPTIONAL
+#undef BASE_REQUIRED_TEMPLATE_TYPE
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h
index 3c6aa3acb0e45..1e0cf1015ba67 100644
--- a/compiler-rt/lib/scudo/standalone/allocator_config.h
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.h
@@ -38,80 +38,10 @@
 
 namespace scudo {
 
-// The combined allocator uses a structure as a template argument that
-// specifies the configuration options for the various subcomponents of the
-// allocator.
-//
-// struct ExampleConfig {
-//   // Indicates possible support for Memory Tagging.
-//   static const bool MaySupportMemoryTagging = false;
-//
-//   // Thread-Specific Data Registry used, shared or exclusive.
-//   template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 8U, 4U>;
-//
-//   struct Primary {
-//     // SizeClassMap to use with the Primary.
-//     using SizeClassMap = DefaultSizeClassMap;
-//
-//     // Log2 of the size of a size class region, as used by the Primary.
-//     static const uptr RegionSizeLog = 30U;
-//
-//     // Log2 of the size of block group, as used by the Primary. Each group
-//     // contains a range of memory addresses, blocks in the range will belong
-//     // to the same group. In general, single region may have 1 or 2MB group
-//     // size. Multiple regions will have the group size equal to the region
-//     // size because the region size is usually smaller than 1 MB.
-//     // Smaller value gives fine-grained control of memory usage but the
-//     // trade-off is that it may take longer time of deallocation.
-//     static const uptr GroupSizeLog = 20U;
-//
-//     // Defines the type and scale of a compact pointer. A compact pointer can
-//     // be understood as the offset of a pointer within the region it belongs
-//     // to, in increments of a power-of-2 scale.
-//     // eg: Ptr = Base + (CompactPtr << Scale).
-//     typedef u32 CompactPtrT;
-//     static const uptr CompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
-//
-//     // Indicates support for offsetting the start of a region by
-//     // a random number of pages. Only used with primary64.
-//     static const bool EnableRandomOffset = true;
-//
-//     // Call map for user memory with at least this size. Only used with
-//     // primary64.
-//     static const uptr MapSizeIncrement = 1UL << 18;
-//
-//     // Defines the minimal & maximal release interval that can be set.
-//     static const s32 MinReleaseToOsIntervalMs = INT32_MIN;
-//     static const s32 MaxReleaseToOsIntervalMs = INT32_MAX;
-//
-//     // Use condition variable to shorten the waiting time of refillment of
-//     // freelist. Note that this depends on the implementation of condition
-//     // variable on each platform and the performance may vary so that it
-//     // doesn't guarantee a performance benefit.
-//     // Note that both variables have to be defined to enable it.
-//     static const bool UseConditionVariable = true;
-//     using ConditionVariableT = ConditionVariableLinux;
-//   };
-//   // Defines the type of Primary allocator to use.
-//   template <typename Config> using PrimaryT = SizeClassAllocator64<Config>;
-//
-//   // Defines the type of cache used by the Secondary. Some additional
-//   // configuration entries can be necessary depending on the Cache.
-//   struct Secondary {
-//     struct Cache {
-//       static const u32 EntriesArraySize = 32U;
-//       static const u32 QuarantineSize = 0U;
-//       static const u32 DefaultMaxEntriesCount = 32U;
-//       static const uptr DefaultMaxEntrySize = 1UL << 19;
-//       static const s32 MinReleaseToOsIntervalMs = INT32_MIN;
-//       static const s32 MaxReleaseToOsIntervalMs = INT32_MAX;
-//     };
-//     // Defines the type of Secondary Cache to use.
-//     template <typename Config> using CacheT = MapAllocatorCache<Config>;
-//   };
-//   // Defines the type of Secondary allocator to use.
-//   template <typename Config> using SecondaryT = MapAllocator<Config>;
-// };
+// Scudo uses a structure as a template argument that specifies the
+// configuration options for the various subcomponents of the allocator. See the
+// following configs as examples and check `allocator_config.def` for all the
+// available options.
 
 #ifndef SCUDO_USE_CUSTOM_CONFIG
 
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config_wrapper.h b/compiler-rt/lib/scudo/standalone/allocator_config_wrapper.h
new file mode 100644
index 0000000000000..a51d770b46646
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/allocator_config_wrapper.h
@@ -0,0 +1,135 @@
+//===-- allocator_config_wrapper.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_ALLOCATOR_CONFIG_WRAPPER_H_
+#define SCUDO_ALLOCATOR_CONFIG_WRAPPER_H_
+
+#include "condition_variable.h"
+#include "internal_defs.h"
+#include "secondary.h"
+
+namespace {
+
+template <typename T> struct removeConst {
+  using type = T;
+};
+template <typename T> struct removeConst<const T> {
+  using type = T;
+};
+
+// This is only used for SFINAE when detecting if a type is defined.
+template <typename T> struct voidAdaptor {
+  using type = void;
+};
+
+} // namespace
+
+namespace scudo {
+
+#define OPTIONAL_TEMPLATE(TYPE, NAME, DEFAULT, MEMBER)                         \
+  template <typename Config, typename = TYPE> struct NAME##State {             \
+    static constexpr removeConst<TYPE>::type getValue() { return DEFAULT; }    \
+  };                                                                           \
+  template <typename Config>                                                   \
+  struct NAME##State<Config, decltype(Config::MEMBER)> {                       \
+    static constexpr removeConst<TYPE>::type getValue() {                      \
+      return Config::MEMBER;                                                   \
+    }                                                                          \
+  };
+
+#define OPTIONAL_TYPE_TEMPLATE(NAME, DEFAULT, MEMBER)                          \
+  template <typename Config, typename Void = void> struct NAME##Type {         \
+    static constexpr bool enabled() { return false; }                          \
+    using NAME = DEFAULT;                                                      \
+  };                                                                           \
+  template <typename Config>                                                   \
+  struct NAME##Type<Config,                                                    \
+                    typename voidAdaptor<typename Config::MEMBER>::type> {     \
+    static constexpr bool enabled() { return true; }                           \
+    using NAME = typename Config::MEMBER;                                      \
+  };
+
+template <typename AllocatorConfig> struct BaseConfig {
+#define BASE_REQUIRED_TEMPLATE_TYPE(NAME)                                      \
+  template <typename T> using NAME = typename AllocatorConfig::template NAME<T>;
+
+#define BASE_OPTIONAL(TYPE, NAME, DEFAULT)                                     \
+  OPTIONAL_TEMPLATE(TYPE, NAME, DEFAULT, NAME)                                 \
+  static constexpr removeConst<TYPE>::type get##NAME() {                       \
+    return NAME##State<AllocatorConfig>::getValue();                           \
+  }
+
+#include "allocator_config.def"
+}; // BaseConfig
+
+template <typename AllocatorConfig> struct PrimaryConfig {
+  // TODO: Pass this flag through template argument to remove this hard-coded
+  //       function.
+  static constexpr bool getMaySupportMemoryTagging() {
+    return BaseConfig<AllocatorConfig>::getMaySupportMemoryTagging();
+  }
+
+#define PRIMARY_REQUIRED_TYPE(NAME)                                            \
+  using NAME = typename AllocatorConfig::Primary::NAME;
+
+#define PRIMARY_REQUIRED(TYPE, NAME)                                           \
+  static constexpr removeConst<TYPE>::type get##NAME() {                       \
+    return AllocatorConfig::Primary::NAME;                                     \
+  }
+
+#define PRIMARY_OPTIONAL(TYPE, NAME, DEFAULT)                                  \
+  OPTIONAL_TEMPLATE(TYPE, NAME, DEFAULT, NAME)                                 \
+  static constexpr removeConst<TYPE>::type get##NAME() {                       \
+    return NAME##State<typename AllocatorConfig::Primary>::getValue();         \
+  }
+
+#define PRIMARY_OPTIONAL_TYPE(NAME, DEFAULT)                                   \
+  OPTIONAL_TYPE_TEMPLATE(NAME, DEFAULT, NAME)                                  \
+  static constexpr bool has##NAME() {                                          \
+    return NAME##Type<typename AllocatorConfig::Primary>::enabled();           \
+  }                                                                            \
+  using NAME = typename NAME##Type<typename AllocatorConfig::Primary>::NAME;
+
+#include "allocator_config.def"
+
+}; // PrimaryConfig
+
+template <typename AllocatorConfig> struct SecondaryConfig {
+  // TODO: Pass this flag through template argument to remove this hard-coded
+  //       function.
+  static constexpr bool getMaySupportMemoryTagging() {
+    return BaseConfig<AllocatorConfig>::getMaySupportMemoryTagging();
+  }
+
+#define SECONDARY_REQUIRED_TEMPLATE_TYPE(NAME)                                 \
+  template <typename T>                                                        \
+  using NAME = typename AllocatorConfig::Secondary::template NAME<T>;
+#include "allocator_config.def"
+
+  struct CacheConfig {
+    // TODO: Pass this flag through template argument to remove this hard-coded
+    //       function.
+    static constexpr bool getMaySupportMemoryTagging() {
+      return BaseConfig<AllocatorConfig>::getMaySupportMemoryTagging();
+    }
+
+#define SECONDARY_CACHE_OPTIONAL(TYPE, NAME, DEFAULT)                          \
+  OPTIONAL_TEMPLATE(TYPE, NAME, DEFAULT, Cache::NAME)                          \
+  static constexpr removeConst<TYPE>::type get##NAME() {                       \
+    return NAME##State<typename AllocatorConfig::Secondary>::getValue();       \
+  }
+#include "allocator_config.def"
+  }; // CacheConfig
+};   // SecondaryConfig
+
+#undef OPTIONAL_TEMPLATE
+#undef OPTIONAL_TEMPLATE_TYPE
+
+} // namespace scudo
+
+#endif // SCUDO_ALLOCATOR_CONFIG_WRAPPER_H_
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 9e1fd6d6dca3c..f4dd90aac6655 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -9,6 +9,7 @@
 #ifndef SCUDO_COMBINED_H_
 #define SCUDO_COMBINED_H_
 
+#include "allocator_config_wrapper.h"
 #include "atomic_helpers.h"
 #include "chunk.h"
 #include "common.h"
@@ -47,11 +48,14 @@ namespace scudo {
 template <class Config, void (*PostInitCallback)(void) = EmptyCallback>
 class Allocator {
 public:
-  using PrimaryT = typename Config::template PrimaryT<Config>;
-  using SecondaryT = typename Config::template SecondaryT<Config>;
+  using AllocatorConfig = BaseConfig<Config>;
+  using PrimaryT =
+      typename AllocatorConfig::template PrimaryT<PrimaryConfig<Config>>;
+  using SecondaryT =
+      typename AllocatorConfig::template SecondaryT<SecondaryConfig<Config>>;
   using CacheT = typename PrimaryT::CacheT;
   typedef Allocator<Config, PostInitCallback> ThisT;
-  typedef typename Config::template TSDRegistryT<ThisT> TSDRegistryT;
+  typedef typename AllocatorConfig::template TSDRegistryT<ThisT> TSDRegistryT;
 
   void callPostInitCallback() {
     pthread_once(&PostInitNonce, PostInitCallback);
@@ -72,7 +76,7 @@ class Allocator {
       Header.State = Chunk::State::Available;
       Chunk::storeHeader(Allocator.Cookie, Ptr, &Header);
 
-      if (allocatorSupportsMemoryTagging<Config>())
+      if (allocatorSupportsMemoryTagging<AllocatorConfig>())
         Ptr = untagPointer(Ptr);
       void *BlockBegin = Allocator::getBlockBegin(Ptr, &Header);
       Cache.deallocate(Header.ClassId, BlockBegin);
@@ -99,7 +103,8 @@ class Allocator {
 
       // Reset tag to 0 as this chunk may have been previously used for a tagged
       // user allocation.
-      if (UNLIKELY(useMemoryTagging<Config>(Allocator.Primary.Options.load())))
+      if (UNLIKELY(useMemoryTagging<AllocatorConfig>(
+              Allocator.Primary.Options.load())))
         storeTags(reinterpret_cast<uptr>(Ptr),
                   reinterpret_cast<uptr>(Ptr) + sizeof(QuarantineBatch));
 
@@ -159,7 +164,7 @@ class Allocator {
       Primary.Options.set(OptionBit::DeallocTypeMismatch);
     if (getFlags()->delete_size_mismatch)
       Primary.Options.set(OptionBit::DeleteSizeMismatch);
-    if (allocatorSupportsMemoryTagging<Config>() &&
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>() &&
         systemSupportsMemoryTagging())
       Primary.Options.set(OptionBit::UseMemoryTagging);
 
@@ -274,7 +279,7 @@ class Allocator {
   void drainCaches() { TSDRegistry.drainCaches(this); }
 
   ALWAYS_INLINE void *getHeaderTaggedPointer(void *Ptr) {
-    if (!allocatorSupportsMemoryTagging<Config>())
+    if (!allocatorSupportsMemoryTagging<AllocatorConfig>())
       return Ptr;
     auto UntaggedPtr = untagPointer(Ptr);
     if (UntaggedPtr != Ptr)
@@ -286,7 +291,7 @@ class Allocator {
   }
 
   ALWAYS_INLINE uptr addHeaderTag(uptr Ptr) {
-    if (!allocatorSupportsMemoryTagging<Config>())
+    if (!allocatorSupportsMemoryTagging<AllocatorConfig>())
       return Ptr;
     return addFixedTag(Ptr, 2);
   }
@@ -419,7 +424,7 @@ class Allocator {
       //
       // When memory tagging is enabled, zeroing the contents is done as part of
       // setting the tag.
-      if (UNLIKELY(useMemoryTagging<Config>(Options))) {
+      if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
         uptr PrevUserPtr;
         Chunk::UnpackedHeader Header;
         const uptr BlockSize = PrimaryT::getSizeByClassId(ClassId);
@@ -501,7 +506,7 @@ class Allocator {
     } else {
       Block = addHeaderTag(Block);
       Ptr = addHeaderTag(Ptr);
-      if (UNLIKELY(useMemoryTagging<Config>(Options))) {
+      if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
         storeTags(reinterpret_cast<uptr>(Block), reinterpret_cast<uptr>(Ptr));
         storeSecondaryAllocationStackMaybe(Options, Ptr, Size);
       }
@@ -661,7 +666,7 @@ class Allocator {
                            (reinterpret_cast<uptr>(OldTaggedPtr) + NewSize)) &
             Chunk::SizeOrUnusedBytesMask;
         Chunk::storeHeader(Cookie, OldPtr, &Header);
-        if (UNLIKELY(useMemoryTagging<Config>(Options))) {
+        if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
           if (ClassId) {
             resizeTaggedChunk(reinterpret_cast<uptr>(OldTaggedPtr) + OldSize,
                               reinterpret_cast<uptr>(OldTaggedPtr) + NewSize,
@@ -764,8 +769,9 @@ class Allocator {
       Base = untagPointer(Base);
     const uptr From = Base;
     const uptr To = Base + Size;
-    bool MayHaveTaggedPrimary = allocatorSupportsMemoryTagging<Config>() &&
-                                systemSupportsMemoryTagging();
+    bool MayHaveTaggedPrimary =
+        allocatorSupportsMemoryTagging<AllocatorConfig>() &&
+        systemSupportsMemoryTagging();
     auto Lambda = [this, From, To, MayHaveTaggedPrimary, Callback,
                    Arg](uptr Block) {
       if (Block < From || Block >= To)
@@ -786,9 +792,9 @@ class Allocator {
       }
       if (Header.State == Chunk::State::Allocated) {
         uptr TaggedChunk = Chunk;
-        if (allocatorSupportsMemoryTagging<Config>())
+        if (allocatorSupportsMemoryTagging<AllocatorConfig>())
           TaggedChunk = untagPointer(TaggedChunk);
-        if (useMemoryTagging<Config>(Primary.Options.load()))
+        if (useMemoryTagging<AllocatorConfig>(Primary.Options.load()))
           TaggedChunk = loadTag(Chunk);
         Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header),
                  Arg);
@@ -887,7 +893,7 @@ class Allocator {
   }
 
   bool useMemoryTaggingTestOnly() const {
-    return useMemoryTagging<Config>(Primary.Options.load());
+    return useMemoryTagging<AllocatorConfig>(Primary.Options.load());
   }
   void disableMemoryTagging() {
     // If we haven't been initialized yet, we need to initialize now in order to
@@ -897,7 +903,7 @@ class Allocator {
     // callback), which may cause mappings to be created with memory tagging
     // enabled.
     TSDRegistry.initOnceMaybe(this);
-    if (allocatorSupportsMemoryTagging<Config>()) {
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>()) {
       Secondary.disableMemoryTagging();
       Primary.Options.clear(OptionBit::UseMemoryTagging);
     }
@@ -983,7 +989,7 @@ class Allocator {
     // should not be able to crash the crash dumper (crash_dump on Android).
     // See also the get_error_info_fuzzer.
     *ErrorInfo = {};
-    if (!allocatorSupportsMemoryTagging<Config>() ||
+    if (!allocatorSupportsMemoryTagging<AllocatorConfig>() ||
         MemoryAddr + MemorySize < MemoryAddr)
       return;
 
@@ -1032,7 +1038,7 @@ class Allocator {
 
   static_assert(MinAlignment >= sizeof(Chunk::PackedHeader),
                 "Minimal alignment must at least cover a chunk header.");
-  static_assert(!allocatorSupportsMemoryTagging<Config>() ||
+  static_assert(!allocatorSupportsMemoryTagging<AllocatorConfig>() ||
                     MinAlignment >= archMemoryTagGranuleSize(),
                 "");
 
@@ -1142,7 +1148,7 @@ class Allocator {
     const uptr SizeOrUnusedBytes = Header->SizeOrUnusedBytes;
     if (LIKELY(Header->ClassId))
       return SizeOrUnusedBytes;
-    if (allocatorSupportsMemoryTagging<Config>())
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>())
       Ptr = untagPointer(const_cast<void *>(Ptr));
     return SecondaryT::getBlockEnd(getBlockBegin(Ptr, Header)) -
            reinterpret_cast<uptr>(Ptr) - SizeOrUnusedBytes;
@@ -1162,12 +1168,12 @@ class Allocator {
       Header->State = Chunk::State::Available;
     else
       Header->State = Chunk::State::Quarantined;
-    Header->OriginOrWasZeroed = useMemoryTagging<Config>(Options) &&
+    Header->OriginOrWasZeroed = useMemoryTagging<AllocatorConfig>(Options) &&
                                 Header->ClassId &&
                                 !TSDRegistry.getDisableMemInit();
     Chunk::storeHeader(Cookie, Ptr, Header);
 
-    if (UNLIKELY(useMemoryTagging<Config>(Options))) {
+    if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
       u8 PrevTag = extractTag(reinterpret_cast<uptr>(TaggedPtr));
       storeDeallocationStackMaybe(Options, Ptr, PrevTag, Size);
       if (Header->ClassId) {
@@ -1184,7 +1190,7 @@ class Allocator {
       }
     }
     if (BypassQuarantine) {
-      if (allocatorSupportsMemoryTagging<Config>())
+      if (allocatorSupportsMemoryTagging<AllocatorConfig>())
         Ptr = untagPointer(Ptr);
       void *BlockBegin = getBlockBegin(Ptr, Header);
       const uptr ClassId = Header->ClassId;
@@ -1201,7 +1207,7 @@ class Allocator {
         if (CacheDrained)
           Primary.tryReleaseToOS(ClassId, ReleaseToOS::Normal);
       } else {
-        if (UNLIKELY(useMemoryTagging<Config>(Options)))
+        if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options)))
           storeTags(reinterpret_cast<uptr>(BlockBegin),
                     reinterpret_cast<uptr>(Ptr));
         Secondary.deallocate(Options, BlockBegin);
diff --git a/compiler-rt/lib/scudo/standalone/condition_variable.h b/compiler-rt/lib/scudo/standalone/condition_variable.h
index 4afebdc9d04c2..3f16c86651e73 100644
--- a/compiler-rt/lib/scudo/standalone/condition_variable.h
+++ b/compiler-rt/lib/scudo/standalone/condition_variable.h
@@ -39,22 +39,6 @@ class ConditionVariableDummy
   }
 };
 
-template <typename Config, typename = const bool>
-struct ConditionVariableState {
-  static constexpr bool enabled() { return false; }
-  // This is only used for compilation purpose so that we won't end up having
-  // many conditional compilations. If you want to use `ConditionVariableDummy`,
-  // define `ConditionVariableT` in your allocator configuration. See
-  // allocator_config.h for more details.
-  using ConditionVariableT = ConditionVariableDummy;
-};
-
-template <typename Config>
-struct ConditionVariableState<Config, decltype(Config::UseConditionVariable)> {
-  static constexpr bool enabled() { return Config::UseConditionVariable; }
-  using ConditionVariableT = typename Config::ConditionVariableT;
-};
-
 } // namespace scudo
 
 #endif // SCUDO_CONDITION_VARIABLE_H_
diff --git a/compiler-rt/lib/scudo/standalone/memtag.h b/compiler-rt/lib/scudo/standalone/memtag.h
index aaed2192ad752..1f6983e99404a 100644
--- a/compiler-rt/lib/scudo/standalone/memtag.h
+++ b/compiler-rt/lib/scudo/standalone/memtag.h
@@ -326,7 +326,7 @@ inline void *addFixedTag(void *Ptr, uptr Tag) {
 
 template <typename Config>
 inline constexpr bool allocatorSupportsMemoryTagging() {
-  return archSupportsMemoryTagging() && Config::MaySupportMemoryTagging &&
+  return archSupportsMemoryTagging() && Config::getMaySupportMemoryTagging() &&
          (1 << SCUDO_MIN_ALIGNMENT_LOG) >= archMemoryTagGranuleSize();
 }
 
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index c86e75b8fd66a..1d8a77b73e5c2 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -43,14 +43,13 @@ namespace scudo {
 
 template <typename Config> class SizeClassAllocator32 {
 public:
-  typedef typename Config::Primary::CompactPtrT CompactPtrT;
-  typedef typename Config::Primary::SizeClassMap SizeClassMap;
-  static const uptr GroupSizeLog = Config::Primary::GroupSizeLog;
+  typedef typename Config::CompactPtrT CompactPtrT;
+  typedef typename Config::SizeClassMap SizeClassMap;
+  static const uptr GroupSizeLog = Config::getGroupSizeLog();
   // The bytemap can only track UINT8_MAX - 1 classes.
   static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), "");
   // Regions should be large enough to hold the largest Block.
-  static_assert((1UL << Config::Primary::RegionSizeLog) >=
-                    SizeClassMap::MaxSize,
+  static_assert((1UL << Config::getRegionSizeLog()) >= SizeClassMap::MaxSize,
                 "");
   typedef SizeClassAllocator32<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
@@ -331,9 +330,9 @@ template <typename Config> class SizeClassAllocator32 {
 
   bool setOption(Option O, sptr Value) {
     if (O == Option::ReleaseInterval) {
-      const s32 Interval = Max(Min(static_cast<s32>(Value),
-                                   Config::Primary::MaxReleaseToOsIntervalMs),
-                               Config::Primary::MinReleaseToOsIntervalMs);
+      const s32 Interval = Max(
+          Min(static_cast<s32>(Value), Config::getMaxReleaseToOsIntervalMs()),
+          Config::getMinReleaseToOsIntervalMs());
       atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval);
       return true;
     }
@@ -373,9 +372,9 @@ template <typename Config> class SizeClassAllocator32 {
 
 private:
   static const uptr NumClasses = SizeClassMap::NumClasses;
-  static const uptr RegionSize = 1UL << Config::Primary::RegionSizeLog;
-  static const uptr NumRegions =
-      SCUDO_MMAP_RANGE_SIZE >> Config::Primary::RegionSizeLog;
+  static const uptr RegionSize = 1UL << Config::getRegionSizeLog();
+  static const uptr NumRegions = SCUDO_MMAP_RANGE_SIZE >>
+                                 Config::getRegionSizeLog();
   static const u32 MaxNumBatches = SCUDO_ANDROID ? 4U : 8U;
   typedef FlatByteMap<NumRegions> ByteMap;
 
@@ -408,7 +407,7 @@ template <typename Config> class SizeClassAllocator32 {
   static_assert(sizeof(SizeClassInfo) % SCUDO_CACHE_LINE_SIZE == 0, "");
 
   uptr computeRegionId(uptr Mem) {
-    const uptr Id = Mem >> Config::Primary::RegionSizeLog;
+    const uptr Id = Mem >> Config::getRegionSizeLog();
     CHECK_LT(Id, NumRegions);
     return Id;
   }
@@ -437,7 +436,7 @@ template <typename Config> class SizeClassAllocator32 {
       unmap(reinterpret_cast<void *>(End), MapEnd - End);
 
     DCHECK_EQ(Region % RegionSize, 0U);
-    static_assert(Config::Primary::RegionSizeLog == GroupSizeLog,
+    static_assert(Config::getRegionSizeLog() == GroupSizeLog,
                   "Memory group should be the same size as Region");
 
     return Region;
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index d89a2e6a4e5c8..f5e4ab57b4dfd 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -47,13 +47,12 @@ namespace scudo {
 
 template <typename Config> class SizeClassAllocator64 {
 public:
-  typedef typename Config::Primary::CompactPtrT CompactPtrT;
-  typedef typename Config::Primary::SizeClassMap SizeClassMap;
-  typedef typename ConditionVariableState<
-      typename Config::Primary>::ConditionVariableT ConditionVariableT;
-  static const uptr CompactPtrScale = Config::Primary::CompactPtrScale;
-  static const uptr RegionSizeLog = Config::Primary::RegionSizeLog;
-  static const uptr GroupSizeLog = Config::Primary::GroupSizeLog;
+  typedef typename Config::CompactPtrT CompactPtrT;
+  typedef typename Config::SizeClassMap SizeClassMap;
+  typedef typename Config::ConditionVariableT ConditionVariableT;
+  static const uptr CompactPtrScale = Config::getCompactPtrScale();
+  static const uptr RegionSizeLog = Config::getRegionSizeLog();
+  static const uptr GroupSizeLog = Config::getGroupSizeLog();
   static_assert(RegionSizeLog >= GroupSizeLog,
                 "Group size shouldn't be greater than the region size");
   static const uptr GroupScale = GroupSizeLog - CompactPtrScale;
@@ -74,7 +73,7 @@ template <typename Config> class SizeClassAllocator64 {
   static bool canAllocate(uptr Size) { return Size <= SizeClassMap::MaxSize; }
 
   static bool conditionVariableEnabled() {
-    return ConditionVariableState<typename Config::Primary>::enabled();
+    return Config::hasConditionVariableT();
   }
 
   void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS {
@@ -135,7 +134,7 @@ template <typename Config> class SizeClassAllocator64 {
       // The actual start of a region is offset by a random number of pages
       // when PrimaryEnableRandomOffset is set.
       Region->RegionBeg = (PrimaryBase + (I << RegionSizeLog)) +
-                          (Config::Primary::EnableRandomOffset
+                          (Config::getEnableRandomOffset()
                                ? ((getRandomModN(&Seed, 16) + 1) * PageSize)
                                : 0);
       Region->RandState = getRandomU32(&Seed);
@@ -400,9 +399,9 @@ template <typename Config> class SizeClassAllocator64 {
 
   bool setOption(Option O, sptr Value) {
     if (O == Option::ReleaseInterval) {
-      const s32 Interval = Max(Min(static_cast<s32>(Value),
-                                   Config::Primary::MaxReleaseToOsIntervalMs),
-                               Config::Primary::MinReleaseToOsIntervalMs);
+      const s32 Interval = Max(
+          Min(static_cast<s32>(Value), Config::getMaxReleaseToOsIntervalMs()),
+          Config::getMinReleaseToOsIntervalMs());
       atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval);
       return true;
     }
@@ -516,7 +515,7 @@ template <typename Config> class SizeClassAllocator64 {
   static const uptr NumClasses = SizeClassMap::NumClasses;
   static const uptr PrimarySize = RegionSize * NumClasses;
 
-  static const uptr MapSizeIncrement = Config::Primary::MapSizeIncrement;
+  static const uptr MapSizeIncrement = Config::getMapSizeIncrement();
   // Fill at most this number of batches from the newly map'd memory.
   static const u32 MaxNumBatches = SCUDO_ANDROID ? 4U : 8U;
 
diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index 732fd307ed2f4..202c55cc1a92b 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -173,8 +173,6 @@ template <typename T> class NonZeroLengthArray<T, 0> {
 
 template <typename Config> class MapAllocatorCache {
 public:
-  using CacheConfig = typename Config::Secondary::Cache;
-
   void getStats(ScopedString *Str) {
     ScopedLock L(Mutex);
     uptr Integral;
@@ -199,16 +197,16 @@ template <typename Config> class MapAllocatorCache {
   }
 
   // Ensure the default maximum specified fits the array.
-  static_assert(CacheConfig::DefaultMaxEntriesCount <=
-                    CacheConfig::EntriesArraySize,
+  static_assert(Config::getDefaultMaxEntriesCount() <=
+                    Config::getEntriesArraySize(),
                 "");
 
   void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS {
     DCHECK_EQ(EntriesCount, 0U);
     setOption(Option::MaxCacheEntriesCount,
-              static_cast<sptr>(CacheConfig::DefaultMaxEntriesCount));
+              static_cast<sptr>(Config::getDefaultMaxEntriesCount()));
     setOption(Option::MaxCacheEntrySize,
-              static_cast<sptr>(CacheConfig::DefaultMaxEntrySize));
+              static_cast<sptr>(Config::getDefaultMaxEntrySize()));
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
 
@@ -253,9 +251,9 @@ template <typename Config> class MapAllocatorCache {
         // just unmap it.
         break;
       }
-      if (CacheConfig::QuarantineSize && useMemoryTagging<Config>(Options)) {
+      if (Config::getQuarantineSize() && useMemoryTagging<Config>(Options)) {
         QuarantinePos =
-            (QuarantinePos + 1) % Max(CacheConfig::QuarantineSize, 1u);
+            (QuarantinePos + 1) % Max(Config::getQuarantineSize(), 1u);
         if (!Quarantine[QuarantinePos].isValid()) {
           Quarantine[QuarantinePos] = Entry;
           return;
@@ -382,14 +380,14 @@ template <typename Config> class MapAllocatorCache {
   bool setOption(Option O, sptr Value) {
     if (O == Option::ReleaseInterval) {
       const s32 Interval = Max(
-          Min(static_cast<s32>(Value), CacheConfig::MaxReleaseToOsIntervalMs),
-          CacheConfig::MinReleaseToOsIntervalMs);
+          Min(static_cast<s32>(Value), Config::getMaxReleaseToOsIntervalMs()),
+          Config::getMinReleaseToOsIntervalMs());
       atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval);
       return true;
     }
     if (O == Option::MaxCacheEntriesCount) {
       const u32 MaxCount = static_cast<u32>(Value);
-      if (MaxCount > CacheConfig::EntriesArraySize)
+      if (MaxCount > Config::getEntriesArraySize())
         return false;
       atomic_store_relaxed(&MaxEntriesCount, MaxCount);
       return true;
@@ -406,7 +404,7 @@ template <typename Config> class MapAllocatorCache {
 
   void disableMemoryTagging() EXCLUDES(Mutex) {
     ScopedLock L(Mutex);
-    for (u32 I = 0; I != CacheConfig::QuarantineSize; ++I) {
+    for (u32 I = 0; I != Config::getQuarantineSize(); ++I) {
       if (Quarantine[I].isValid()) {
         MemMapT &MemMap = Quarantine[I].MemMap;
         MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
@@ -431,11 +429,11 @@ template <typename Config> class MapAllocatorCache {
 
 private:
   void empty() {
-    MemMapT MapInfo[CacheConfig::EntriesArraySize];
+    MemMapT MapInfo[Config::getEntriesArraySize()];
     uptr N = 0;
     {
       ScopedLock L(Mutex);
-      for (uptr I = 0; I < CacheConfig::EntriesArraySize; I++) {
+      for (uptr I = 0; I < Config::getEntriesArraySize(); I++) {
         if (!Entries[I].isValid())
           continue;
         MapInfo[N] = Entries[I].MemMap;
@@ -468,9 +466,9 @@ template <typename Config> class MapAllocatorCache {
     if (!EntriesCount || OldestTime == 0 || OldestTime > Time)
       return;
     OldestTime = 0;
-    for (uptr I = 0; I < CacheConfig::QuarantineSize; I++)
+    for (uptr I = 0; I < Config::getQuarantineSize(); I++)
       releaseIfOlderThan(Quarantine[I], Time);
-    for (uptr I = 0; I < CacheConfig::EntriesArraySize; I++)
+    for (uptr I = 0; I < Config::getEntriesArraySize(); I++)
       releaseIfOlderThan(Entries[I], Time);
   }
 
@@ -485,8 +483,8 @@ template <typename Config> class MapAllocatorCache {
   u32 CallsToRetrieve GUARDED_BY(Mutex) = 0;
   u32 SuccessfulRetrieves GUARDED_BY(Mutex) = 0;
 
-  CachedBlock Entries[CacheConfig::EntriesArraySize] GUARDED_BY(Mutex) = {};
-  NonZeroLengthArray<CachedBlock, CacheConfig::QuarantineSize>
+  CachedBlock Entries[Config::getEntriesArraySize()] GUARDED_BY(Mutex) = {};
+  NonZeroLengthArray<CachedBlock, Config::getQuarantineSize()>
       Quarantine GUARDED_BY(Mutex) = {};
 };
 
@@ -555,7 +553,7 @@ template <typename Config> class MapAllocator {
   void getStats(ScopedString *Str);
 
 private:
-  typename Config::Secondary::template CacheT<Config> Cache;
+  typename Config::template CacheT<typename Config::CacheConfig> Cache;
 
   mutable HybridMutex Mutex;
   DoublyLinkedList<LargeBlock::Header> InUseBlocks GUARDED_BY(Mutex);
diff --git a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
index c6b6a1cb57cee..1786756fa5ea6 100644
--- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
@@ -81,7 +81,7 @@ macro(add_scudo_unittest testname)
         "${testname}-${arch}-Test" ${arch}
         SOURCES ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}
         COMPILE_DEPS ${SCUDO_TEST_HEADERS}
-        DEPS llvm_gtest scudo_standalone
+        DEPS scudo_standalone
         RUNTIME ${RUNTIME}
         CFLAGS ${SCUDO_UNITTEST_CFLAGS}
         LINK_FLAGS ${SCUDO_UNITTEST_LINK_FLAGS})
@@ -90,6 +90,7 @@ macro(add_scudo_unittest testname)
 endmacro()
 
 set(SCUDO_UNIT_TEST_SOURCES
+  allocator_config_test.cpp
   atomic_test.cpp
   bytemap_test.cpp
   checksum_test.cpp
diff --git a/compiler-rt/lib/scudo/standalone/tests/allocator_config_test.cpp b/compiler-rt/lib/scudo/standalone/tests/allocator_config_test.cpp
new file mode 100644
index 0000000000000..4c4ceb832e27b
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/tests/allocator_config_test.cpp
@@ -0,0 +1,119 @@
+//===-- allocator_config_test.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "tests/scudo_unit_test.h"
+
+#include "allocator_config.h"
+#include "allocator_config_wrapper.h"
+#include "common.h"
+#include "secondary.h"
+
+#include <type_traits>
+
+struct TestBaseConfig {
+  template <typename> using TSDRegistryT = void;
+  template <typename> using PrimaryT = void;
+  template <typename> using SecondaryT = void;
+};
+
+struct TestBaseConfigEnableOptionalFlag : public TestBaseConfig {
+  static const bool MaySupportMemoryTagging = true;
+  // Use the getter to avoid the test to `use` the address of static const
+  // variable (which requires additional explicit definition).
+  static bool getMaySupportMemoryTagging() { return MaySupportMemoryTagging; }
+};
+
+struct TestBasePrimaryConfig {
+  using SizeClassMap = void;
+  static const scudo::uptr RegionSizeLog = 18U;
+  static const scudo::uptr GroupSizeLog = 18U;
+  static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX;
+  typedef scudo::uptr CompactPtrT;
+  static const scudo::uptr CompactPtrScale = 0;
+  static const scudo::uptr MapSizeIncrement = 1UL << 18;
+};
+
+struct TestPrimaryConfig : public TestBaseConfig {
+  struct Primary : TestBasePrimaryConfig {};
+};
+
+struct TestPrimaryConfigEnableOptionalFlag : public TestBaseConfig {
+  struct Primary : TestBasePrimaryConfig {
+    static const bool EnableRandomOffset = true;
+    static bool getEnableRandomOffset() { return EnableRandomOffset; }
+  };
+};
+
+struct TestPrimaryConfigEnableOptionalType : public TestBaseConfig {
+  struct DummyConditionVariable {};
+
+  struct Primary : TestBasePrimaryConfig {
+    using ConditionVariableT = DummyConditionVariable;
+  };
+};
+
+struct TestSecondaryConfig : public TestPrimaryConfig {
+  struct Secondary {
+    template <typename Config>
+    using CacheT = scudo::MapAllocatorNoCache<Config>;
+  };
+};
+
+struct TestSecondaryCacheConfigEnableOptionalFlag : public TestPrimaryConfig {
+  struct Secondary {
+    struct Cache {
+      static const scudo::u32 EntriesArraySize = 256U;
+      static scudo::u32 getEntriesArraySize() { return EntriesArraySize; }
+    };
+    template <typename T> using CacheT = scudo::MapAllocatorCache<T>;
+  };
+};
+
+TEST(ScudoAllocatorConfigTest, VerifyOptionalFlags) {
+  // Test the top level allocator optional config.
+  //
+  // `MaySupportMemoryTagging` is default off.
+  EXPECT_FALSE(scudo::BaseConfig<TestBaseConfig>::getMaySupportMemoryTagging());
+  EXPECT_EQ(scudo::BaseConfig<
+                TestBaseConfigEnableOptionalFlag>::getMaySupportMemoryTagging(),
+            TestBaseConfigEnableOptionalFlag::getMaySupportMemoryTagging());
+
+  // Test primary optional config.
+  //
+  // `EnableRandomeOffset` is default off.
+  EXPECT_FALSE(
+      scudo::PrimaryConfig<TestPrimaryConfig>::getEnableRandomOffset());
+  EXPECT_EQ(
+      scudo::PrimaryConfig<
+          TestPrimaryConfigEnableOptionalFlag>::getEnableRandomOffset(),
+      TestPrimaryConfigEnableOptionalFlag::Primary::getEnableRandomOffset());
+
+  // `ConditionVariableT` is default off.
+  EXPECT_FALSE(
+      scudo::PrimaryConfig<TestPrimaryConfig>::hasConditionVariableT());
+  EXPECT_TRUE(scudo::PrimaryConfig<
+              TestPrimaryConfigEnableOptionalType>::hasConditionVariableT());
+  EXPECT_TRUE((std::is_same_v<
+               typename scudo::PrimaryConfig<
+                   TestPrimaryConfigEnableOptionalType>::ConditionVariableT,
+               typename TestPrimaryConfigEnableOptionalType::Primary::
+                   ConditionVariableT>));
+
+  // Test secondary cache optional config.
+  using NoCacheConfig =
+      scudo::SecondaryConfig<TestSecondaryConfig>::CacheConfig;
+  // `EntriesArraySize` is default 0.
+  EXPECT_EQ(NoCacheConfig::getEntriesArraySize(), 0U);
+
+  using CacheConfig = scudo::SecondaryConfig<
+      TestSecondaryCacheConfigEnableOptionalFlag>::CacheConfig;
+  EXPECT_EQ(CacheConfig::getEntriesArraySize(),
+            TestSecondaryCacheConfigEnableOptionalFlag::Secondary::Cache::
+                getEntriesArraySize());
+}
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 13d627b116809..6a311adc55e4b 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -190,7 +190,6 @@ struct TestConditionVariableConfig {
 #endif
     static const scudo::s32 MinReleaseToOsIntervalMs = 1000;
     static const scudo::s32 MaxReleaseToOsIntervalMs = 1000;
-    static const bool UseConditionVariable = true;
 #if SCUDO_LINUX
     using ConditionVariableT = scudo::ConditionVariableLinux;
 #else
diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
index f64a5143b30d4..683ce3e596596 100644
--- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
@@ -9,6 +9,7 @@
 #include "tests/scudo_unit_test.h"
 
 #include "allocator_config.h"
+#include "allocator_config_wrapper.h"
 #include "condition_variable.h"
 #include "primary32.h"
 #include "primary64.h"
@@ -29,6 +30,9 @@
 
 template <typename SizeClassMapT> struct TestConfig1 {
   static const bool MaySupportMemoryTagging = false;
+  template <typename> using TSDRegistryT = void;
+  template <typename> using PrimaryT = void;
+  template <typename> using SecondaryT = void;
 
   struct Primary {
     using SizeClassMap = SizeClassMapT;
@@ -45,6 +49,9 @@ template <typename SizeClassMapT> struct TestConfig1 {
 
 template <typename SizeClassMapT> struct TestConfig2 {
   static const bool MaySupportMemoryTagging = false;
+  template <typename> using TSDRegistryT = void;
+  template <typename> using PrimaryT = void;
+  template <typename> using SecondaryT = void;
 
   struct Primary {
     using SizeClassMap = SizeClassMapT;
@@ -66,6 +73,9 @@ template <typename SizeClassMapT> struct TestConfig2 {
 
 template <typename SizeClassMapT> struct TestConfig3 {
   static const bool MaySupportMemoryTagging = true;
+  template <typename> using TSDRegistryT = void;
+  template <typename> using PrimaryT = void;
+  template <typename> using SecondaryT = void;
 
   struct Primary {
     using SizeClassMap = SizeClassMapT;
@@ -87,6 +97,9 @@ template <typename SizeClassMapT> struct TestConfig3 {
 
 template <typename SizeClassMapT> struct TestConfig4 {
   static const bool MaySupportMemoryTagging = true;
+  template <typename> using TSDRegistryT = void;
+  template <typename> using PrimaryT = void;
+  template <typename> using SecondaryT = void;
 
   struct Primary {
     using SizeClassMap = SizeClassMapT;
@@ -109,6 +122,9 @@ template <typename SizeClassMapT> struct TestConfig4 {
 // This is the only test config that enables the condition variable.
 template <typename SizeClassMapT> struct TestConfig5 {
   static const bool MaySupportMemoryTagging = true;
+  template <typename> using TSDRegistryT = void;
+  template <typename> using PrimaryT = void;
+  template <typename> using SecondaryT = void;
 
   struct Primary {
     using SizeClassMap = SizeClassMapT;
@@ -125,7 +141,6 @@ template <typename SizeClassMapT> struct TestConfig5 {
     typedef scudo::u32 CompactPtrT;
     static const bool EnableRandomOffset = true;
     static const scudo::uptr MapSizeIncrement = 1UL << 18;
-    static const bool UseConditionVariable = true;
 #if SCUDO_LINUX
     using ConditionVariableT = scudo::ConditionVariableLinux;
 #else
@@ -139,10 +154,12 @@ struct Config : public BaseConfig<SizeClassMapT> {};
 
 template <template <typename> class BaseConfig, typename SizeClassMapT>
 struct SizeClassAllocator
-    : public scudo::SizeClassAllocator64<Config<BaseConfig, SizeClassMapT>> {};
+    : public scudo::SizeClassAllocator64<
+          scudo::PrimaryConfig<Config<BaseConfig, SizeClassMapT>>> {};
 template <typename SizeClassMapT>
 struct SizeClassAllocator<TestConfig1, SizeClassMapT>
-    : public scudo::SizeClassAllocator32<Config<TestConfig1, SizeClassMapT>> {};
+    : public scudo::SizeClassAllocator32<
+          scudo::PrimaryConfig<Config<TestConfig1, SizeClassMapT>>> {};
 
 template <template <typename> class BaseConfig, typename SizeClassMapT>
 struct TestAllocator : public SizeClassAllocator<BaseConfig, SizeClassMapT> {
@@ -219,6 +236,9 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, BasicPrimary) {
 
 struct SmallRegionsConfig {
   static const bool MaySupportMemoryTagging = false;
+  template <typename> using TSDRegistryT = void;
+  template <typename> using PrimaryT = void;
+  template <typename> using SecondaryT = void;
 
   struct Primary {
     using SizeClassMap = scudo::DefaultSizeClassMap;
@@ -236,7 +256,8 @@ struct SmallRegionsConfig {
 // The 64-bit SizeClassAllocator can be easily OOM'd with small region sizes.
 // For the 32-bit one, it requires actually exhausting memory, so we skip it.
 TEST(ScudoPrimaryTest, Primary64OOM) {
-  using Primary = scudo::SizeClassAllocator64<SmallRegionsConfig>;
+  using Primary =
+      scudo::SizeClassAllocator64<scudo::PrimaryConfig<SmallRegionsConfig>>;
   Primary Allocator;
   Allocator.init(/*ReleaseToOsInterval=*/-1);
   typename Primary::CacheT Cache;
diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
index 18d2e187fa3ce..8f0250e88ebf3 100644
--- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
@@ -10,6 +10,7 @@
 #include "tests/scudo_unit_test.h"
 
 #include "allocator_config.h"
+#include "allocator_config_wrapper.h"
 #include "secondary.h"
 
 #include <algorithm>
@@ -22,7 +23,8 @@
 #include <vector>
 
 template <typename Config> static scudo::Options getOptionsForConfig() {
-  if (!Config::MaySupportMemoryTagging || !scudo::archSupportsMemoryTagging() ||
+  if (!Config::getMaySupportMemoryTagging() ||
+      !scudo::archSupportsMemoryTagging() ||
       !scudo::systemSupportsMemoryTagging())
     return {};
   scudo::AtomicOptions AO;
@@ -31,8 +33,9 @@ template <typename Config> static scudo::Options getOptionsForConfig() {
 }
 
 template <typename Config> static void testSecondaryBasic(void) {
-  using SecondaryT = scudo::MapAllocator<Config>;
-  scudo::Options Options = getOptionsForConfig<Config>();
+  using SecondaryT = scudo::MapAllocator<scudo::SecondaryConfig<Config>>;
+  scudo::Options Options =
+      getOptionsForConfig<scudo::SecondaryConfig<Config>>();
 
   scudo::GlobalStats S;
   S.init();
@@ -84,6 +87,10 @@ template <typename Config> static void testSecondaryBasic(void) {
 
 struct NoCacheConfig {
   static const bool MaySupportMemoryTagging = false;
+  template <typename> using TSDRegistryT = void;
+  template <typename> using PrimaryT = void;
+  template <typename Config> using SecondaryT = scudo::MapAllocator<Config>;
+
   struct Secondary {
     template <typename Config>
     using CacheT = scudo::MapAllocatorNoCache<Config>;
@@ -92,6 +99,10 @@ struct NoCacheConfig {
 
 struct TestConfig {
   static const bool MaySupportMemoryTagging = false;
+  template <typename> using TSDRegistryT = void;
+  template <typename> using PrimaryT = void;
+  template <typename> using SecondaryT = void;
+
   struct Secondary {
     struct Cache {
       static const scudo::u32 EntriesArraySize = 128U;
@@ -114,7 +125,7 @@ TEST(ScudoSecondaryTest, SecondaryBasic) {
 
 struct MapAllocatorTest : public Test {
   using Config = scudo::DefaultConfig;
-  using LargeAllocator = scudo::MapAllocator<Config>;
+  using LargeAllocator = scudo::MapAllocator<scudo::SecondaryConfig<Config>>;
 
   void SetUp() override { Allocator->init(nullptr); }
 
@@ -122,7 +133,8 @@ struct MapAllocatorTest : public Test {
 
   std::unique_ptr<LargeAllocator> Allocator =
       std::make_unique<LargeAllocator>();
-  scudo::Options Options = getOptionsForConfig<Config>();
+  scudo::Options Options =
+      getOptionsForConfig<scudo::SecondaryConfig<Config>>();
 };
 
 // This exercises a variety of combinations of size and alignment for the
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
index 8ffc703b05eac..2bebe651b994e 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
@@ -126,6 +126,7 @@ const int SIGFPE = 8;
 const int SIGSEGV = 11;
 const int SIGPIPE = 13;
 const int SIGTERM = 15;
+const int SIGPROF = 27;
 #if defined(__mips__) || SANITIZER_FREEBSD || SANITIZER_APPLE || SANITIZER_NETBSD
 const int SIGBUS = 10;
 const int SIGSYS = 12;
@@ -2168,7 +2169,8 @@ static bool is_sync_signal(ThreadSignalContext *sctx, int sig,
     return false;
 #endif
   return sig == SIGSEGV || sig == SIGBUS || sig == SIGILL || sig == SIGTRAP ||
-         sig == SIGABRT || sig == SIGFPE || sig == SIGPIPE || sig == SIGSYS;
+         sig == SIGABRT || sig == SIGFPE || sig == SIGPIPE || sig == SIGSYS ||
+         sig == SIGPROF;
 }
 
 void sighandler(int sig, __sanitizer_siginfo *info, void *ctx) {
diff --git a/compiler-rt/lib/tsan/tests/CMakeLists.txt b/compiler-rt/lib/tsan/tests/CMakeLists.txt
index c02c2279583bd..ad8cc9b0eb05a 100644
--- a/compiler-rt/lib/tsan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/tsan/tests/CMakeLists.txt
@@ -64,7 +64,7 @@ foreach (header ${TSAN_HEADERS})
   list(APPEND TSAN_RTL_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../${header})
 endforeach()
 
-set(TSAN_DEPS llvm_gtest tsan)
+set(TSAN_DEPS tsan)
 # TSan uses C++ standard library headers.
 if (TARGET cxx-headers OR HAVE_LIBCXX)
   set(TSAN_DEPS cxx-headers)
diff --git a/compiler-rt/lib/xray/tests/CMakeLists.txt b/compiler-rt/lib/xray/tests/CMakeLists.txt
index 732f982c932f0..0a428b9a30b18 100644
--- a/compiler-rt/lib/xray/tests/CMakeLists.txt
+++ b/compiler-rt/lib/xray/tests/CMakeLists.txt
@@ -109,7 +109,7 @@ macro(add_xray_unittest testname)
         ${XRAY_HEADERS} ${XRAY_ALL_SOURCE_FILES_ABS_PATHS}
         "test_helpers.h"
         RUNTIME "${XRAY_RUNTIME_LIBS}"
-        DEPS llvm_gtest xray llvm-xray LLVMXRay LLVMTestingSupport
+        DEPS xray llvm-xray LLVMXRay LLVMTestingSupport
         CFLAGS ${XRAY_UNITTEST_CFLAGS}
         LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS}
         )
diff --git a/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
index 8e1a2ae149148..12ed505883e27 100644
--- a/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
@@ -21,13 +21,13 @@ void make_access(S *s) {
 int main(void) {
   S *s = (S*)malloc(sizeof(S));
   free(s);
-// CHECK: [[ADDR]] is located 0 bytes inside of 4-byte region
-// CHECK-LABEL: freed by thread T0 here:
-// CHECK:   {{#0 .* free }}
-// CHECK:   {{#1 .* main .*bitfield_uaf.cpp}}:[[@LINE-4]]
-// CHECK-LABEL: previously allocated by thread T0 here:
-// CHECK:   {{#0 .* malloc }}
-// CHECK:   {{#1 .* main .*bitfield_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 4-byte region
+  // CHECK-LABEL: freed by thread T0 here:
+  // CHECK:   {{#0 .* free }}
+  // CHECK:   {{ #[1-2] .* main .*bitfield_uaf.cpp}}:[[@LINE-4]]
+  // CHECK-LABEL: previously allocated by thread T0 here:
+  // CHECK:   {{#0 .* malloc }}
+  // CHECK:   {{ #[1-2] .* main .*bitfield_uaf.cpp}}:[[@LINE-8]]
   make_access(s);
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
index b22c359b3dc42..e96fb6190f5a8 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   int *buffer = (int*)calloc(42, sizeof(int));
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 4 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*calloc_left_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 4 bytes before 168-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* calloc }}
-// CHECK-NEXT: {{#1 .* main .*calloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 4 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*calloc_left_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 4 bytes before 168-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK: {{#0 .* calloc }}
+  // CHECK: {{ #[1-2] .* main .*calloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
index 9e12f9cf653e0..fe0fc20e19190 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   int *buffer = (int*)calloc(42, sizeof(int));
   buffer[42] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 4 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*calloc_right_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes after 168-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* calloc }}
-// CHECK-NEXT: {{#1 .* main .*calloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 4 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*calloc_right_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes after 168-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* calloc }}
+  // CHECK: {{ #[1-2] .* main .*calloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
index 6c225d4c676d0..bf13f7d3eb662 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
@@ -7,14 +7,14 @@ int main() {
   int *buffer = (int*)calloc(42, sizeof(int));
   free(buffer);
   buffer[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 4 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*calloc_uaf.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
-// CHECK: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*calloc_uaf.cpp}}:[[@LINE-8]]
-// CHECK: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* calloc }}
-// CHECK-NEXT: {{#1 .* main .*calloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 4 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*calloc_uaf.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
+  // CHECK: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*calloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* calloc }}
+  // CHECK: {{ #[1-2] .* main .*calloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_heap_allocation.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_heap_allocation.cpp
index b8c2c1a24d4ce..3606b1c816317 100644
--- a/compiler-rt/test/asan/TestCases/Windows/dll_heap_allocation.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/dll_heap_allocation.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cl %LD %s %Fe%t.dll -DHEAP_LIBRARY %MD \
 // RUN:   %if target={{.*-windows-gnu}} %{ -Wl,--out-implib,%t.lib %}
-// RUN: %clang_cl %s %t.lib %Fe%t -fsanitize=address %MT
+// RUN: %clang_cl_asan %s %t.lib %Fe%t
 // RUN: %run %t 2>&1 | FileCheck %s
 
 // Check that ASan does not fail when releasing allocations that occurred within
diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_host.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_host.cpp
index 0757af90d211c..85b7967e86b51 100644
--- a/compiler-rt/test/asan/TestCases/Windows/dll_host.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/dll_host.cpp
@@ -5,48 +5,6 @@
 // Just make sure we can compile this.
 // The actual compile&run sequence is to be done by the DLL tests.
 // RUN: %clang_cl_asan -Od %s -Fe%t
-//
-// Get the list of ASan wrappers exported by the main module RTL:
-// note: The mangling decoration (i.e. @4 )is removed because calling convention
-//       differ from 32-bit and 64-bit.
-// RUN: dumpbin /EXPORTS %t | grep -o "__asan_wrap[^ ]*" | sed -e s/@.*// > %t.exports1
-//
-// The exception handlers differ in 32-bit and 64-bit, so we ignore them:
-// RUN: grep '[E]XPORT:' %s | sed -e 's/.*[E]XPORT: //' > %t.exports2
-// EXPORT: __asan_wrap__except_handler3
-// EXPORT: __asan_wrap__except_handler4
-// EXPORT: __asan_wrap___C_specific_handler
-//
-// Get the list of ASan wrappers imported by the DLL RTL:
-// [BEWARE: be really careful with the sed commands, as this test can be run
-//  from different environments with different shells and seds]
-// RUN: grep INTERCEPT_LIBRARY_FUNCTION %p/../../../../lib/asan/asan_win_dll_thunk.cpp \
-// RUN:  | grep -v define | sed -e s/.*(/__asan_wrap_/ -e s/).*//              \
-// RUN:  > %t.imports1
-//
-// Add functions interecepted in asan_malloc.win.cpp and asan_win.cpp.
-// RUN: grep '[I]MPORT:' %s | sed -e 's/.*[I]MPORT: //' > %t.imports2
-// IMPORT: __asan_wrap_HeapAlloc
-// IMPORT: __asan_wrap_HeapFree
-// IMPORT: __asan_wrap_HeapReAlloc
-// IMPORT: __asan_wrap_HeapSize
-// IMPORT: __asan_wrap_CreateThread
-// IMPORT: __asan_wrap_RaiseException
-// IMPORT: __asan_wrap_RtlRaiseException
-// IMPORT: __asan_wrap_SetUnhandledExceptionFilter
-// IMPORT: __asan_wrap_RtlSizeHeap
-// IMPORT: __asan_wrap_RtlAllocateHeap
-// IMPORT: __asan_wrap_RtlReAllocateHeap
-// IMPORT: __asan_wrap_RtlFreeHeap
-//
-// RUN: cat %t.imports1 %t.imports2 | sort | uniq > %t.imports-sorted
-// RUN: cat %t.exports1 %t.exports2 | sort | uniq > %t.exports-sorted
-//
-// Now make sure the DLL thunk imports everything:
-// RUN: echo
-// RUN: echo "=== NOTE === If you see a mismatch below, please update asan_win_dll_thunk.cpp"
-// RUN: diff %t.imports-sorted %t.exports-sorted
-// REQUIRES: asan-static-runtime
 
 #include <stdio.h>
 #include <windows.h>
diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_malloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_malloc_left_oob.cpp
index 6d550eb966cda..5ca48fbd68384 100644
--- a/compiler-rt/test/asan/TestCases/Windows/dll_malloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/dll_malloc_left_oob.cpp
@@ -7,17 +7,17 @@ extern "C" __declspec(dllexport)
 int test_function() {
   char *buffer = (char*)malloc(42);
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: test_function {{.*}}dll_malloc_left_oob.cpp:[[@LINE-3]]
-// CHECK-NEXT: main {{.*}}dll_host.cpp
-//
-// CHECK: [[ADDR]] is located 1 bytes before 42-byte region
-// CHECK-LABEL: allocated by thread T0 here:
-// CHECK-NEXT:   malloc
-// CHECK-NEXT:   test_function {{.*}}dll_malloc_left_oob.cpp:[[@LINE-10]]
-// CHECK-NEXT:   main {{.*}}dll_host.cpp
-// CHECK-LABEL: SUMMARY
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: test_function {{.*}}dll_malloc_left_oob.cpp:[[@LINE-3]]
+  // CHECK-NEXT: main {{.*}}dll_host.cpp
+  //
+  // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
+  // CHECK-LABEL: allocated by thread T0 here:
+  // CHECK-NEXT:   malloc
+  // CHECK:   test_function {{.*}}dll_malloc_left_oob.cpp:[[@LINE-10]]
+  // CHECK-NEXT:   main {{.*}}dll_host.cpp
+  // CHECK-LABEL: SUMMARY
   free(buffer);
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_malloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_malloc_uaf.cpp
index bc701e92961c9..ae23f86d7ddf4 100644
--- a/compiler-rt/test/asan/TestCases/Windows/dll_malloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/dll_malloc_uaf.cpp
@@ -9,20 +9,20 @@ int test_function() {
   int *buffer = (int*)malloc(42);
   free(buffer);
   buffer[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 4 at [[ADDR]] thread T0
-// CHECK-NEXT:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-3]]
-// CHECK-NEXT:  main {{.*}}dll_host
-//
-// CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
-// CHECK-LABEL: freed by thread T0 here:
-// CHECK-NEXT:  free
-// CHECK-NEXT:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-10]]
-// CHECK-NEXT:  main {{.*}}dll_host
-//
-// CHECK-LABEL: previously allocated by thread T0 here:
-// CHECK-NEXT:  malloc
-// CHECK-NEXT:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-16]]
-// CHECK-NEXT:  main {{.*}}dll_host
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 4 at [[ADDR]] thread T0
+  // CHECK-NEXT:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-3]]
+  // CHECK-NEXT:  main {{.*}}dll_host
+  //
+  // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
+  // CHECK-LABEL: freed by thread T0 here:
+  // CHECK-NEXT:  free
+  // CHECK:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-10]]
+  // CHECK-NEXT:  main {{.*}}dll_host
+  //
+  // CHECK-LABEL: previously allocated by thread T0 here:
+  // CHECK-NEXT:  malloc
+  // CHECK:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-16]]
+  // CHECK-NEXT:  main {{.*}}dll_host
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/double_free.cpp b/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
index 45568e50d0c82..e288b40fac47a 100644
--- a/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
@@ -7,15 +7,15 @@ int main() {
   int *x = (int*)malloc(42 * sizeof(int));
   free(x);
   free(x);
-// CHECK: AddressSanitizer: attempting double-free on [[ADDR:0x[0-9a-f]+]]
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*double_free.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
-// CHECK-LABEL: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*double_free.cpp}}:[[@LINE-8]]
-// CHECK-LABEL: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc }}
-// CHECK-NEXT: {{#1 .* main .*double_free.cpp}}:[[@LINE-12]]
+  // CHECK: AddressSanitizer: attempting double-free on [[ADDR:0x[0-9a-f]+]]
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
+  // CHECK-LABEL: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-8]]
+  // CHECK-LABEL: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc }}
+  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-12]]
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/intercept_strdup.cpp b/compiler-rt/test/asan/TestCases/Windows/intercept_strdup.cpp
index b519f53fe4269..fec4d1ec35597 100644
--- a/compiler-rt/test/asan/TestCases/Windows/intercept_strdup.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/intercept_strdup.cpp
@@ -15,18 +15,18 @@ int main() {
 
   subscript = -1;
   ptr[subscript] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK:   {{#0 .* main .*}}intercept_strdup.cpp:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 1 bytes before 6-byte region
-// CHECK: allocated by thread T0 here:
-//
-// The first frame is our wrapper normally but will be malloc in the dynamic
-// config.
-// CHECK:   #0 {{.*}} in {{malloc|strdup}}
-//
-// The local call to _strdup above may be the second or third frame depending
-// on whether we're using the dynamic config.
-// CHECK:   #{{[12]}} {{.*}} in main {{.*}}intercept_strdup.cpp:[[@LINE-21]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK:   {{#0 .* main .*}}intercept_strdup.cpp:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 1 bytes before 6-byte region
+  // CHECK: allocated by thread T0 here:
+  //
+  // The first frame is our wrapper normally but will be malloc in the dynamic
+  // config.
+  // CHECK:   #0 {{.*}} in {{malloc|_strdup}}
+  //
+  // The local call to _strdup above may be the second or third frame depending
+  // on whether we're using the dynamic config.
+  // CHECK:   #{{[12]}} {{.*}} in main {{.*}}intercept_strdup.cpp:[[@LINE-21]]
   free(ptr);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/interface_symbols_windows.cpp b/compiler-rt/test/asan/TestCases/Windows/interface_symbols_windows.cpp
deleted file mode 100644
index 1803911a43a9b..0000000000000
--- a/compiler-rt/test/asan/TestCases/Windows/interface_symbols_windows.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// UNSUPPORTED: target={{.*-windows-gnu}}
-
-// Check that the interface exported by asan static lib matches the list of
-// functions mentioned in sanitizer_interface.inc.
-//
-// Just make sure we can compile this.
-// RUN: %clang_cl_asan -Od %s -Fe%t
-//
-// note: The mangling decoration (i.e. @4 )is removed because calling convention
-//       differ from 32-bit and 64-bit.
-//
-// RUN: dumpbin /EXPORTS %t | sed "s/=.*//"                                    \
-// RUN:   | grep -o "\(__asan_\|__ubsan_\|__sanitizer_\|__sancov_\)[^ ]*"      \
-// RUN:   | grep -v "__asan_wrap"                                              \
-// RUN:   | sed -e s/@.*// > %t.exports
-//
-// [BEWARE: be really careful with the sed commands, as this test can be run
-//  from different environments with different shells and seds]
-//
-// RUN: sed ':a;N;$!ba;s/([\n ]*/(/g'                                          \
-// RUN:  %p/../../../../lib/asan/asan_interface.inc                            \
-// RUN:  %p/../../../../lib/ubsan/ubsan_interface.inc                          \
-// RUN:  %p/../../../../lib/sanitizer_common/sanitizer_common_interface.inc    \
-// RUN:  %p/../../../../lib/sanitizer_common/sanitizer_coverage_interface.inc  \
-// RUN:  | grep -e "^INTERFACE_FUNCTION"                                       \
-// RUN:  | sed -e "s/.*(//" -e "s/).*//" > %t.imports1
-//
-// RUN: sed ':a;N;$!ba;s/([\n ]*/(/g'                                          \
-// RUN:  %p/../../../../lib/asan/asan_interface.inc                            \
-// RUN:  %p/../../../../lib/ubsan/ubsan_interface.inc                          \
-// RUN:  %p/../../../../lib/sanitizer_common/sanitizer_common_interface.inc    \
-// RUN:  %p/../../../../lib/sanitizer_common/sanitizer_coverage_interface.inc  \
-// RUN:  | grep -e "^INTERFACE_WEAK_FUNCTION"                                  \
-// RUN:  | sed -e "s/.*(//" -e "s/).*/__dll/" > %t.imports2
-//
-// Add functions not included in the interface lists:
-// RUN: grep '[I]MPORT:' %s | sed -e 's/.*[I]MPORT: //' > %t.imports3
-// IMPORT: __asan_shadow_memory_dynamic_address
-// IMPORT: __asan_get_shadow_memory_dynamic_address
-// IMPORT: __asan_option_detect_stack_use_after_return
-// IMPORT: __asan_should_detect_stack_use_after_return
-// IMPORT: __asan_set_seh_filter
-// IMPORT: __asan_unhandled_exception_filter
-// IMPORT: __asan_test_only_reported_buggy_pointer
-// IMPORT: __ubsan_vptr_type_cache
-//
-// RUN: cat %t.imports1 %t.imports2 %t.imports3 | sort | uniq > %t.imports-sorted
-// RUN: cat %t.exports | sort | uniq > %t.exports-sorted
-//
-// Now make sure the DLL thunk imports everything:
-// RUN: echo
-// RUN: echo "=== NOTE === If you see a mismatch below, please update interface.inc files."
-// RUN: diff %t.imports-sorted %t.exports-sorted
-// REQUIRES: asan-static-runtime
-
-int main() { return 0; }
diff --git a/compiler-rt/test/asan/TestCases/Windows/issue64990.cpp b/compiler-rt/test/asan/TestCases/Windows/issue64990.cpp
index aab66502bd16f..b1b6e42148cb6 100644
--- a/compiler-rt/test/asan/TestCases/Windows/issue64990.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/issue64990.cpp
@@ -16,5 +16,5 @@ int main(int argc, char **argv) {
   }
   return 0;
 }
-
-// CHECK: SUMMARY: AddressSanitizer: global-buffer-overflow {{.*}} in __asan_memcpy
+// CHECK: #0 {{.*}} in __asan_memcpy
+// CHECK: SUMMARY: AddressSanitizer: global-buffer-overflow {{.*}} in main
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
index 2ee5fdc7abee2..7ea95d2b2184a 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   char *buffer = (char*)malloc(42);
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*malloc_left_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 1 bytes before 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc }}
-// CHECK-NEXT: {{#1 .* main .*malloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*malloc_left_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc }}
+  // CHECK: {{ #[1-2] .* main .*malloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
index dafca74b4812d..1495632456e08 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   char *buffer = (char*)malloc(42);
   buffer[42] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*malloc_right_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes after 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc }}
-// CHECK-NEXT: {{#1 .* main .*malloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*malloc_right_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes after 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc }}
+  // CHECK: {{ #[1-2] .* main .*malloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
index 256582deefe46..d1eac7e55f601 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
@@ -7,14 +7,14 @@ int main() {
   char *buffer = (char*)malloc(42);
   free(buffer);
   buffer[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*malloc_uaf.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
-// CHECK: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*malloc_uaf.cpp}}:[[@LINE-8]]
-// CHECK: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc }}
-// CHECK-NEXT: {{#1 .* main .*malloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*malloc_uaf.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
+  // CHECK: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*malloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc }}
+  // CHECK: {{ #[1-2] .* main .*malloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
index 4a5a7ab8ecfdf..ebde5f159ae38 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   char *buffer = (char*)realloc(0, 42);
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*realloc_left_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 1 bytes before 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*realloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*realloc_left_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*realloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
index 8f3109eed2d07..281efed5d3074 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   char *buffer = (char*)realloc(0, 42);
   buffer[42] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*realloc_right_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes after 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*realloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*realloc_right_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes after 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*realloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
index 074ac270f13f5..6ff2217b11a25 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
@@ -7,14 +7,14 @@ int main() {
   char *buffer = (char*)realloc(0, 42);
   free(buffer);
   buffer[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*realloc_uaf.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
-// CHECK: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*realloc_uaf.cpp}}:[[@LINE-8]]
-// CHECK: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*realloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*realloc_uaf.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
+  // CHECK: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*realloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*realloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp b/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
index d896da482f2eb..be99c89e7083e 100644
--- a/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
@@ -11,12 +11,12 @@
 int main() {
   char *buffer = (char*)malloc(42);
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*symbols_path.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 1 bytes before 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc}}
-// CHECK-NEXT: {{#1 .* main .*symbols_path.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*symbols_path.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc}}
+  // CHECK: {{ #[1-2] .* main .*symbols_path.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp b/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
index dfd25cebb6bfe..4c32c63c38fa1 100644
--- a/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
@@ -9,15 +9,15 @@ int main() {
   buffer = (char*)realloc(buffer, 64);
   // The 'stale' may now point to a free'd memory.
   stale[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*use_after_realloc.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 32-byte region
-// CHECK: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*use_after_realloc.cpp}}:[[@LINE-9]]
-// CHECK: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*use_after_realloc.cpp}}:[[@LINE-14]]
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*use_after_realloc.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 32-byte region
+  // CHECK: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*use_after_realloc.cpp}}:[[@LINE-9]]
+  // CHECK: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*use_after_realloc.cpp}}:[[@LINE-14]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/calloc-overflow.cpp b/compiler-rt/test/asan/TestCases/calloc-overflow.cpp
index 9a0d41ff82601..b930b65cd8c3b 100644
--- a/compiler-rt/test/asan/TestCases/calloc-overflow.cpp
+++ b/compiler-rt/test/asan/TestCases/calloc-overflow.cpp
@@ -11,7 +11,7 @@ int main() {
   void *p = calloc(-1, 1000);
   // CHECK: {{ERROR: AddressSanitizer: calloc parameters overflow: count \* size \(.* \* 1000\) cannot be represented in type size_t}}
   // CHECK: {{#0 0x.* in .*calloc}}
-  // CHECK: {{#1 0x.* in main .*calloc-overflow.cpp:}}[[@LINE-3]]
+  // CHECK: {{#[1-3] 0x.* in main .*calloc-overflow.cpp:}}[[@LINE-3]]
   // CHECK: SUMMARY: AddressSanitizer: calloc-overflow
 
   printf("calloc returned: %zu\n", (size_t)p);
diff --git a/compiler-rt/test/asan/TestCases/deep_stack_uaf.cpp b/compiler-rt/test/asan/TestCases/deep_stack_uaf.cpp
index bdf0dbdb4a4bd..7d8a09220689f 100644
--- a/compiler-rt/test/asan/TestCases/deep_stack_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/deep_stack_uaf.cpp
@@ -1,7 +1,7 @@
 // Check that we can store lots of stack frames if asked to.
 
 // RUN: %clangxx_asan -O0 %s -o %t 2>&1
-// RUN: %env_asan_opts=malloc_context_size=120:redzone=512 not %run %t 2>&1 | FileCheck %s
+// RUN: %env_asan_opts=malloc_context_size=125:redzone=512 not %run %t 2>&1 | FileCheck %s
 // REQUIRES: stable-runtime
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/compiler-rt/test/asan/TestCases/double-free.cpp b/compiler-rt/test/asan/TestCases/double-free.cpp
index b01e41005a27a..7b61df0715afa 100644
--- a/compiler-rt/test/asan/TestCases/double-free.cpp
+++ b/compiler-rt/test/asan/TestCases/double-free.cpp
@@ -19,10 +19,10 @@ int main(int argc, char **argv) {
   free(x + argc - 1);  // BOOM
   // CHECK: AddressSanitizer: attempting double-free{{.*}}in thread T0
   // CHECK: #0 0x{{.*}} in {{.*}}free
-  // CHECK: #1 0x{{.*}} in main {{.*}}double-free.cpp:[[@LINE-3]]
+  // CHECK: #{{[1-3]}} 0x{{.*}} in main {{.*}}double-free.cpp:[[@LINE-3]]
   // CHECK: freed by thread T0 here:
   // MALLOC-CTX: #0 0x{{.*}} in {{.*}}free
-  // MALLOC-CTX: #1 0x{{.*}} in main {{.*}}double-free.cpp:[[@LINE-7]]
+  // MALLOC-CTX: #{{[1-3]}} 0x{{.*}} in main {{.*}}double-free.cpp:[[@LINE-7]]
   // CHECK: allocated by thread T0 here:
   // MALLOC-CTX: double-free.cpp:[[@LINE-12]]
   // CHECK-RECOVER: AddressSanitizer: attempting double-free{{.*}}in thread T0
diff --git a/compiler-rt/test/asan/TestCases/malloc-size-too-big.cpp b/compiler-rt/test/asan/TestCases/malloc-size-too-big.cpp
index 71d6a3e7148a3..771640a4ac08d 100644
--- a/compiler-rt/test/asan/TestCases/malloc-size-too-big.cpp
+++ b/compiler-rt/test/asan/TestCases/malloc-size-too-big.cpp
@@ -18,7 +18,7 @@ int main() {
   void *p = malloc(kMaxAllowedMallocSizePlusOne);
   // CHECK: {{ERROR: AddressSanitizer: requested allocation size .* \(.* after adjustments for alignment, red zones etc\.\) exceeds maximum supported size}}
   // CHECK: {{#0 0x.* in .*malloc}}
-  // CHECK: {{#1 0x.* in main .*malloc-size-too-big.cpp:}}[[@LINE-3]]
+  // CHECK: {{#[1-3] 0x.* in main .*malloc-size-too-big.cpp:}}[[@LINE-3]]
   // CHECK: SUMMARY: AddressSanitizer: allocation-size-too-big
 
   printf("malloc returned: %zu\n", (size_t)p);
diff --git a/compiler-rt/test/asan/TestCases/strncpy-overflow.cpp b/compiler-rt/test/asan/TestCases/strncpy-overflow.cpp
index d852ccebd9a77..ff84052a94987 100644
--- a/compiler-rt/test/asan/TestCases/strncpy-overflow.cpp
+++ b/compiler-rt/test/asan/TestCases/strncpy-overflow.cpp
@@ -33,6 +33,6 @@ int main(int argc, char **argv) {
   // CHECK: {{0x.* is located 0 bytes after 9-byte region}}
   // CHECK: {{allocated by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*malloc}}
-  // CHECK: {{    #1 0x.* in main .*strncpy-overflow.cpp:}}[[@LINE-8]]
+  // CHECK: {{    #[1-3] 0x.* in main .*strncpy-overflow.cpp:}}[[@LINE-8]]
   return rval + sink_memory(9, short_buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/use-after-free-right.cpp b/compiler-rt/test/asan/TestCases/use-after-free-right.cpp
index 10dbe5b905684..11011e4b4fb1a 100644
--- a/compiler-rt/test/asan/TestCases/use-after-free-right.cpp
+++ b/compiler-rt/test/asan/TestCases/use-after-free-right.cpp
@@ -19,9 +19,9 @@ int main() {
   // CHECK: {{0x.* is located 0 bytes inside of 1-byte region .0x.*,0x.*}}
   // CHECK: {{freed by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*free}}
-  // CHECK: {{    #1 0x.* in main .*use-after-free-right.cpp:}}[[@LINE-9]]
+  // CHECK: {{    #[1-3] 0x.* in main .*use-after-free-right.cpp:}}[[@LINE-9]]
 
   // CHECK: {{previously allocated by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*malloc}}
-  // CHECK: {{    #1 0x.* in main .*use-after-free-right.cpp:}}[[@LINE-14]]
+  // CHECK: {{    #[1-3] 0x.* in main .*use-after-free-right.cpp:}}[[@LINE-14]]
 }
diff --git a/compiler-rt/test/asan/TestCases/use-after-free.cpp b/compiler-rt/test/asan/TestCases/use-after-free.cpp
index 4b20ee25500c0..f19c461960d36 100644
--- a/compiler-rt/test/asan/TestCases/use-after-free.cpp
+++ b/compiler-rt/test/asan/TestCases/use-after-free.cpp
@@ -16,11 +16,11 @@ int main() {
   // CHECK: {{0x.* is located 5 bytes inside of 10-byte region .0x.*,0x.*}}
   // CHECK: {{freed by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*free}}
-  // CHECK: {{    #1 0x.* in main .*use-after-free.cpp:}}[[@LINE-9]]
+  // CHECK: {{    #[1-3] 0x.* in main .*use-after-free.cpp:}}[[@LINE-9]]
 
   // CHECK: {{previously allocated by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*malloc}}
-  // CHECK: {{    #1 0x.* in main .*use-after-free.cpp:}}[[@LINE-14]]
+  // CHECK: {{    #[1-3] 0x.* in main .*use-after-free.cpp:}}[[@LINE-14]]
   // CHECK: Shadow byte legend (one shadow byte represents {{[0-9]+}} application bytes):
   // CHECK: Global redzone:
   // CHECK: ASan internal:
diff --git a/compiler-rt/test/asan_abi/CMakeLists.txt b/compiler-rt/test/asan_abi/CMakeLists.txt
index f28cf6cd1da6e..1114ed1b82793 100644
--- a/compiler-rt/test/asan_abi/CMakeLists.txt
+++ b/compiler-rt/test/asan_abi/CMakeLists.txt
@@ -9,10 +9,7 @@ macro(get_bits_for_arch arch bits)
   endif()
 endmacro()
 
-set(ASAN_ABI_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
-if(NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND ASAN_ABI_TEST_DEPS asan_abi)
-endif()
+set(ASAN_ABI_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} asan_abi)
 
 set(ASAN_ABI_TEST_ARCH ${ASAN_ABI_SUPPORTED_ARCH})
 if(APPLE)
diff --git a/compiler-rt/test/fuzzer/CMakeLists.txt b/compiler-rt/test/fuzzer/CMakeLists.txt
index f0ba087a1b326..5bcb44757bae4 100644
--- a/compiler-rt/test/fuzzer/CMakeLists.txt
+++ b/compiler-rt/test/fuzzer/CMakeLists.txt
@@ -1,12 +1,17 @@
-set(LIBFUZZER_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
+set(LIBFUZZER_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} fuzzer)
+if (COMPILER_RT_HAS_UBSAN)
+  list(APPEND LIBFUZZER_TEST_DEPS ubsan)
+endif()
+if (COMPILER_RT_HAS_ASAN)
+  list(APPEND LIBFUZZER_TEST_DEPS asan)
+endif()
+if (COMPILER_RT_HAS_MSAN)
+  list(APPEND LIBFUZZER_TEST_DEPS msan)
+endif()
+if (COMPILER_RT_HAS_DFSAN)
+  list(APPEND LIBFUZZER_TEST_DEPS dfsan)
+endif()
 if (NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND LIBFUZZER_TEST_DEPS fuzzer asan ubsan)
-  if (COMPILER_RT_HAS_MSAN)
-    list(APPEND LIBFUZZER_TEST_DEPS msan)
-  endif()
-  if (COMPILER_RT_HAS_DFSAN)
-    list(APPEND LIBFUZZER_TEST_DEPS dfsan)
-  endif()
   if(NOT APPLE AND COMPILER_RT_HAS_LLD AND "lld" IN_LIST LLVM_ENABLE_PROJECTS)
     list(APPEND LIBFUZZER_TEST_DEPS lld)
   endif()
diff --git a/compiler-rt/test/lsan/TestCases/recoverable_leak_check.cpp b/compiler-rt/test/lsan/TestCases/recoverable_leak_check.cpp
index 296f5a0a5915f..dbad784f85528 100644
--- a/compiler-rt/test/lsan/TestCases/recoverable_leak_check.cpp
+++ b/compiler-rt/test/lsan/TestCases/recoverable_leak_check.cpp
@@ -1,13 +1,10 @@
 // Test for on-demand leak checking.
 // RUN: %clangxx_lsan %s -o %t
-// RUN: %env_lsan_opts=use_stacks=0:use_registers=0 %run %t foo 2>&1 | FileCheck %s
-// RUN: %env_lsan_opts=use_stacks=0:use_registers=0 %run %t 2>&1 | FileCheck %s
+// RUN: %env_lsan_opts=use_stacks=0:use_registers=0:symbolize=0 %run %t foo 2>&1 | FileCheck %s
+// RUN: %env_lsan_opts=use_stacks=0:use_registers=0:symbolize=0 %run %t 2>&1 | FileCheck %s
 
 // UNSUPPORTED: darwin
 
-// FIXME: Investigate.
-// XFAIL: internal_symbolizer && lsan-standalone && i386-linux
-
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/compiler-rt/test/memprof/CMakeLists.txt b/compiler-rt/test/memprof/CMakeLists.txt
index 8a29919b17702..3f0ba3812485d 100644
--- a/compiler-rt/test/memprof/CMakeLists.txt
+++ b/compiler-rt/test/memprof/CMakeLists.txt
@@ -11,12 +11,9 @@ macro(get_bits_for_arch arch bits)
   endif()
 endmacro()
 
-set(MEMPROF_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
-if(NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND MEMPROF_TEST_DEPS memprof)
-  if(COMPILER_RT_HAS_LLD AND TARGET lld)
-    list(APPEND MEMPROF_TEST_DEPS lld)
-  endif()
+set(MEMPROF_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} memprof)
+if(NOT COMPILER_RT_STANDALONE_BUILD AND COMPILER_RT_HAS_LLD AND TARGET lld)
+  list(APPEND MEMPROF_TEST_DEPS lld)
 endif()
 set(MEMPROF_DYNAMIC_TEST_DEPS ${MEMPROF_TEST_DEPS})
 
diff --git a/compiler-rt/test/msan/CMakeLists.txt b/compiler-rt/test/msan/CMakeLists.txt
index 01b832b5ae3f9..9f784507c4ee1 100644
--- a/compiler-rt/test/msan/CMakeLists.txt
+++ b/compiler-rt/test/msan/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(MSAN_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
 set(MSAN_TESTSUITES)
-set(MSAN_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
+set(MSAN_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} msan)
 
 set(MSAN_TEST_ARCH ${MSAN_SUPPORTED_ARCH})
 if(APPLE)
@@ -41,10 +41,6 @@ foreach(arch ${MSAN_TEST_ARCH})
   endif()
 endforeach()
 
-if(NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND MSAN_TEST_DEPS msan)
-endif()
-
 if(COMPILER_RT_INCLUDE_TESTS AND
    COMPILER_RT_LIBCXX_PATH AND
    COMPILER_RT_LIBCXXABI_PATH)
diff --git a/compiler-rt/test/sanitizer_common/TestCases/corelimit.cpp b/compiler-rt/test/sanitizer_common/TestCases/corelimit.cpp
index 2378a4cfdced1..9b56471905aaf 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/corelimit.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/corelimit.cpp
@@ -10,7 +10,9 @@ int main() {
   getrlimit(RLIMIT_CORE, &lim_core);
   void *p;
   if (sizeof(p) == 8) {
-    assert(0 == lim_core.rlim_cur);
+    // rlim_cur will be set to zero or one depending on the target OS and
+    // initial core limits. See comments in DisableCoreDumperIfNecessary().
+    assert(lim_core.rlim_cur <= 1u);
   }
   return 0;
 }
diff --git a/compiler-rt/test/tsan/signal_errno.cpp b/compiler-rt/test/tsan/signal_errno.cpp
index 7e1fd4b0c5a55..99d4b6d84ca4b 100644
--- a/compiler-rt/test/tsan/signal_errno.cpp
+++ b/compiler-rt/test/tsan/signal_errno.cpp
@@ -18,7 +18,7 @@ static void MyHandler(int, siginfo_t *s, void *c) {
 
 static void* sendsignal(void *p) {
   barrier_wait(&barrier);
-  pthread_kill(mainth, SIGPROF);
+  pthread_kill(mainth, SIGALRM);
   return 0;
 }
 
@@ -37,7 +37,7 @@ int main() {
   mainth = pthread_self();
   struct sigaction act = {};
   act.sa_sigaction = &MyHandler;
-  sigaction(SIGPROF, &act, 0);
+  sigaction(SIGALRM, &act, 0);
   pthread_t th;
   pthread_create(&th, 0, sendsignal, 0);
   loop();
@@ -46,7 +46,7 @@ int main() {
 }
 
 // CHECK: WARNING: ThreadSanitizer: signal handler spoils errno
-// CHECK:   Signal 27 handler invoked at:
+// CHECK:   Signal 14 handler invoked at:
 // CHECK:     #0 MyHandler(int, {{(__)?}}siginfo{{(_t)?}}*, void*) {{.*}}signal_errno.cpp
 // CHECK:     main
 // CHECK: SUMMARY: ThreadSanitizer: signal handler spoils errno{{.*}}MyHandler
diff --git a/compiler-rt/test/tsan/signal_reset.cpp b/compiler-rt/test/tsan/signal_reset.cpp
index 82758d882382f..d76b7e5f3b5f7 100644
--- a/compiler-rt/test/tsan/signal_reset.cpp
+++ b/compiler-rt/test/tsan/signal_reset.cpp
@@ -28,12 +28,12 @@ static void* reset(void *p) {
   struct sigaction act = {};
   for (int i = 0; i < 1000000; i++) {
     act.sa_handler = &handler;
-    if (sigaction(SIGPROF, &act, 0)) {
+    if (sigaction(SIGALRM, &act, 0)) {
       perror("sigaction");
       exit(1);
     }
     act.sa_handler = SIG_IGN;
-    if (sigaction(SIGPROF, &act, 0)) {
+    if (sigaction(SIGALRM, &act, 0)) {
       perror("sigaction");
       exit(1);
     }
@@ -44,7 +44,7 @@ static void* reset(void *p) {
 int main() {
   struct sigaction act = {};
   act.sa_handler = SIG_IGN;
-  if (sigaction(SIGPROF, &act, 0)) {
+  if (sigaction(SIGALRM, &act, 0)) {
     perror("sigaction");
     exit(1);
   }
@@ -53,7 +53,7 @@ int main() {
   t.it_value.tv_sec = 0;
   t.it_value.tv_usec = 10;
   t.it_interval = t.it_value;
-  if (setitimer(ITIMER_PROF, &t, 0)) {
+  if (setitimer(ITIMER_REAL, &t, 0)) {
     perror("setitimer");
     exit(1);
   }
diff --git a/compiler-rt/test/tsan/signal_sync.cpp b/compiler-rt/test/tsan/signal_sync.cpp
index b529a1859f52a..878b3f3b88b9f 100644
--- a/compiler-rt/test/tsan/signal_sync.cpp
+++ b/compiler-rt/test/tsan/signal_sync.cpp
@@ -30,7 +30,7 @@ int main() {
 
   struct sigaction act = {};
   act.sa_handler = &handler;
-  if (sigaction(SIGPROF, &act, 0)) {
+  if (sigaction(SIGVTALRM, &act, 0)) {
     perror("sigaction");
     exit(1);
   }
@@ -39,7 +39,7 @@ int main() {
   t.it_value.tv_sec = 0;
   t.it_value.tv_usec = 10;
   t.it_interval = t.it_value;
-  if (setitimer(ITIMER_PROF, &t, 0)) {
+  if (setitimer(ITIMER_VIRTUAL, &t, 0)) {
     perror("setitimer");
     exit(1);
   }
diff --git a/compiler-rt/test/tsan/signal_thread.cpp b/compiler-rt/test/tsan/signal_thread.cpp
index aa91d1ddeb101..7bba8159bf38f 100644
--- a/compiler-rt/test/tsan/signal_thread.cpp
+++ b/compiler-rt/test/tsan/signal_thread.cpp
@@ -24,7 +24,7 @@ static void* thr(void *p) {
 int main() {
   struct sigaction act = {};
   act.sa_handler = &handler;
-  if (sigaction(SIGPROF, &act, 0)) {
+  if (sigaction(SIGVTALRM, &act, 0)) {
     perror("sigaction");
     exit(1);
   }
@@ -33,7 +33,7 @@ int main() {
   t.it_value.tv_sec = 0;
   t.it_value.tv_usec = 10;
   t.it_interval = t.it_value;
-  if (setitimer(ITIMER_PROF, &t, 0)) {
+  if (setitimer(ITIMER_VIRTUAL, &t, 0)) {
     perror("setitimer");
     exit(1);
   }
diff --git a/compiler-rt/test/tsan/signal_thread2.cpp b/compiler-rt/test/tsan/signal_thread2.cpp
index 9bde4f70b39d8..5236628e13b60 100644
--- a/compiler-rt/test/tsan/signal_thread2.cpp
+++ b/compiler-rt/test/tsan/signal_thread2.cpp
@@ -40,7 +40,7 @@ static void *thr(void *p) {
 int main() {
   struct sigaction act = {};
   act.sa_handler = &handler;
-  if (sigaction(SIGPROF, &act, 0)) {
+  if (sigaction(SIGALRM, &act, 0)) {
     perror("sigaction");
     exit(1);
   }
@@ -49,7 +49,7 @@ int main() {
   t.it_value.tv_sec = 0;
   t.it_value.tv_usec = 10;
   t.it_interval = t.it_value;
-  if (setitimer(ITIMER_PROF, &t, 0)) {
+  if (setitimer(ITIMER_REAL, &t, 0)) {
     perror("setitimer");
     exit(1);
   }
diff --git a/compiler-rt/test/ubsan/CMakeLists.txt b/compiler-rt/test/ubsan/CMakeLists.txt
index b5040f79e607b..52052c80960c2 100644
--- a/compiler-rt/test/ubsan/CMakeLists.txt
+++ b/compiler-rt/test/ubsan/CMakeLists.txt
@@ -23,9 +23,7 @@ macro(add_ubsan_testsuite test_mode sanitizer arch lld thinlto)
     ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py)
   list(APPEND UBSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
-  if(NOT COMPILER_RT_STANDALONE_BUILD)
-    list(APPEND UBSAN_TEST_DEPS ${sanitizer})
-  endif()
+  list(APPEND UBSAN_TEST_DEPS ${sanitizer})
 endmacro()
 
 macro(add_ubsan_testsuites test_mode sanitizer arch)
diff --git a/compiler-rt/test/xray/CMakeLists.txt b/compiler-rt/test/xray/CMakeLists.txt
index b97659ae08f9b..0c008b6ea5577 100644
--- a/compiler-rt/test/xray/CMakeLists.txt
+++ b/compiler-rt/test/xray/CMakeLists.txt
@@ -3,12 +3,11 @@ set(XRAY_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(XRAY_TESTSUITES)
 set(XRAY_FDR_TESTSUITES)
 
-set(XRAY_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
-set(XRAY_FDR_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
+set(XRAY_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} xray)
 
 if(NOT COMPILER_RT_STANDALONE_BUILD AND COMPILER_RT_BUILD_XRAY AND
    COMPILER_RT_HAS_XRAY)
-  list(APPEND XRAY_TEST_DEPS xray llvm-xray)
+  list(APPEND XRAY_TEST_DEPS llvm-xray)
 endif()
 
 set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH})
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py
index b251f1a0538ce..1b0d4d5871cbe 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py
@@ -28,6 +28,7 @@
 from dex.debugger.visualstudio.VisualStudio2015 import VisualStudio2015
 from dex.debugger.visualstudio.VisualStudio2017 import VisualStudio2017
 from dex.debugger.visualstudio.VisualStudio2019 import VisualStudio2019
+from dex.debugger.visualstudio.VisualStudio2022 import VisualStudio2022
 
 
 def _get_potential_debuggers():  # noqa
@@ -41,6 +42,7 @@ def _get_potential_debuggers():  # noqa
         VisualStudio2015.get_option_name(): VisualStudio2015,
         VisualStudio2017.get_option_name(): VisualStudio2017,
         VisualStudio2019.get_option_name(): VisualStudio2019,
+        VisualStudio2022.get_option_name(): VisualStudio2022,
     }
 
 
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio2022.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio2022.py
new file mode 100644
index 0000000000000..6fcf8af4acabc
--- /dev/null
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio2022.py
@@ -0,0 +1,23 @@
+# DExTer : Debugging Experience Tester
+# ~~~~~~   ~         ~~         ~   ~~
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Specializations for the Visual Studio 2022 interface."""
+
+from dex.debugger.visualstudio.VisualStudio import VisualStudio
+
+
+class VisualStudio2022(VisualStudio):
+    @classmethod
+    def get_name(cls):
+        return "Visual Studio 2022"
+
+    @classmethod
+    def get_option_name(cls):
+        return "vs2022"
+
+    @property
+    def _dte_version(self):
+        return "VisualStudio.DTE.17.0"
diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/list-debuggers/list-debuggers.test b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/list-debuggers/list-debuggers.test
index bbc9dd501b004..2bce540ced181 100644
--- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/list-debuggers/list-debuggers.test
+++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/list-debuggers/list-debuggers.test
@@ -5,3 +5,5 @@ RUN: %dexter_base list-debuggers | FileCheck %s
 CHECK: lldb
 CHECK: vs2015
 CHECK: vs2017
+CHECK: vs2019
+CHECK: vs2022
diff --git a/flang/docs/RuntimeEnvironment.md b/flang/docs/RuntimeEnvironment.md
new file mode 100644
index 0000000000000..c7a3dfbb2af1d
--- /dev/null
+++ b/flang/docs/RuntimeEnvironment.md
@@ -0,0 +1,57 @@
+<!--===- docs/RuntimeEnvironment.md 
+  
+   Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+   See https://llvm.org/LICENSE.txt for license information.
+   SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+  
+-->
+
+```{contents}
+---
+local:
+---
+```
+
+# Environment variables of significance to Fortran execution
+
+A few environment variables are queried by the Fortran runtime support
+library.
+
+The following environment variables can affect the behavior of
+Fortran programs during execution.
+
+## `DEFAULT_UTF8=1`
+
+Set `DEFAULT_UTF8` to cause formatted external input to assume UTF-8
+encoding on input and use UTF-8 encoding on formatted external output.
+
+## `FORT_CONVERT`
+
+Determines data conversions applied to unformatted I/O.
+
+* `NATIVE`: no conversions (default)
+* `LITTLE_ENDIAN`: assume input is little-endian; emit little-endian output
+* `BIG_ENDIAN`: assume input is big-endian; emit big-endian output
+* `SWAP`: reverse endianness (always convert)
+
+## `FORT_CHECK_POINTER_DEALLOCATION`
+
+Fortran requires that a pointer that appears in a `DEALLOCATE` statement
+must have been allocated in an `ALLOCATE` statement with the same declared
+type.
+The runtime support library validates this requirement by checking the
+size of the allocated data, and will fail with an error message if
+the deallocated pointer is not valid.
+Set `FORT_CHECK_POINTER_DEALLOCATION=0` to disable this check.
+
+## `FORT_FMT_RECL`
+
+Set to an integer value to specify the record length for list-directed
+and `NAMELIST` output.
+The default is 72.
+
+## `NO_STOP_MESSAGE`
+
+Set `NO_STOP_MESSAGE=1` to disable the extra information about
+IEEE floating-point exception flags that the Fortran language
+standard requires for `STOP` and `ERROR STOP` statements.
diff --git a/flang/docs/index.md b/flang/docs/index.md
index b4dbdc87fdf68..ed749f565ff1b 100644
--- a/flang/docs/index.md
+++ b/flang/docs/index.md
@@ -80,6 +80,7 @@ on how to get in touch with us and to learn more about the current status.
    Preprocessing
    ProcedurePointer
    RuntimeDescriptor
+   RuntimeEnvironment
    RuntimeTypeInfo
    Semantics
    f2018-grammar.md
diff --git a/flang/include/flang/Common/optional.h b/flang/include/flang/Common/optional.h
new file mode 100644
index 0000000000000..b5623b84dbd36
--- /dev/null
+++ b/flang/include/flang/Common/optional.h
@@ -0,0 +1,243 @@
+//===-- include/flang/Common/optional.h -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of std::optional borrowed from LLVM's
+// libc/src/__support/CPP/optional.h with modifications (e.g. value_or, emplace
+// methods were added).
+//
+// The implementation defines optional in Fortran::common namespace.
+// This standalone implementation may be used if the target
+// does not support std::optional implementation (e.g. CUDA device env),
+// otherwise, Fortran::common::optional is an alias for std::optional.
+//
+// TODO: using libcu++ is the best option for CUDA, but there is a couple
+// of issues:
+//   * Older CUDA toolkits' libcu++ implementations do not support optional.
+//   * The include paths need to be set up such that all STD header files
+//     are taken from libcu++.
+//   * cuda:: namespace need to be forced for all std:: references.
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_COMMON_OPTIONAL_H
+#define FORTRAN_COMMON_OPTIONAL_H
+
+#include "flang/Runtime/api-attrs.h"
+#include <optional>
+#include <type_traits>
+
+#if !defined(STD_OPTIONAL_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_OPTIONAL_UNSUPPORTED 1
+#endif
+
+#define FORTRAN_OPTIONAL_INLINE_WITH_ATTRS inline RT_API_ATTRS
+#define FORTRAN_OPTIONAL_INLINE inline
+#define FORTRAN_OPTIONAL_INLINE_VAR inline
+
+namespace Fortran::common {
+
+#if STD_OPTIONAL_UNSUPPORTED
+// Trivial nullopt_t struct.
+struct nullopt_t {
+  constexpr explicit nullopt_t() = default;
+};
+
+// nullopt that can be used and returned.
+FORTRAN_OPTIONAL_INLINE_VAR constexpr nullopt_t nullopt{};
+
+// This is very simple implementation of the std::optional class. It makes
+// several assumptions that the underlying type is trivially constructible,
+// copyable, or movable.
+template <typename T> class optional {
+  template <typename U, bool = !std::is_trivially_destructible<U>::value>
+  struct OptionalStorage {
+    union {
+      char empty;
+      U stored_value;
+    };
+
+    bool in_use = false;
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS ~OptionalStorage() { reset(); }
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr OptionalStorage() : empty() {}
+
+    template <typename... Args>
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr explicit OptionalStorage(
+        std::in_place_t, Args &&...args)
+        : stored_value(std::forward<Args>(args)...) {}
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr void reset() {
+      if (in_use)
+        stored_value.~U();
+      in_use = false;
+    }
+  };
+
+  // The only difference is that this type U doesn't have a nontrivial
+  // destructor.
+  template <typename U> struct OptionalStorage<U, false> {
+    union {
+      char empty;
+      U stored_value;
+    };
+
+    bool in_use = false;
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr OptionalStorage() : empty() {}
+
+    template <typename... Args>
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr explicit OptionalStorage(
+        std::in_place_t, Args &&...args)
+        : stored_value(std::forward<Args>(args)...) {}
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr void reset() {
+      in_use = false;
+    }
+  };
+
+  OptionalStorage<T> storage;
+
+public:
+  // The default methods do not use RT_API_ATTRS, which causes
+  // warnings in CUDA compilation of form:
+  //   __device__/__host__ annotation is ignored on a function .* that is
+  //   explicitly defaulted on its first declaration
+  FORTRAN_OPTIONAL_INLINE constexpr optional() = default;
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(nullopt_t) {}
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(const T &t)
+      : storage(std::in_place, t) {
+    storage.in_use = true;
+  }
+  FORTRAN_OPTIONAL_INLINE constexpr optional(const optional &) = default;
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(T &&t)
+      : storage(std::in_place, std::move(t)) {
+    storage.in_use = true;
+  }
+  FORTRAN_OPTIONAL_INLINE constexpr optional(optional &&O) = default;
+
+  template <typename... ArgTypes>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(
+      std::in_place_t, ArgTypes &&...Args)
+      : storage(std::in_place, std::forward<ArgTypes>(Args)...) {
+    storage.in_use = true;
+  }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional &operator=(T &&t) {
+    storage.stored_value = std::move(t);
+    storage.in_use = true;
+    return *this;
+  }
+
+  FORTRAN_OPTIONAL_INLINE constexpr optional &operator=(optional &&) = default;
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional &operator=(const T &t) {
+    storage.stored_value = t;
+    storage.in_use = true;
+    return *this;
+  }
+
+  FORTRAN_OPTIONAL_INLINE constexpr optional &operator=(
+      const optional &) = default;
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr void reset() { storage.reset(); }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr const T &value() const & {
+    return storage.stored_value;
+  }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T &value() & {
+    return storage.stored_value;
+  }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr explicit operator bool() const {
+    return storage.in_use;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr bool has_value() const {
+    return storage.in_use;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr const T *operator->() const {
+    return &storage.stored_value;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T *operator->() {
+    return &storage.stored_value;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr const T &operator*() const & {
+    return storage.stored_value;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T &operator*() & {
+    return storage.stored_value;
+  }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T &&value() && {
+    return std::move(storage.stored_value);
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T &&operator*() && {
+    return std::move(storage.stored_value);
+  }
+
+  template <typename VT>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T value_or(
+      VT &&default_value) const & {
+    return storage.in_use ? storage.stored_value
+                          : static_cast<T>(std::forward<VT>(default_value));
+  }
+
+  template <typename VT>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T value_or(
+      VT &&default_value) && {
+    return storage.in_use ? std::move(storage.stored_value)
+                          : static_cast<T>(std::forward<VT>(default_value));
+  }
+
+  template <typename... ArgTypes>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS
+      std::enable_if_t<std::is_constructible_v<T, ArgTypes &&...>, T &>
+      emplace(ArgTypes &&...args) {
+    reset();
+    new (reinterpret_cast<void *>(std::addressof(storage.stored_value)))
+        T(std::forward<ArgTypes>(args)...);
+    storage.in_use = true;
+    return value();
+  }
+
+  template <typename U = T,
+      std::enable_if_t<(std::is_constructible_v<T, U &&> &&
+                           !std::is_same_v<std::decay_t<U>, std::in_place_t> &&
+                           !std::is_same_v<std::decay_t<U>, optional> &&
+                           std::is_convertible_v<U &&, T>),
+          bool> = true>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(U &&value) {
+    new (reinterpret_cast<void *>(std::addressof(storage.stored_value)))
+        T(std::forward<U>(value));
+    storage.in_use = true;
+  }
+
+  template <typename U = T,
+      std::enable_if_t<(std::is_constructible_v<T, U &&> &&
+                           !std::is_same_v<std::decay_t<U>, std::in_place_t> &&
+                           !std::is_same_v<std::decay_t<U>, optional> &&
+                           !std::is_convertible_v<U &&, T>),
+          bool> = false>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS explicit constexpr optional(U &&value) {
+    new (reinterpret_cast<void *>(std::addressof(storage.stored_value)))
+        T(std::forward<U>(value));
+    storage.in_use = true;
+  }
+};
+#else // !STD_OPTIONAL_UNSUPPORTED
+using std::nullopt;
+using std::nullopt_t;
+using std::optional;
+#endif // !STD_OPTIONAL_UNSUPPORTED
+
+} // namespace Fortran::common
+
+#endif // FORTRAN_COMMON_OPTIONAL_H
diff --git a/flang/include/flang/Common/reference-wrapper.h b/flang/include/flang/Common/reference-wrapper.h
new file mode 100644
index 0000000000000..66f924662d961
--- /dev/null
+++ b/flang/include/flang/Common/reference-wrapper.h
@@ -0,0 +1,114 @@
+//===-- include/flang/Common/reference-wrapper.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// clang-format off
+//
+// Implementation of std::reference_wrapper borrowed from libcu++
+// https://github.com/NVIDIA/libcudacxx/blob/f7e6cd07ed5ba826aeac0b742feafddfedc1e400/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h#L1
+// with modifications.
+//
+// The original source code is distributed under the Apache License v2.0
+// with LLVM Exceptions.
+//
+// TODO: using libcu++ is the best option for CUDA, but there is a couple
+// of issues:
+//   * The include paths need to be set up such that all STD header files
+//     are taken from libcu++.
+//   * cuda:: namespace need to be forced for all std:: references.
+//
+// clang-format on
+
+#ifndef FORTRAN_COMMON_REFERENCE_WRAPPER_H
+#define FORTRAN_COMMON_REFERENCE_WRAPPER_H
+
+#include "flang/Runtime/api-attrs.h"
+#include <functional>
+#include <type_traits>
+
+#if !defined(STD_REFERENCE_WRAPPER_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_REFERENCE_WRAPPER_UNSUPPORTED 1
+#endif
+
+namespace Fortran::common {
+
+template <class _Tp>
+using __remove_cvref_t = std::remove_cv_t<std::remove_reference_t<_Tp>>;
+template <class _Tp, class _Up>
+struct __is_same_uncvref
+    : std::is_same<__remove_cvref_t<_Tp>, __remove_cvref_t<_Up>> {};
+
+#if STD_REFERENCE_WRAPPER_UNSUPPORTED
+template <class _Tp> class reference_wrapper {
+public:
+  // types
+  typedef _Tp type;
+
+private:
+  type *__f_;
+
+  static RT_API_ATTRS void __fun(_Tp &);
+  static void __fun(_Tp &&) = delete;
+
+public:
+  template <class _Up,
+      class =
+          std::enable_if_t<!__is_same_uncvref<_Up, reference_wrapper>::value,
+              decltype(__fun(std::declval<_Up>()))>>
+  constexpr RT_API_ATTRS reference_wrapper(_Up &&__u) {
+    type &__f = static_cast<_Up &&>(__u);
+    __f_ = std::addressof(__f);
+  }
+
+  // access
+  constexpr RT_API_ATTRS operator type &() const { return *__f_; }
+  constexpr RT_API_ATTRS type &get() const { return *__f_; }
+
+  // invoke
+  template <class... _ArgTypes>
+  constexpr RT_API_ATTRS typename std::invoke_result_t<type &, _ArgTypes...>
+  operator()(_ArgTypes &&...__args) const {
+    return std::invoke(get(), std::forward<_ArgTypes>(__args)...);
+  }
+};
+
+template <class _Tp> reference_wrapper(_Tp &) -> reference_wrapper<_Tp>;
+
+template <class _Tp>
+inline constexpr RT_API_ATTRS reference_wrapper<_Tp> ref(_Tp &__t) {
+  return reference_wrapper<_Tp>(__t);
+}
+
+template <class _Tp>
+inline constexpr RT_API_ATTRS reference_wrapper<_Tp> ref(
+    reference_wrapper<_Tp> __t) {
+  return __t;
+}
+
+template <class _Tp>
+inline constexpr RT_API_ATTRS reference_wrapper<const _Tp> cref(
+    const _Tp &__t) {
+  return reference_wrapper<const _Tp>(__t);
+}
+
+template <class _Tp>
+inline constexpr RT_API_ATTRS reference_wrapper<const _Tp> cref(
+    reference_wrapper<_Tp> __t) {
+  return __t;
+}
+
+template <class _Tp> void ref(const _Tp &&) = delete;
+template <class _Tp> void cref(const _Tp &&) = delete;
+#else // !STD_REFERENCE_WRAPPER_UNSUPPORTED
+using std::cref;
+using std::ref;
+using std::reference_wrapper;
+#endif // !STD_REFERENCE_WRAPPER_UNSUPPORTED
+
+} // namespace Fortran::common
+
+#endif // FORTRAN_COMMON_REFERENCE_WRAPPER_H
diff --git a/flang/include/flang/Common/visit.h b/flang/include/flang/Common/visit.h
index d3136be3f6a1f..f733b726189c8 100644
--- a/flang/include/flang/Common/visit.h
+++ b/flang/include/flang/Common/visit.h
@@ -17,27 +17,6 @@
 //
 // Define FLANG_USE_STD_VISIT to avoid this code and make common::visit() an
 // alias for ::std::visit().
-//
-//
-// With GCC 9.3.0 on a Haswell x86 Ubuntu system, doing out-of-tree builds:
-// Before:
-//  build:
-//   6948.53user 212.48system 27:32.92elapsed 433%CPU
-//     (0avgtext+0avgdata 6429568maxresident)k
-//   36181912inputs+8943720outputs (3613684major+97908699minor)pagefaults 0swaps
-//  execution of tests:
-//   205.99user 26.05system 1:08.87elapsed 336%CPU
-//     (0avgtext+0avgdata 2671452maxresident)k
-//   244432inputs+355464outputs (422major+8746468minor)pagefaults 0swaps
-// After:
-//  build:
-//   6651.91user 182.57system 25:15.73elapsed 450%CPU
-//     (0avgtext+0avgdata 6209296maxresident)k
-//   17413480inputs+6376360outputs (1567210major+93068230minor)pagefaults 0swaps
-//  execution of tests:
-//   201.42user 25.91system 1:04.68elapsed 351%CPU
-//     (0avgtext+0avgdata 2661424maxresident)k
-//   238840inputs+295912outputs (428major+8489300minor)pagefaults 0swaps
 
 #ifndef FORTRAN_COMMON_VISIT_H_
 #define FORTRAN_COMMON_VISIT_H_
@@ -52,7 +31,22 @@ template <std::size_t LOW, std::size_t HIGH, typename RESULT, typename VISITOR,
     typename... VARIANT>
 inline RESULT Log2VisitHelper(
     VISITOR &&visitor, std::size_t which, VARIANT &&...u) {
-  if constexpr (LOW == HIGH) {
+  if constexpr (LOW + 7 >= HIGH) {
+    switch (which - LOW) {
+#define VISIT_CASE_N(N) \
+  case N: \
+    if constexpr (LOW + N <= HIGH) { \
+      return visitor(std::get<(LOW + N)>(std::forward<VARIANT>(u))...); \
+    }
+      VISIT_CASE_N(1)
+      VISIT_CASE_N(2)
+      VISIT_CASE_N(3)
+      VISIT_CASE_N(4)
+      VISIT_CASE_N(5)
+      VISIT_CASE_N(6)
+      VISIT_CASE_N(7)
+#undef VISIT_CASE_N
+    }
     return visitor(std::get<LOW>(std::forward<VARIANT>(u))...);
   } else {
     static constexpr std::size_t mid{(HIGH + LOW) / 2};
diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index f2f37866ecde8..82c31c0c40430 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -177,6 +177,14 @@ class TypeAndShape {
   int corank() const { return corank_; }
 
   int Rank() const { return GetRank(shape_); }
+
+  // Can sequence association apply to this argument?
+  bool CanBeSequenceAssociated() const {
+    constexpr Attrs notAssumedOrExplicitShape{
+        ~Attrs{Attr::AssumedSize, Attr::Coarray}};
+    return Rank() > 0 && (attrs() & notAssumedOrExplicitShape).none();
+  }
+
   bool IsCompatibleWith(parser::ContextualMessages &, const TypeAndShape &that,
       const char *thisIs = "pointer", const char *thatIs = "target",
       bool omitShapeConformanceCheck = false,
diff --git a/flang/include/flang/Evaluate/constant.h b/flang/include/flang/Evaluate/constant.h
index ee83d9fc04f3b..71be7906d2fe2 100644
--- a/flang/include/flang/Evaluate/constant.h
+++ b/flang/include/flang/Evaluate/constant.h
@@ -186,6 +186,8 @@ class Constant<Type<TypeCategory::Character, KIND>> : public ConstantBounds {
 
   const Scalar<Result> &values() const { return values_; }
   ConstantSubscript LEN() const { return length_; }
+  bool wasHollerith() const { return wasHollerith_; }
+  void set_wasHollerith(bool yes = true) { wasHollerith_ = yes; }
 
   std::optional<Scalar<Result>> GetScalarValue() const {
     if (Rank() == 0) {
@@ -210,6 +212,7 @@ class Constant<Type<TypeCategory::Character, KIND>> : public ConstantBounds {
 private:
   Scalar<Result> values_; // one contiguous string
   ConstantSubscript length_;
+  bool wasHollerith_{false};
 };
 
 class StructureConstructor;
diff --git a/flang/include/flang/Evaluate/real.h b/flang/include/flang/Evaluate/real.h
index d0da9634651f3..b7af0ff6b431c 100644
--- a/flang/include/flang/Evaluate/real.h
+++ b/flang/include/flang/Evaluate/real.h
@@ -221,20 +221,33 @@ class Real : public common::RealDetails<PREC> {
     // Normalize a fraction with just its LSB set and then multiply.
     // (Set the LSB, not the MSB, in case the scale factor needs to
     //  be subnormal.)
-    auto adjust{exponentBias + binaryPrecision - 1};
+    constexpr auto adjust{exponentBias + binaryPrecision - 1};
+    constexpr auto maxCoeffExpo{maxExponent + binaryPrecision - 1};
     auto expo{adjust + by.ToInt64()};
-    Real twoPow;
     RealFlags flags;
     int rMask{1};
     if (IsZero()) {
       expo = exponentBias; // ignore by, don't overflow
-    } else if (by > INT{maxExponent}) {
-      expo = maxExponent + binaryPrecision - 1;
-    } else if (by < INT{-adjust}) { // underflow
-      expo = 0;
-      rMask = 0;
-      flags.set(RealFlag::Underflow);
+    } else if (expo > maxCoeffExpo) {
+      if (Exponent() < exponentBias) {
+        // Must implement with two multiplications
+        return SCALE(INT{exponentBias})
+            .value.SCALE(by.SubtractSigned(INT{exponentBias}).value, rounding);
+      } else { // overflow
+        expo = maxCoeffExpo;
+      }
+    } else if (expo < 0) {
+      if (Exponent() > exponentBias) {
+        // Must implement with two multiplications
+        return SCALE(INT{-exponentBias})
+            .value.SCALE(by.AddSigned(INT{exponentBias}).value, rounding);
+      } else { // underflow to zero
+        expo = 0;
+        rMask = 0;
+        flags.set(RealFlag::Underflow);
+      }
     }
+    Real twoPow;
     flags |=
         twoPow.Normalize(false, static_cast<int>(expo), Fraction::MASKR(rMask));
     ValueWithRealFlags<Real> result{Multiply(twoPow, rounding)};
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 53896072675ab..9a32062440abc 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -430,6 +430,28 @@ template <typename A> std::optional<CoarrayRef> ExtractCoarrayRef(const A &x) {
   }
 }
 
+struct ExtractSubstringHelper {
+  template <typename T> static std::optional<Substring> visit(T &&) {
+    return std::nullopt;
+  }
+
+  static std::optional<Substring> visit(const Substring &e) { return e; }
+
+  template <typename T>
+  static std::optional<Substring> visit(const Designator<T> &e) {
+    return std::visit([](auto &&s) { return visit(s); }, e.u);
+  }
+
+  template <typename T>
+  static std::optional<Substring> visit(const Expr<T> &e) {
+    return std::visit([](auto &&s) { return visit(s); }, e.u);
+  }
+};
+
+template <typename A> std::optional<Substring> ExtractSubstring(const A &x) {
+  return ExtractSubstringHelper::visit(x);
+}
+
 // If an expression is simply a whole symbol data designator,
 // extract and return that symbol, else null.
 template <typename A> const Symbol *UnwrapWholeSymbolDataRef(const A &x) {
@@ -1196,6 +1218,20 @@ bool CheckForCoindexedObject(parser::ContextualMessages &,
     const std::optional<ActualArgument> &, const std::string &procName,
     const std::string &argName);
 
+/// Check if any of the symbols part of the expression has a cuda data
+/// attribute.
+inline bool HasCUDAAttrs(const Expr<SomeType> &expr) {
+  for (const Symbol &sym : CollectSymbols(expr)) {
+    if (const auto *details =
+            sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+      if (details->cudaDataAttr()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 } // namespace Fortran::evaluate
 
 namespace Fortran::semantics {
diff --git a/flang/include/flang/ISO_Fortran_binding.h b/flang/include/flang/ISO_Fortran_binding.h
index 3f74a7e56f175..757d7f2b10cba 100644
--- a/flang/include/flang/ISO_Fortran_binding.h
+++ b/flang/include/flang/ISO_Fortran_binding.h
@@ -127,7 +127,7 @@ namespace cfi_internal {
 // because a struct cannot be empty.
 extern "C++" template <typename T> struct FlexibleArray : T {
   RT_API_ATTRS T &operator[](int index) { return *(this + index); }
-  const RT_API_ATTRS T &operator[](int index) const { return *(this + index); }
+  RT_API_ATTRS const T &operator[](int index) const { return *(this + index); }
   RT_API_ATTRS operator T *() { return this; }
   RT_API_ATTRS operator const T *() const { return this; }
 };
diff --git a/flang/include/flang/Lower/CallInterface.h b/flang/include/flang/Lower/CallInterface.h
index e77ac4e179ba8..80b0576425377 100644
--- a/flang/include/flang/Lower/CallInterface.h
+++ b/flang/include/flang/Lower/CallInterface.h
@@ -174,6 +174,12 @@ class CallInterface {
     /// May the dummy argument require INTENT(OUT) finalization
     /// on entry to the invoked procedure? Provides conservative answer.
     bool mayRequireIntentoutFinalization() const;
+    /// Is the dummy argument an explicit-shape or assumed-size array that
+    /// must be passed by descriptor? Sequence association imply the actual
+    /// argument shape/rank may differ with the dummy shape/rank (see F'2023
+    /// section 15.5.2.12), so care is needed when creating the descriptor
+    /// for the dummy argument.
+    bool isSequenceAssociatedDescriptor() const;
     /// How entity is passed by.
     PassEntityBy passBy;
     /// What is the entity (SymbolRef for callee/ActualArgument* for caller)
@@ -273,8 +279,6 @@ class CallerInterface : public CallInterface<CallerInterface> {
     actualInputs.resize(getNumFIRArguments());
   }
 
-  using ExprVisitor = std::function<void(evaluate::Expr<evaluate::SomeType>)>;
-
   /// CRTP callbacks
   bool hasAlternateReturns() const;
   std::string getMangledName() const;
@@ -312,12 +316,21 @@ class CallerInterface : public CallInterface<CallerInterface> {
   /// procedure.
   const Fortran::semantics::Symbol *getProcedureSymbol() const;
 
+  /// Return the dummy argument symbol if this is a call to a user
+  /// defined procedure with explicit interface. Returns nullptr if there
+  /// is no user defined explicit interface.
+  const Fortran::semantics::Symbol *
+  getDummySymbol(const PassedEntity &entity) const;
+
   /// Helpers to place the lowered arguments at the right place once they
   /// have been lowered.
   void placeInput(const PassedEntity &passedEntity, mlir::Value arg);
   void placeAddressAndLengthInput(const PassedEntity &passedEntity,
                                   mlir::Value addr, mlir::Value len);
 
+  /// Get lowered FIR argument given the Fortran argument.
+  mlir::Value getInput(const PassedEntity &passedEntity);
+
   /// If this is a call to a procedure pointer or dummy, returns the related
   /// procedure designator. Nullptr otherwise.
   const Fortran::evaluate::ProcedureDesignator *getIfIndirectCall() const;
@@ -333,13 +346,27 @@ class CallerInterface : public CallInterface<CallerInterface> {
   /// the result specification expressions (extents and lengths) ? If needed,
   /// this mapping must be done after argument lowering, and before the call
   /// itself.
-  bool mustMapInterfaceSymbols() const;
+  bool mustMapInterfaceSymbolsForResult() const;
+  /// Must the caller map function interface symbols in order to evaluate
+  /// the specification expressions of a given dummy argument?
+  bool mustMapInterfaceSymbolsForDummyArgument(const PassedEntity &) const;
+
+  /// Visitor for specification expression. Boolean indicate the specification
+  /// expression is for the last extent of an assumed size array.
+  using ExprVisitor =
+      std::function<void(evaluate::Expr<evaluate::SomeType>, bool)>;
 
   /// Walk the result non-deferred extent specification expressions.
-  void walkResultExtents(ExprVisitor) const;
+  void walkResultExtents(const ExprVisitor &) const;
 
   /// Walk the result non-deferred length specification expressions.
-  void walkResultLengths(ExprVisitor) const;
+  void walkResultLengths(const ExprVisitor &) const;
+  /// Walk non-deferred extent specification expressions of a dummy argument.
+  void walkDummyArgumentExtents(const PassedEntity &,
+                                const ExprVisitor &) const;
+  /// Walk non-deferred length specification expressions of a dummy argument.
+  void walkDummyArgumentLengths(const PassedEntity &,
+                                const ExprVisitor &) const;
 
   /// Get the mlir::Value that is passed as argument \p sym of the function
   /// being called. The arguments must have been placed before calling this
@@ -355,6 +382,9 @@ class CallerInterface : public CallInterface<CallerInterface> {
   /// returns the storage type.
   mlir::Type getResultStorageType() const;
 
+  /// Return FIR type of argument.
+  mlir::Type getDummyArgumentType(const PassedEntity &) const;
+
   // Copy of base implementation.
   static constexpr bool hasHostAssociated() { return false; }
   mlir::Type getHostAssociatedTy() const {
diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h
index b13bb412f0f3e..ab30e317d1d9d 100644
--- a/flang/include/flang/Lower/ConvertVariable.h
+++ b/flang/include/flang/Lower/ConvertVariable.h
@@ -93,9 +93,16 @@ void mapSymbolAttributes(AbstractConverter &, const semantics::SymbolRef &,
 /// Instantiate the variables that appear in the specification expressions
 /// of the result of a function call. The instantiated variables are added
 /// to \p symMap.
-void mapCallInterfaceSymbols(AbstractConverter &,
-                             const Fortran::lower::CallerInterface &caller,
-                             SymMap &symMap);
+void mapCallInterfaceSymbolsForResult(
+    AbstractConverter &, const Fortran::lower::CallerInterface &caller,
+    SymMap &symMap);
+
+/// Instantiate the variables that appear in the specification expressions
+/// of a dummy argument of a procedure call. The instantiated variables are
+/// added to \p symMap.
+void mapCallInterfaceSymbolsForDummyArgument(
+    AbstractConverter &, const Fortran::lower::CallerInterface &caller,
+    SymMap &symMap, const Fortran::semantics::Symbol &dummySymbol);
 
 // TODO: consider saving the initial expression symbol dependence analysis in
 // in the PFT variable and dealing with the dependent symbols instantiation in
diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def
index 9de69ac5c80f5..be080a4d29d73 100644
--- a/flang/include/flang/Lower/LoweringOptions.def
+++ b/flang/include/flang/Lower/LoweringOptions.def
@@ -24,9 +24,6 @@ LOWERINGOPT(Name, Bits, Default)
 /// If true, lower transpose without a runtime call.
 ENUM_LOWERINGOPT(OptimizeTranspose, unsigned, 1, 1)
 
-/// If true, enable polymorphic type lowering feature. On by default.
-ENUM_LOWERINGOPT(PolymorphicTypeImpl, unsigned, 1, 1)
-
 /// If true, lower to High level FIR before lowering to FIR. On by default.
 ENUM_LOWERINGOPT(LowerToHighLevelFIR, unsigned, 1, 1)
 
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index ca15b4bc34b29..6927488517e63 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -208,6 +208,9 @@ struct IntrinsicLibrary {
   void genCFProcPointer(llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
+  template <mlir::arith::CmpIPredicate pred>
+  fir::ExtendedValue genCPtrCompare(mlir::Type,
+                                    llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genCosd(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genDateAndTime(llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genDim(mlir::Type, llvm::ArrayRef<mlir::Value>);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
index c1a72478e224c..fec8c9906effe 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
@@ -30,6 +30,10 @@ mlir::Value genFraction(fir::FirOpBuilder &builder, mlir::Location loc,
 mlir::Value genMod(fir::FirOpBuilder &builder, mlir::Location loc,
                    mlir::Value a, mlir::Value p);
 
+/// Generate call to Modulo intrinsic runtime routine.
+mlir::Value genModulo(fir::FirOpBuilder &builder, mlir::Location loc,
+                      mlir::Value a, mlir::Value p);
+
 /// Generate call to Nearest intrinsic runtime routine.
 mlir::Value genNearest(fir::FirOpBuilder &builder, mlir::Location loc,
                        mlir::Value x, mlir::Value s);
diff --git a/flang/include/flang/Optimizer/CodeGen/CodeGen.h b/flang/include/flang/Optimizer/CodeGen/CodeGen.h
index cbf02ec391236..26097dabf56c4 100644
--- a/flang/include/flang/Optimizer/CodeGen/CodeGen.h
+++ b/flang/include/flang/Optimizer/CodeGen/CodeGen.h
@@ -87,6 +87,9 @@ void populateFIRToLLVMConversionPatterns(fir::LLVMTypeConverter &converter,
                                          mlir::RewritePatternSet &patterns,
                                          fir::FIRToLLVMPassOptions &options);
 
+/// Populate the pattern set with the PreCGRewrite patterns.
+void populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns);
+
 // declarative passes
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/CodeGen/CGPasses.h.inc"
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 65a86d25333b5..8a7e36e42457f 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2454,6 +2454,7 @@ def fir_CUDAKernelLaunch : fir_Op<"cuda_kernel_launch", [CallOpInterface,
     SymbolRefAttr:$callee,
     I32:$grid_x,
     I32:$grid_y,
+    I32:$grid_z,
     I32:$block_x,
     I32:$block_y,
     I32:$block_z,
@@ -2463,9 +2464,9 @@ def fir_CUDAKernelLaunch : fir_Op<"cuda_kernel_launch", [CallOpInterface,
   );
 
   let assemblyFormat = [{
-    $callee `<` `<` `<` $grid_x `,` $grid_y `,` $block_x `,` $block_y `,`
-        $block_z ( `,` $bytes^ ( `,` $stream^ )? )? `>` `>` `>`
-        `` `(` ( $args^ `:` type($args) )? `)` attr-dict
+    $callee `<` `<` `<` $grid_x `,` $grid_y `,` $grid_z `,`$block_x `,`
+        $block_y `,` $block_z ( `,` $bytes^ ( `,` $stream^ )? )? `>` `>` `>`
+        `` `(` $args `)` `:` `(` type($args) `)` attr-dict
   }];
 
   let extraClassDeclaration = [{
@@ -3130,6 +3131,16 @@ def fir_BoxOffsetOp : fir_Op<"box_offset", [NoMemoryEffect]> {
 def fir_CUDAKernelOp : fir_Op<"cuda_kernel", [AttrSizedOperandSegments,
     DeclareOpInterfaceMethods<LoopLikeOpInterface>]> {
 
+  let description = [{
+    Represent the CUDA Fortran kernel directive. The operation is a loop like
+    operation that represents the iteration range of the embedded loop nest.
+
+    When grid or block variadic operands are empty, a `*` only syntax was used
+    in the Fortran code.
+    If the `*` is mixed with values for either grid or block, these are
+    represented by a 0 constant value.
+  }];
+
   let arguments = (ins
     Variadic<I32>:$grid, // empty means `*`
     Variadic<I32>:$block, // empty means `*`
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 048008a8d80c7..b2c3d92909375 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -233,6 +233,7 @@ class ParseTreeDumper {
   NODE(parser, CriticalStmt)
   NODE(parser, CUDAAttributesStmt)
   NODE(parser, CUFKernelDoConstruct)
+  NODE(CUFKernelDoConstruct, StarOrExpr)
   NODE(CUFKernelDoConstruct, Directive)
   NODE(parser, CycleStmt)
   NODE(parser, DataComponentDefStmt)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index f7b72c3af0916..c96abfba491d4 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4297,16 +4297,18 @@ struct OpenACCConstruct {
 // CUF-kernel-do-construct ->
 //     !$CUF KERNEL DO [ (scalar-int-constant-expr) ] <<< grid, block [, stream]
 //     >>> do-construct
-// grid -> * | scalar-int-expr | ( scalar-int-expr-list )
-// block -> * | scalar-int-expr | ( scalar-int-expr-list )
+// star-or-expr -> * | scalar-int-expr
+// grid -> * | scalar-int-expr | ( star-or-expr-list )
+// block -> * | scalar-int-expr | ( star-or-expr-list )
 // stream -> 0, scalar-int-expr | STREAM = scalar-int-expr
 struct CUFKernelDoConstruct {
   TUPLE_CLASS_BOILERPLATE(CUFKernelDoConstruct);
+  WRAPPER_CLASS(StarOrExpr, std::optional<ScalarIntExpr>);
   struct Directive {
     TUPLE_CLASS_BOILERPLATE(Directive);
     CharBlock source;
-    std::tuple<std::optional<ScalarIntConstantExpr>, std::list<ScalarIntExpr>,
-        std::list<ScalarIntExpr>, std::optional<ScalarIntExpr>>
+    std::tuple<std::optional<ScalarIntConstantExpr>, std::list<StarOrExpr>,
+        std::list<StarOrExpr>, std::optional<ScalarIntExpr>>
         t;
   };
   std::tuple<Directive, std::optional<DoConstruct>> t;
diff --git a/flang/include/flang/Runtime/api-attrs.h b/flang/include/flang/Runtime/api-attrs.h
index 9c8a67ffc34a8..fc3eb42e1b73f 100644
--- a/flang/include/flang/Runtime/api-attrs.h
+++ b/flang/include/flang/Runtime/api-attrs.h
@@ -109,6 +109,18 @@
 #endif
 #endif /* !defined(RT_CONST_VAR_ATTRS) */
 
+/*
+ * RT_VAR_ATTRS is marking non-const/constexpr module scope variables
+ * referenced by Flang runtime.
+ */
+#ifndef RT_VAR_ATTRS
+#if (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define RT_VAR_ATTRS __device__
+#else
+#define RT_VAR_ATTRS
+#endif
+#endif /* !defined(RT_VAR_ATTRS) */
+
 /*
  * RT_DEVICE_COMPILATION is defined for any device compilation.
  * Note that it can only be used reliably with compilers that perform
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index 7ad548d6c72b4..96d56d9b43a62 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -102,7 +102,7 @@ class DescriptorAddendum {
       : derivedType_{dt}, len_{0} {}
   RT_API_ATTRS DescriptorAddendum &operator=(const DescriptorAddendum &);
 
-  const RT_API_ATTRS typeInfo::DerivedType *derivedType() const {
+  RT_API_ATTRS const typeInfo::DerivedType *derivedType() const {
     return derivedType_;
   }
   RT_API_ATTRS DescriptorAddendum &set_derivedType(
@@ -204,7 +204,7 @@ class Descriptor {
       ISO::CFI_attribute_t attribute = CFI_attribute_other);
 
   RT_API_ATTRS ISO::CFI_cdesc_t &raw() { return raw_; }
-  const RT_API_ATTRS ISO::CFI_cdesc_t &raw() const { return raw_; }
+  RT_API_ATTRS const ISO::CFI_cdesc_t &raw() const { return raw_; }
   RT_API_ATTRS std::size_t ElementBytes() const { return raw_.elem_len; }
   RT_API_ATTRS int rank() const { return raw_.rank; }
   RT_API_ATTRS TypeCode type() const { return TypeCode{raw_.type}; }
@@ -225,7 +225,7 @@ class Descriptor {
   RT_API_ATTRS Dimension &GetDimension(int dim) {
     return *reinterpret_cast<Dimension *>(&raw_.dim[dim]);
   }
-  const RT_API_ATTRS Dimension &GetDimension(int dim) const {
+  RT_API_ATTRS const Dimension &GetDimension(int dim) const {
     return *reinterpret_cast<const Dimension *>(&raw_.dim[dim]);
   }
 
@@ -345,7 +345,7 @@ class Descriptor {
       return nullptr;
     }
   }
-  const RT_API_ATTRS DescriptorAddendum *Addendum() const {
+  RT_API_ATTRS const DescriptorAddendum *Addendum() const {
     if (raw_.f18Addendum != 0) {
       return reinterpret_cast<const DescriptorAddendum *>(
           &GetDimension(rank()));
@@ -448,7 +448,7 @@ class alignas(Descriptor) StaticDescriptor {
   RT_API_ATTRS Descriptor &descriptor() {
     return *reinterpret_cast<Descriptor *>(storage_);
   }
-  const RT_API_ATTRS Descriptor &descriptor() const {
+  RT_API_ATTRS const Descriptor &descriptor() const {
     return *reinterpret_cast<const Descriptor *>(storage_);
   }
 
diff --git a/flang/include/flang/Runtime/type-code.h b/flang/include/flang/Runtime/type-code.h
index 3757840cfdef9..f7419249c2ba9 100644
--- a/flang/include/flang/Runtime/type-code.h
+++ b/flang/include/flang/Runtime/type-code.h
@@ -10,6 +10,7 @@
 #define FORTRAN_RUNTIME_TYPE_CODE_H_
 
 #include "flang/Common/Fortran.h"
+#include "flang/Common/optional.h"
 #include "flang/ISO_Fortran_binding_wrapper.h"
 #include <optional>
 #include <utility>
@@ -54,7 +55,7 @@ class TypeCode {
     return IsValid() && !IsDerived();
   }
 
-  RT_API_ATTRS std::optional<std::pair<TypeCategory, int>>
+  RT_API_ATTRS Fortran::common::optional<std::pair<TypeCategory, int>>
   GetCategoryAndKind() const;
 
   RT_API_ATTRS bool operator==(TypeCode that) const {
diff --git a/flang/include/flang/Semantics/module-dependences.h b/flang/include/flang/Semantics/module-dependences.h
index 29813a19a4b1b..3401d64c95938 100644
--- a/flang/include/flang/Semantics/module-dependences.h
+++ b/flang/include/flang/Semantics/module-dependences.h
@@ -23,9 +23,9 @@ class ModuleDependences {
   void AddDependence(
       std::string &&name, bool intrinsic, ModuleCheckSumType hash) {
     if (intrinsic) {
-      intrinsicMap_.emplace(std::move(name), hash);
+      intrinsicMap_.insert_or_assign(std::move(name), hash);
     } else {
-      nonIntrinsicMap_.emplace(std::move(name), hash);
+      nonIntrinsicMap_.insert_or_assign(std::move(name), hash);
     }
   }
   std::optional<ModuleCheckSumType> GetRequiredHash(
diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h
index a4f27b00152e2..91773ae3ea9a3 100644
--- a/flang/include/flang/Semantics/openmp-directive-sets.h
+++ b/flang/include/flang/Semantics/openmp-directive-sets.h
@@ -20,32 +20,27 @@ namespace llvm::omp {
 // Directive sets for single directives
 //===----------------------------------------------------------------------===//
 // - top<Directive>Set: The directive appears alone or as the first in a
-//   combined construct.
-// - all<Directive>Set: All standalone or combined uses of the directive.
+//   compound construct.
+// - all<Directive>Set: All standalone or compound uses of the directive.
 
-static const OmpDirectiveSet topParallelSet{
-    Directive::OMPD_parallel,
-    Directive::OMPD_parallel_do,
-    Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_parallel_sections,
-    Directive::OMPD_parallel_workshare,
-};
-
-static const OmpDirectiveSet allParallelSet{
+static const OmpDirectiveSet topDistributeSet{
+    Directive::OMPD_distribute,
     Directive::OMPD_distribute_parallel_do,
     Directive::OMPD_distribute_parallel_do_simd,
-    Directive::OMPD_parallel,
-    Directive::OMPD_parallel_do,
-    Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_parallel_sections,
-    Directive::OMPD_parallel_workshare,
-    Directive::OMPD_target_parallel,
-    Directive::OMPD_target_parallel_do,
-    Directive::OMPD_target_parallel_do_simd,
-    Directive::OMPD_target_teams_distribute_parallel_do,
-    Directive::OMPD_target_teams_distribute_parallel_do_simd,
-    Directive::OMPD_teams_distribute_parallel_do,
-    Directive::OMPD_teams_distribute_parallel_do_simd,
+    Directive::OMPD_distribute_simd,
+};
+
+static const OmpDirectiveSet allDistributeSet{
+    OmpDirectiveSet{
+        llvm::omp::OMPD_target_teams_distribute,
+        llvm::omp::OMPD_target_teams_distribute_parallel_do,
+        llvm::omp::OMPD_target_teams_distribute_parallel_do_simd,
+        llvm::omp::OMPD_target_teams_distribute_simd,
+        llvm::omp::OMPD_teams_distribute,
+        llvm::omp::OMPD_teams_distribute_parallel_do,
+        llvm::omp::OMPD_teams_distribute_parallel_do_simd,
+        llvm::omp::OMPD_teams_distribute_simd,
+    } | topDistributeSet,
 };
 
 static const OmpDirectiveSet topDoSet{
@@ -54,26 +49,69 @@ static const OmpDirectiveSet topDoSet{
 };
 
 static const OmpDirectiveSet allDoSet{
-    Directive::OMPD_distribute_parallel_do,
-    Directive::OMPD_distribute_parallel_do_simd,
+    OmpDirectiveSet{
+        Directive::OMPD_distribute_parallel_do,
+        Directive::OMPD_distribute_parallel_do_simd,
+        Directive::OMPD_parallel_do,
+        Directive::OMPD_parallel_do_simd,
+        Directive::OMPD_target_parallel_do,
+        Directive::OMPD_target_parallel_do_simd,
+        Directive::OMPD_target_teams_distribute_parallel_do,
+        Directive::OMPD_target_teams_distribute_parallel_do_simd,
+        Directive::OMPD_teams_distribute_parallel_do,
+        Directive::OMPD_teams_distribute_parallel_do_simd,
+    } | topDoSet,
+};
+
+static const OmpDirectiveSet topParallelSet{
+    Directive::OMPD_parallel,
     Directive::OMPD_parallel_do,
     Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_do,
-    Directive::OMPD_do_simd,
-    Directive::OMPD_target_parallel_do,
-    Directive::OMPD_target_parallel_do_simd,
-    Directive::OMPD_target_teams_distribute_parallel_do,
-    Directive::OMPD_target_teams_distribute_parallel_do_simd,
-    Directive::OMPD_teams_distribute_parallel_do,
-    Directive::OMPD_teams_distribute_parallel_do_simd,
+    Directive::OMPD_parallel_masked_taskloop,
+    Directive::OMPD_parallel_masked_taskloop_simd,
+    Directive::OMPD_parallel_master_taskloop,
+    Directive::OMPD_parallel_master_taskloop_simd,
+    Directive::OMPD_parallel_sections,
+    Directive::OMPD_parallel_workshare,
 };
 
-static const OmpDirectiveSet topTaskloopSet{
-    Directive::OMPD_taskloop,
-    Directive::OMPD_taskloop_simd,
+static const OmpDirectiveSet allParallelSet{
+    OmpDirectiveSet{
+        Directive::OMPD_distribute_parallel_do,
+        Directive::OMPD_distribute_parallel_do_simd,
+        Directive::OMPD_target_parallel,
+        Directive::OMPD_target_parallel_do,
+        Directive::OMPD_target_parallel_do_simd,
+        Directive::OMPD_target_teams_distribute_parallel_do,
+        Directive::OMPD_target_teams_distribute_parallel_do_simd,
+        Directive::OMPD_teams_distribute_parallel_do,
+        Directive::OMPD_teams_distribute_parallel_do_simd,
+    } | topParallelSet,
+};
+
+static const OmpDirectiveSet topSimdSet{
+    Directive::OMPD_simd,
 };
 
-static const OmpDirectiveSet allTaskloopSet{topTaskloopSet};
+static const OmpDirectiveSet allSimdSet{
+    OmpDirectiveSet{
+        Directive::OMPD_distribute_parallel_do_simd,
+        Directive::OMPD_distribute_simd,
+        Directive::OMPD_do_simd,
+        Directive::OMPD_masked_taskloop_simd,
+        Directive::OMPD_master_taskloop_simd,
+        Directive::OMPD_parallel_do_simd,
+        Directive::OMPD_parallel_masked_taskloop_simd,
+        Directive::OMPD_parallel_master_taskloop_simd,
+        Directive::OMPD_target_parallel_do_simd,
+        Directive::OMPD_target_simd,
+        Directive::OMPD_target_teams_distribute_parallel_do_simd,
+        Directive::OMPD_target_teams_distribute_simd,
+        Directive::OMPD_taskloop_simd,
+        Directive::OMPD_teams_distribute_parallel_do_simd,
+        Directive::OMPD_teams_distribute_simd,
+    } | topSimdSet,
+};
 
 static const OmpDirectiveSet topTargetSet{
     Directive::OMPD_target,
@@ -90,23 +128,22 @@ static const OmpDirectiveSet topTargetSet{
 
 static const OmpDirectiveSet allTargetSet{topTargetSet};
 
-static const OmpDirectiveSet topSimdSet{
-    Directive::OMPD_simd,
+static const OmpDirectiveSet topTaskloopSet{
+    Directive::OMPD_taskloop,
+    Directive::OMPD_taskloop_simd,
 };
 
-static const OmpDirectiveSet allSimdSet{
-    Directive::OMPD_distribute_parallel_do_simd,
-    Directive::OMPD_distribute_simd,
-    Directive::OMPD_do_simd,
-    Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_simd,
-    Directive::OMPD_target_parallel_do_simd,
-    Directive::OMPD_target_simd,
-    Directive::OMPD_target_teams_distribute_parallel_do_simd,
-    Directive::OMPD_target_teams_distribute_simd,
-    Directive::OMPD_taskloop_simd,
-    Directive::OMPD_teams_distribute_parallel_do_simd,
-    Directive::OMPD_teams_distribute_simd,
+static const OmpDirectiveSet allTaskloopSet{
+    OmpDirectiveSet{
+        Directive::OMPD_masked_taskloop,
+        Directive::OMPD_masked_taskloop_simd,
+        Directive::OMPD_master_taskloop,
+        Directive::OMPD_master_taskloop_simd,
+        Directive::OMPD_parallel_masked_taskloop,
+        Directive::OMPD_parallel_masked_taskloop_simd,
+        Directive::OMPD_parallel_master_taskloop,
+        Directive::OMPD_parallel_master_taskloop_simd,
+    } | topTaskloopSet,
 };
 
 static const OmpDirectiveSet topTeamsSet{
@@ -118,178 +155,134 @@ static const OmpDirectiveSet topTeamsSet{
 };
 
 static const OmpDirectiveSet allTeamsSet{
-    llvm::omp::OMPD_target_teams,
-    llvm::omp::OMPD_target_teams_distribute,
-    llvm::omp::OMPD_target_teams_distribute_parallel_do,
-    llvm::omp::OMPD_target_teams_distribute_parallel_do_simd,
-    llvm::omp::OMPD_target_teams_distribute_simd,
-    llvm::omp::OMPD_teams,
-    llvm::omp::OMPD_teams_distribute,
-    llvm::omp::OMPD_teams_distribute_parallel_do,
-    llvm::omp::OMPD_teams_distribute_parallel_do_simd,
-    llvm::omp::OMPD_teams_distribute_simd,
-};
-
-static const OmpDirectiveSet topDistributeSet{
-    Directive::OMPD_distribute,
-    Directive::OMPD_distribute_parallel_do,
-    Directive::OMPD_distribute_parallel_do_simd,
-    Directive::OMPD_distribute_simd,
-};
-
-static const OmpDirectiveSet allDistributeSet{
-    llvm::omp::OMPD_distribute,
-    llvm::omp::OMPD_distribute_parallel_do,
-    llvm::omp::OMPD_distribute_parallel_do_simd,
-    llvm::omp::OMPD_distribute_simd,
-    llvm::omp::OMPD_target_teams_distribute,
-    llvm::omp::OMPD_target_teams_distribute_parallel_do,
-    llvm::omp::OMPD_target_teams_distribute_parallel_do_simd,
-    llvm::omp::OMPD_target_teams_distribute_simd,
-    llvm::omp::OMPD_teams_distribute,
-    llvm::omp::OMPD_teams_distribute_parallel_do,
-    llvm::omp::OMPD_teams_distribute_parallel_do_simd,
-    llvm::omp::OMPD_teams_distribute_simd,
+    OmpDirectiveSet{
+        llvm::omp::OMPD_target_teams,
+        llvm::omp::OMPD_target_teams_distribute,
+        llvm::omp::OMPD_target_teams_distribute_parallel_do,
+        llvm::omp::OMPD_target_teams_distribute_parallel_do_simd,
+        llvm::omp::OMPD_target_teams_distribute_simd,
+    } | topTeamsSet,
 };
 
 //===----------------------------------------------------------------------===//
 // Directive sets for groups of multiple directives
 //===----------------------------------------------------------------------===//
 
+// Composite constructs
+static const OmpDirectiveSet allDistributeParallelDoSet{
+    allDistributeSet & allParallelSet & allDoSet};
+static const OmpDirectiveSet allDistributeParallelDoSimdSet{
+    allDistributeSet & allParallelSet & allDoSet & allSimdSet};
+static const OmpDirectiveSet allDistributeSimdSet{
+    allDistributeSet & allSimdSet};
 static const OmpDirectiveSet allDoSimdSet{allDoSet & allSimdSet};
+static const OmpDirectiveSet allTaskloopSimdSet{allTaskloopSet & allSimdSet};
 
-static const OmpDirectiveSet workShareSet{
-    OmpDirectiveSet{
-        Directive::OMPD_workshare,
-        Directive::OMPD_parallel_workshare,
-        Directive::OMPD_parallel_sections,
-        Directive::OMPD_sections,
-        Directive::OMPD_single,
-    } | allDoSet,
-};
-
-static const OmpDirectiveSet taskGeneratingSet{
-    OmpDirectiveSet{
-        Directive::OMPD_task,
-    } | allTaskloopSet,
-};
-
-static const OmpDirectiveSet nonPartialVarSet{
-    Directive::OMPD_allocate,
-    Directive::OMPD_allocators,
-    Directive::OMPD_threadprivate,
-    Directive::OMPD_declare_target,
+static const OmpDirectiveSet blockConstructSet{
+    Directive::OMPD_master,
+    Directive::OMPD_ordered,
+    Directive::OMPD_parallel,
+    Directive::OMPD_parallel_workshare,
+    Directive::OMPD_single,
+    Directive::OMPD_target,
+    Directive::OMPD_target_data,
+    Directive::OMPD_target_parallel,
+    Directive::OMPD_target_teams,
+    Directive::OMPD_task,
+    Directive::OMPD_taskgroup,
+    Directive::OMPD_teams,
+    Directive::OMPD_workshare,
 };
 
 static const OmpDirectiveSet loopConstructSet{
-    Directive::OMPD_distribute_parallel_do_simd,
+    Directive::OMPD_distribute,
     Directive::OMPD_distribute_parallel_do,
+    Directive::OMPD_distribute_parallel_do_simd,
     Directive::OMPD_distribute_simd,
-    Directive::OMPD_distribute,
-    Directive::OMPD_do_simd,
     Directive::OMPD_do,
-    Directive::OMPD_parallel_do_simd,
+    Directive::OMPD_do_simd,
+    Directive::OMPD_masked_taskloop,
+    Directive::OMPD_masked_taskloop_simd,
+    Directive::OMPD_master_taskloop,
+    Directive::OMPD_master_taskloop_simd,
     Directive::OMPD_parallel_do,
+    Directive::OMPD_parallel_do_simd,
+    Directive::OMPD_parallel_masked_taskloop,
+    Directive::OMPD_parallel_masked_taskloop_simd,
+    Directive::OMPD_parallel_master_taskloop,
+    Directive::OMPD_parallel_master_taskloop_simd,
     Directive::OMPD_simd,
-    Directive::OMPD_target_parallel_do_simd,
     Directive::OMPD_target_parallel_do,
+    Directive::OMPD_target_parallel_do_simd,
     Directive::OMPD_target_simd,
-    Directive::OMPD_target_teams_distribute_parallel_do_simd,
+    Directive::OMPD_target_teams_distribute,
     Directive::OMPD_target_teams_distribute_parallel_do,
+    Directive::OMPD_target_teams_distribute_parallel_do_simd,
     Directive::OMPD_target_teams_distribute_simd,
-    Directive::OMPD_target_teams_distribute,
-    Directive::OMPD_taskloop_simd,
     Directive::OMPD_taskloop,
-    Directive::OMPD_teams_distribute_parallel_do_simd,
+    Directive::OMPD_taskloop_simd,
+    Directive::OMPD_teams_distribute,
     Directive::OMPD_teams_distribute_parallel_do,
+    Directive::OMPD_teams_distribute_parallel_do_simd,
     Directive::OMPD_teams_distribute_simd,
-    Directive::OMPD_teams_distribute,
     Directive::OMPD_tile,
     Directive::OMPD_unroll,
 };
 
-static const OmpDirectiveSet blockConstructSet{
-    Directive::OMPD_master,
-    Directive::OMPD_ordered,
-    Directive::OMPD_parallel_workshare,
-    Directive::OMPD_parallel,
-    Directive::OMPD_single,
-    Directive::OMPD_target_data,
-    Directive::OMPD_target_parallel,
-    Directive::OMPD_target_teams,
-    Directive::OMPD_target,
-    Directive::OMPD_task,
-    Directive::OMPD_taskgroup,
-    Directive::OMPD_teams,
-    Directive::OMPD_workshare,
-};
-
-//===----------------------------------------------------------------------===//
-// Directive sets for allowed/not allowed nested directives
-//===----------------------------------------------------------------------===//
-
-static const OmpDirectiveSet nestedOrderedErrSet{
-    Directive::OMPD_critical,
-    Directive::OMPD_ordered,
-    Directive::OMPD_atomic,
-    Directive::OMPD_task,
-    Directive::OMPD_taskloop,
+static const OmpDirectiveSet nonPartialVarSet{
+    Directive::OMPD_allocate,
+    Directive::OMPD_allocators,
+    Directive::OMPD_threadprivate,
+    Directive::OMPD_declare_target,
 };
 
-static const OmpDirectiveSet nestedWorkshareErrSet{
+static const OmpDirectiveSet taskGeneratingSet{
     OmpDirectiveSet{
         Directive::OMPD_task,
-        Directive::OMPD_taskloop,
-        Directive::OMPD_critical,
-        Directive::OMPD_ordered,
-        Directive::OMPD_atomic,
-        Directive::OMPD_master,
-    } | workShareSet,
+    } | allTaskloopSet,
 };
 
-static const OmpDirectiveSet nestedMasterErrSet{
+static const OmpDirectiveSet workShareSet{
     OmpDirectiveSet{
-        Directive::OMPD_atomic,
-    } | taskGeneratingSet |
-        workShareSet,
+        Directive::OMPD_workshare,
+        Directive::OMPD_parallel_workshare,
+        Directive::OMPD_parallel_sections,
+        Directive::OMPD_sections,
+        Directive::OMPD_single,
+    } | allDoSet,
 };
 
+//===----------------------------------------------------------------------===//
+// Directive sets for allowed/not allowed nested directives
+//===----------------------------------------------------------------------===//
+
 static const OmpDirectiveSet nestedBarrierErrSet{
     OmpDirectiveSet{
-        Directive::OMPD_critical,
-        Directive::OMPD_ordered,
         Directive::OMPD_atomic,
+        Directive::OMPD_critical,
         Directive::OMPD_master,
+        Directive::OMPD_ordered,
     } | taskGeneratingSet |
         workShareSet,
 };
 
-static const OmpDirectiveSet nestedTeamsAllowedSet{
-    Directive::OMPD_parallel,
-    Directive::OMPD_parallel_do,
-    Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_parallel_master,
-    Directive::OMPD_parallel_master_taskloop,
-    Directive::OMPD_parallel_master_taskloop_simd,
-    Directive::OMPD_parallel_sections,
-    Directive::OMPD_parallel_workshare,
-    Directive::OMPD_distribute,
+static const OmpDirectiveSet nestedCancelDoAllowedSet{
     Directive::OMPD_distribute_parallel_do,
-    Directive::OMPD_distribute_parallel_do_simd,
-    Directive::OMPD_distribute_simd,
+    Directive::OMPD_do,
+    Directive::OMPD_parallel_do,
+    Directive::OMPD_target_parallel_do,
+    Directive::OMPD_target_teams_distribute_parallel_do,
+    Directive::OMPD_teams_distribute_parallel_do,
 };
 
-static const OmpDirectiveSet nestedOrderedParallelErrSet{
+static const OmpDirectiveSet nestedCancelParallelAllowedSet{
     Directive::OMPD_parallel,
     Directive::OMPD_target_parallel,
-    Directive::OMPD_parallel_sections,
-    Directive::OMPD_parallel_workshare,
 };
 
-static const OmpDirectiveSet nestedOrderedDoAllowedSet{
-    Directive::OMPD_do,
-    Directive::OMPD_parallel_do,
-    Directive::OMPD_target_parallel_do,
+static const OmpDirectiveSet nestedCancelSectionsAllowedSet{
+    Directive::OMPD_parallel_sections,
+    Directive::OMPD_sections,
 };
 
 static const OmpDirectiveSet nestedCancelTaskgroupAllowedSet{
@@ -297,29 +290,64 @@ static const OmpDirectiveSet nestedCancelTaskgroupAllowedSet{
     Directive::OMPD_taskloop,
 };
 
-static const OmpDirectiveSet nestedCancelSectionsAllowedSet{
-    Directive::OMPD_sections,
-    Directive::OMPD_parallel_sections,
+static const OmpDirectiveSet nestedMasterErrSet{
+    OmpDirectiveSet{
+        Directive::OMPD_atomic,
+    } | taskGeneratingSet |
+        workShareSet,
 };
 
-static const OmpDirectiveSet nestedCancelDoAllowedSet{
+static const OmpDirectiveSet nestedOrderedDoAllowedSet{
     Directive::OMPD_do,
-    Directive::OMPD_distribute_parallel_do,
     Directive::OMPD_parallel_do,
     Directive::OMPD_target_parallel_do,
-    Directive::OMPD_target_teams_distribute_parallel_do,
-    Directive::OMPD_teams_distribute_parallel_do,
 };
 
-static const OmpDirectiveSet nestedCancelParallelAllowedSet{
+static const OmpDirectiveSet nestedOrderedErrSet{
+    Directive::OMPD_atomic,
+    Directive::OMPD_critical,
+    Directive::OMPD_ordered,
+    Directive::OMPD_task,
+    Directive::OMPD_taskloop,
+};
+
+static const OmpDirectiveSet nestedOrderedParallelErrSet{
     Directive::OMPD_parallel,
+    Directive::OMPD_parallel_sections,
+    Directive::OMPD_parallel_workshare,
     Directive::OMPD_target_parallel,
 };
 
 static const OmpDirectiveSet nestedReduceWorkshareAllowedSet{
     Directive::OMPD_do,
-    Directive::OMPD_sections,
     Directive::OMPD_do_simd,
+    Directive::OMPD_sections,
+};
+
+static const OmpDirectiveSet nestedTeamsAllowedSet{
+    Directive::OMPD_distribute,
+    Directive::OMPD_distribute_parallel_do,
+    Directive::OMPD_distribute_parallel_do_simd,
+    Directive::OMPD_distribute_simd,
+    Directive::OMPD_parallel,
+    Directive::OMPD_parallel_do,
+    Directive::OMPD_parallel_do_simd,
+    Directive::OMPD_parallel_master,
+    Directive::OMPD_parallel_master_taskloop,
+    Directive::OMPD_parallel_master_taskloop_simd,
+    Directive::OMPD_parallel_sections,
+    Directive::OMPD_parallel_workshare,
+};
+
+static const OmpDirectiveSet nestedWorkshareErrSet{
+    OmpDirectiveSet{
+        Directive::OMPD_atomic,
+        Directive::OMPD_critical,
+        Directive::OMPD_master,
+        Directive::OMPD_ordered,
+        Directive::OMPD_task,
+        Directive::OMPD_taskloop,
+    } | workShareSet,
 };
 } // namespace llvm::omp
 
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index c3175a5d1a111..67153ffb3be9f 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -91,12 +91,15 @@ class ModuleDetails : public WithOmpDeclarative {
     return moduleFileHash_;
   }
   void set_moduleFileHash(ModuleCheckSumType x) { moduleFileHash_ = x; }
+  const Symbol *previous() const { return previous_; }
+  void set_previous(const Symbol *p) { previous_ = p; }
 
 private:
   bool isSubmodule_;
   bool isDefaultPrivate_{false};
   const Scope *scope_{nullptr};
   std::optional<ModuleCheckSumType> moduleFileHash_;
+  const Symbol *previous_{nullptr}; // same name, different module file hash
 };
 
 class MainProgramDetails : public WithOmpDeclarative {
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index df66e1adb5502..dc3cd6c894a2c 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -180,7 +180,8 @@ const Symbol *IsFinalizable(const Symbol &,
 const Symbol *IsFinalizable(const DerivedTypeSpec &,
     std::set<const DerivedTypeSpec *> * = nullptr,
     bool withImpureFinalizer = false, std::optional<int> rank = std::nullopt);
-const Symbol *HasImpureFinal(const Symbol &);
+const Symbol *HasImpureFinal(
+    const Symbol &, std::optional<int> rank = std::nullopt);
 // Is this type finalizable or does it contain any polymorphic allocatable
 // ultimate components?
 bool MayRequireFinalization(const DerivedTypeSpec &derived);
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 2e3fa1f6e6603..c830c7af2462c 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1191,11 +1191,6 @@ bool CompilerInvocation::createFromArgs(
     invoc.loweringOpts.setLowerToHighLevelFIR(false);
   }
 
-  if (args.hasArg(
-          clang::driver::options::OPT_flang_experimental_polymorphism)) {
-    invoc.loweringOpts.setPolymorphicTypeImpl(true);
-  }
-
   // -fno-ppc-native-vector-element-order
   if (args.hasArg(clang::driver::options::OPT_fno_ppc_native_vec_elem_order)) {
     invoc.loweringOpts.setNoPPCNativeVecElemOrder(true);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index a668ba4116faa..c3cb9ba6a47e3 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -748,7 +748,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
   void copyVar(mlir::Location loc, mlir::Value dst,
                mlir::Value src) override final {
-    copyVarHLFIR(loc, dst, src);
+    copyVarHLFIR(loc, Fortran::lower::SymbolBox::Intrinsic{dst},
+                 Fortran::lower::SymbolBox::Intrinsic{src});
   }
 
   void copyHostAssociateVar(
@@ -1009,10 +1010,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       // `omp.private`'s `alloc` block. If this is the case, we return this
       // `SymbolBox::Intrinsic` value.
       if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym))
-        return v.match(
-            [&](const Fortran::lower::SymbolBox::Intrinsic &)
-                -> Fortran::lower::SymbolBox { return v; },
-            [](const auto &) -> Fortran::lower::SymbolBox { return {}; });
+        return v;
 
       return {};
     }
@@ -1060,15 +1058,16 @@ class FirConverter : public Fortran::lower::AbstractConverter {
                const Fortran::lower::SymbolBox &rhs_sb) {
     mlir::Location loc = genLocation(sym.name());
     if (lowerToHighLevelFIR())
-      copyVarHLFIR(loc, lhs_sb.getAddr(), rhs_sb.getAddr());
+      copyVarHLFIR(loc, lhs_sb, rhs_sb);
     else
       copyVarFIR(loc, sym, lhs_sb, rhs_sb);
   }
 
-  void copyVarHLFIR(mlir::Location loc, mlir::Value dst, mlir::Value src) {
+  void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst,
+                    Fortran::lower::SymbolBox src) {
     assert(lowerToHighLevelFIR());
-    hlfir::Entity lhs{dst};
-    hlfir::Entity rhs{src};
+    hlfir::Entity lhs{dst.getAddr()};
+    hlfir::Entity rhs{src.getAddr()};
     // Temporary_lhs is set to true in hlfir.assign below to avoid user
     // assignment to be used and finalization to be called on the LHS.
     // This may or may not be correct but mimics the current behaviour
@@ -1082,7 +1081,22 @@ class FirConverter : public Fortran::lower::AbstractConverter {
           /*keepLhsLengthInAllocatableAssignment=*/false,
           /*temporary_lhs=*/true);
     };
-    if (lhs.isAllocatable()) {
+
+    bool isBoxAllocatable = dst.match(
+        [](const fir::MutableBoxValue &box) { return box.isAllocatable(); },
+        [](const fir::FortranVariableOpInterface &box) {
+          return fir::FortranVariableOpInterface(box).isAllocatable();
+        },
+        [](const auto &box) { return false; });
+
+    bool isBoxPointer = dst.match(
+        [](const fir::MutableBoxValue &box) { return box.isPointer(); },
+        [](const fir::FortranVariableOpInterface &box) {
+          return fir::FortranVariableOpInterface(box).isPointer();
+        },
+        [](const auto &box) { return false; });
+
+    if (isBoxAllocatable) {
       // Deep copy allocatable if it is allocated.
       // Note that when allocated, the RHS is already allocated with the LHS
       // shape for copy on entry in createHostAssociateVarClone.
@@ -1097,7 +1111,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
             copyData(lhs, rhs);
           })
           .end();
-    } else if (lhs.isPointer()) {
+    } else if (isBoxPointer) {
       // Set LHS target to the target of RHS (do not copy the RHS
       // target data into the LHS target storage).
       auto loadVal = builder->create<fir::LoadOp>(loc, rhs);
@@ -2508,19 +2522,51 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     if (nestedLoops > 1)
       n = builder->getIntegerAttr(builder->getI64Type(), nestedLoops);
 
-    const std::list<Fortran::parser::ScalarIntExpr> &grid = std::get<1>(dir.t);
-    const std::list<Fortran::parser::ScalarIntExpr> &block = std::get<2>(dir.t);
+    const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr> &grid =
+        std::get<1>(dir.t);
+    const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr> &block =
+        std::get<2>(dir.t);
     const std::optional<Fortran::parser::ScalarIntExpr> &stream =
         std::get<3>(dir.t);
 
+    auto isOnlyStars =
+        [&](const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr>
+                &list) -> bool {
+      for (const Fortran::parser::CUFKernelDoConstruct::StarOrExpr &expr :
+           list) {
+        if (expr.v)
+          return false;
+      }
+      return true;
+    };
+
+    mlir::Value zero =
+        builder->createIntegerConstant(loc, builder->getI32Type(), 0);
+
     llvm::SmallVector<mlir::Value> gridValues;
-    for (const Fortran::parser::ScalarIntExpr &expr : grid)
-      gridValues.push_back(fir::getBase(
-          genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx)));
+    if (!isOnlyStars(grid)) {
+      for (const Fortran::parser::CUFKernelDoConstruct::StarOrExpr &expr :
+           grid) {
+        if (expr.v) {
+          gridValues.push_back(fir::getBase(
+              genExprValue(*Fortran::semantics::GetExpr(*expr.v), stmtCtx)));
+        } else {
+          gridValues.push_back(zero);
+        }
+      }
+    }
     llvm::SmallVector<mlir::Value> blockValues;
-    for (const Fortran::parser::ScalarIntExpr &expr : block)
-      blockValues.push_back(fir::getBase(
-          genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx)));
+    if (!isOnlyStars(block)) {
+      for (const Fortran::parser::CUFKernelDoConstruct::StarOrExpr &expr :
+           block) {
+        if (expr.v) {
+          blockValues.push_back(fir::getBase(
+              genExprValue(*Fortran::semantics::GetExpr(*expr.v), stmtCtx)));
+        } else {
+          blockValues.push_back(zero);
+        }
+      }
+    }
     mlir::Value streamValue;
     if (stream)
       streamValue = fir::getBase(
@@ -3665,6 +3711,11 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
     mlir::Location loc = getCurrentLocation();
     fir::FirOpBuilder &builder = getFirOpBuilder();
+
+    if (Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
+        Fortran::evaluate::HasCUDAAttrs(assign.rhs))
+      TODO(loc, "Assignement with CUDA Fortran variables");
+
     // Gather some information about the assignment that will impact how it is
     // lowered.
     const bool isWholeAllocatableAssignment =
diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt
index 5577a60f1daea..f92d1a2bc7de1 100644
--- a/flang/lib/Lower/CMakeLists.txt
+++ b/flang/lib/Lower/CMakeLists.txt
@@ -25,6 +25,7 @@ add_flang_library(FortranLower
   Mangler.cpp
   OpenACC.cpp
   OpenMP/ClauseProcessor.cpp
+  OpenMP/Clauses.cpp
   OpenMP/DataSharingProcessor.cpp
   OpenMP/OpenMP.cpp
   OpenMP/ReductionProcessor.cpp
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 6b71aabf7fdc8..c65becc497459 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -310,21 +310,22 @@ bool Fortran::lower::CallerInterface::verifyActualInputs() const {
   return true;
 }
 
-void Fortran::lower::CallerInterface::walkResultLengths(
-    ExprVisitor visitor) const {
-  assert(characteristic && "characteristic was not computed");
-  const Fortran::evaluate::characteristics::FunctionResult &result =
-      characteristic->functionResult.value();
-  const Fortran::evaluate::characteristics::TypeAndShape *typeAndShape =
-      result.GetTypeAndShape();
-  assert(typeAndShape && "no result type");
-  Fortran::evaluate::DynamicType dynamicType = typeAndShape->type();
-  // Visit result length specification expressions that are explicit.
+mlir::Value
+Fortran::lower::CallerInterface::getInput(const PassedEntity &passedEntity) {
+  return actualInputs[passedEntity.firArgument];
+}
+
+static void walkLengths(
+    const Fortran::evaluate::characteristics::TypeAndShape &typeAndShape,
+    const Fortran::lower::CallerInterface::ExprVisitor &visitor,
+    Fortran::lower::AbstractConverter &converter) {
+  Fortran::evaluate::DynamicType dynamicType = typeAndShape.type();
+  // Visit length specification expressions that are explicit.
   if (dynamicType.category() == Fortran::common::TypeCategory::Character) {
     if (std::optional<Fortran::evaluate::ExtentExpr> length =
             dynamicType.GetCharLength())
-      visitor(toEvExpr(*length));
-  } else if (dynamicType.category() == common::TypeCategory::Derived &&
+      visitor(toEvExpr(*length), /*assumedSize=*/false);
+  } else if (dynamicType.category() == Fortran::common::TypeCategory::Derived &&
              !dynamicType.IsUnlimitedPolymorphic()) {
     const Fortran::semantics::DerivedTypeSpec &derivedTypeSpec =
         dynamicType.GetDerivedTypeSpec();
@@ -334,11 +335,33 @@ void Fortran::lower::CallerInterface::walkResultLengths(
   }
 }
 
+void Fortran::lower::CallerInterface::walkResultLengths(
+    const ExprVisitor &visitor) const {
+  assert(characteristic && "characteristic was not computed");
+  const Fortran::evaluate::characteristics::FunctionResult &result =
+      characteristic->functionResult.value();
+  const Fortran::evaluate::characteristics::TypeAndShape *typeAndShape =
+      result.GetTypeAndShape();
+  assert(typeAndShape && "no result type");
+  return walkLengths(*typeAndShape, visitor, converter);
+}
+
+void Fortran::lower::CallerInterface::walkDummyArgumentLengths(
+    const PassedEntity &passedEntity, const ExprVisitor &visitor) const {
+  if (!passedEntity.characteristics)
+    return;
+  if (const auto *dummy =
+          std::get_if<Fortran::evaluate::characteristics::DummyDataObject>(
+              &passedEntity.characteristics->u))
+    walkLengths(dummy->type, visitor, converter);
+}
+
 // Compute extent expr from shapeSpec of an explicit shape.
-// TODO: Allow evaluate shape analysis to work in a mode where it disregards
-// the non-constant aspects when building the shape to avoid having this here.
 static Fortran::evaluate::ExtentExpr
 getExtentExpr(const Fortran::semantics::ShapeSpec &shapeSpec) {
+  if (shapeSpec.ubound().isStar())
+    // F'2023 18.5.3 point 5.
+    return Fortran::evaluate::ExtentExpr{-1};
   const auto &ubound = shapeSpec.ubound().GetExplicit();
   const auto &lbound = shapeSpec.lbound().GetExplicit();
   assert(lbound && ubound && "shape must be explicit");
@@ -346,20 +369,27 @@ getExtentExpr(const Fortran::semantics::ShapeSpec &shapeSpec) {
          Fortran::evaluate::ExtentExpr{1};
 }
 
+static void
+walkExtents(const Fortran::semantics::Symbol &symbol,
+            const Fortran::lower::CallerInterface::ExprVisitor &visitor) {
+  if (const auto *objectDetails =
+          symbol.detailsIf<Fortran::semantics::ObjectEntityDetails>())
+    if (objectDetails->shape().IsExplicitShape() ||
+        Fortran::semantics::IsAssumedSizeArray(symbol))
+      for (const Fortran::semantics::ShapeSpec &shapeSpec :
+           objectDetails->shape())
+        visitor(Fortran::evaluate::AsGenericExpr(getExtentExpr(shapeSpec)),
+                /*assumedSize=*/shapeSpec.ubound().isStar());
+}
+
 void Fortran::lower::CallerInterface::walkResultExtents(
-    ExprVisitor visitor) const {
+    const ExprVisitor &visitor) const {
   // Walk directly the result symbol shape (the characteristic shape may contain
   // descriptor inquiries to it that would fail to lower on the caller side).
   const Fortran::semantics::SubprogramDetails *interfaceDetails =
       getInterfaceDetails();
   if (interfaceDetails) {
-    const Fortran::semantics::Symbol &result = interfaceDetails->result();
-    if (const auto *objectDetails =
-            result.detailsIf<Fortran::semantics::ObjectEntityDetails>())
-      if (objectDetails->shape().IsExplicitShape())
-        for (const Fortran::semantics::ShapeSpec &shapeSpec :
-             objectDetails->shape())
-          visitor(Fortran::evaluate::AsGenericExpr(getExtentExpr(shapeSpec)));
+    walkExtents(interfaceDetails->result(), visitor);
   } else {
     if (procRef.Rank() != 0)
       fir::emitFatalError(
@@ -368,7 +398,18 @@ void Fortran::lower::CallerInterface::walkResultExtents(
   }
 }
 
-bool Fortran::lower::CallerInterface::mustMapInterfaceSymbols() const {
+void Fortran::lower::CallerInterface::walkDummyArgumentExtents(
+    const PassedEntity &passedEntity, const ExprVisitor &visitor) const {
+  const Fortran::semantics::SubprogramDetails *interfaceDetails =
+      getInterfaceDetails();
+  if (!interfaceDetails)
+    return;
+  const Fortran::semantics::Symbol *dummy = getDummySymbol(passedEntity);
+  assert(dummy && "dummy symbol was not set");
+  walkExtents(*dummy, visitor);
+}
+
+bool Fortran::lower::CallerInterface::mustMapInterfaceSymbolsForResult() const {
   assert(characteristic && "characteristic was not computed");
   const std::optional<Fortran::evaluate::characteristics::FunctionResult>
       &result = characteristic->functionResult;
@@ -376,7 +417,7 @@ bool Fortran::lower::CallerInterface::mustMapInterfaceSymbols() const {
       !getInterfaceDetails() || result->IsProcedurePointer())
     return false;
   bool allResultSpecExprConstant = true;
-  auto visitor = [&](const Fortran::lower::SomeExpr &e) {
+  auto visitor = [&](const Fortran::lower::SomeExpr &e, bool) {
     allResultSpecExprConstant &= Fortran::evaluate::IsConstantExpr(e);
   };
   walkResultLengths(visitor);
@@ -384,6 +425,17 @@ bool Fortran::lower::CallerInterface::mustMapInterfaceSymbols() const {
   return !allResultSpecExprConstant;
 }
 
+bool Fortran::lower::CallerInterface::mustMapInterfaceSymbolsForDummyArgument(
+    const PassedEntity &arg) const {
+  bool allResultSpecExprConstant = true;
+  auto visitor = [&](const Fortran::lower::SomeExpr &e, bool) {
+    allResultSpecExprConstant &= Fortran::evaluate::IsConstantExpr(e);
+  };
+  walkDummyArgumentLengths(arg, visitor);
+  walkDummyArgumentExtents(arg, visitor);
+  return !allResultSpecExprConstant;
+}
+
 mlir::Value Fortran::lower::CallerInterface::getArgumentValue(
     const semantics::Symbol &sym) const {
   mlir::Location loc = converter.getCurrentLocation();
@@ -401,6 +453,24 @@ mlir::Value Fortran::lower::CallerInterface::getArgumentValue(
   return actualInputs[mlirArgIndex];
 }
 
+const Fortran::semantics::Symbol *
+Fortran::lower::CallerInterface::getDummySymbol(
+    const PassedEntity &passedEntity) const {
+  const Fortran::semantics::SubprogramDetails *ifaceDetails =
+      getInterfaceDetails();
+  if (!ifaceDetails)
+    return nullptr;
+  std::size_t argPosition = 0;
+  for (const auto &arg : getPassedArguments()) {
+    if (&arg == &passedEntity)
+      break;
+    ++argPosition;
+  }
+  if (argPosition >= ifaceDetails->dummyArgs().size())
+    return nullptr;
+  return ifaceDetails->dummyArgs()[argPosition];
+}
+
 mlir::Type Fortran::lower::CallerInterface::getResultStorageType() const {
   if (passedResult)
     return fir::dyn_cast_ptrEleTy(inputs[passedResult->firArgument].type);
@@ -408,6 +478,11 @@ mlir::Type Fortran::lower::CallerInterface::getResultStorageType() const {
   return outputs[0].type;
 }
 
+mlir::Type Fortran::lower::CallerInterface::getDummyArgumentType(
+    const PassedEntity &passedEntity) const {
+  return inputs[passedEntity.firArgument].type;
+}
+
 const Fortran::semantics::Symbol &
 Fortran::lower::CallerInterface::getResultSymbol() const {
   mlir::Location loc = converter.getCurrentLocation();
@@ -975,12 +1050,6 @@ class Fortran::lower::CallInterfaceImpl {
     Fortran::common::TypeCategory cat = dynamicType.category();
     // DERIVED
     if (cat == Fortran::common::TypeCategory::Derived) {
-      // TODO is kept under experimental flag until feature is complete.
-      if (dynamicType.IsPolymorphic() &&
-          !getConverter().getLoweringOptions().getPolymorphicTypeImpl())
-        TODO(interface.converter.getCurrentLocation(),
-             "support for polymorphic types");
-
       if (dynamicType.IsUnlimitedPolymorphic())
         return mlir::NoneType::get(&mlirContext);
       return getConverter().genType(dynamicType.GetDerivedTypeSpec());
@@ -1387,6 +1456,17 @@ bool Fortran::lower::CallInterface<
   return Fortran::semantics::IsFinalizable(*derived);
 }
 
+template <typename T>
+bool Fortran::lower::CallInterface<
+    T>::PassedEntity::isSequenceAssociatedDescriptor() const {
+  if (!characteristics || passBy != PassEntityBy::Box)
+    return false;
+  const auto *dummy =
+      std::get_if<Fortran::evaluate::characteristics::DummyDataObject>(
+          &characteristics->u);
+  return dummy && dummy->type.CanBeSequenceAssociated();
+}
+
 template <typename T>
 void Fortran::lower::CallInterface<T>::determineInterface(
     bool isImplicit,
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index 6e3ce101ef1af..6eba243c237cf 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -164,6 +164,125 @@ static mlir::Value readDim3Value(fir::FirOpBuilder &builder, mlir::Location loc,
   return hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{designate});
 }
 
+static mlir::Value remapActualToDummyDescriptor(
+    mlir::Location loc, Fortran::lower::AbstractConverter &converter,
+    Fortran::lower::SymMap &symMap,
+    const Fortran::lower::CallerInterface::PassedEntity &arg,
+    Fortran::lower::CallerInterface &caller, bool isBindcCall) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  mlir::IndexType idxTy = builder.getIndexType();
+  mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0);
+  Fortran::lower::StatementContext localStmtCtx;
+  auto lowerSpecExpr = [&](const auto &expr,
+                           bool isAssumedSizeExtent) -> mlir::Value {
+    mlir::Value convertExpr = builder.createConvert(
+        loc, idxTy, fir::getBase(converter.genExprValue(expr, localStmtCtx)));
+    if (isAssumedSizeExtent)
+      return convertExpr;
+    return fir::factory::genMaxWithZero(builder, loc, convertExpr);
+  };
+  bool mapSymbols = caller.mustMapInterfaceSymbolsForDummyArgument(arg);
+  if (mapSymbols) {
+    symMap.pushScope();
+    const Fortran::semantics::Symbol *sym = caller.getDummySymbol(arg);
+    assert(sym && "call must have explicit interface to map interface symbols");
+    Fortran::lower::mapCallInterfaceSymbolsForDummyArgument(converter, caller,
+                                                            symMap, *sym);
+  }
+  llvm::SmallVector<mlir::Value> extents;
+  llvm::SmallVector<mlir::Value> lengths;
+  mlir::Type dummyBoxType = caller.getDummyArgumentType(arg);
+  mlir::Type dummyBaseType = fir::unwrapPassByRefType(dummyBoxType);
+  if (dummyBaseType.isa<fir::SequenceType>())
+    caller.walkDummyArgumentExtents(
+        arg, [&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
+          extents.emplace_back(lowerSpecExpr(e, isAssumedSizeExtent));
+        });
+  mlir::Value shape;
+  if (!extents.empty()) {
+    if (isBindcCall) {
+      // Preserve zero lower bounds (see F'2023 18.5.3).
+      llvm::SmallVector<mlir::Value> lowerBounds(extents.size(), zero);
+      shape = builder.genShape(loc, lowerBounds, extents);
+    } else {
+      shape = builder.genShape(loc, extents);
+    }
+  }
+
+  hlfir::Entity explicitArgument = hlfir::Entity{caller.getInput(arg)};
+  mlir::Type dummyElementType = fir::unwrapSequenceType(dummyBaseType);
+  if (auto recType = llvm::dyn_cast<fir::RecordType>(dummyElementType))
+    if (recType.getNumLenParams() > 0)
+      TODO(loc, "sequence association of length parameterized derived type "
+                "dummy arguments");
+  if (fir::isa_char(dummyElementType))
+    lengths.emplace_back(hlfir::genCharLength(loc, builder, explicitArgument));
+  mlir::Value baseAddr =
+      hlfir::genVariableRawAddress(loc, builder, explicitArgument);
+  baseAddr = builder.createConvert(loc, fir::ReferenceType::get(dummyBaseType),
+                                   baseAddr);
+  mlir::Value mold;
+  if (fir::isPolymorphicType(dummyBoxType))
+    mold = explicitArgument;
+  mlir::Value remapped =
+      builder.create<fir::EmboxOp>(loc, dummyBoxType, baseAddr, shape,
+                                   /*slice=*/mlir::Value{}, lengths, mold);
+  if (mapSymbols)
+    symMap.popScope();
+  return remapped;
+}
+
+/// Create a descriptor for sequenced associated descriptor that are passed
+/// by descriptor. Sequence association (F'2023 15.5.2.12) implies that the
+/// dummy shape and rank need to not be the same as the actual argument. This
+/// helper creates a descriptor based on the dummy shape and rank (sequence
+/// association can only happen with explicit and assumed-size array) so that it
+/// is safe to assume the rank of the incoming descriptor inside the callee.
+/// This helper must be called once all the actual arguments have been lowered
+/// and placed inside "caller". Copy-in/copy-out must already have been
+/// generated if needed using the actual argument shape (the dummy shape may be
+/// assumed-size).
+static void remapActualToDummyDescriptors(
+    mlir::Location loc, Fortran::lower::AbstractConverter &converter,
+    Fortran::lower::SymMap &symMap,
+    const Fortran::lower::PreparedActualArguments &loweredActuals,
+    Fortran::lower::CallerInterface &caller, bool isBindcCall) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  for (auto [preparedActual, arg] :
+       llvm::zip(loweredActuals, caller.getPassedArguments())) {
+    if (arg.isSequenceAssociatedDescriptor()) {
+      if (!preparedActual.value().handleDynamicOptional()) {
+        mlir::Value remapped = remapActualToDummyDescriptor(
+            loc, converter, symMap, arg, caller, isBindcCall);
+        caller.placeInput(arg, remapped);
+      } else {
+        // Absent optional actual argument descriptor cannot be read and
+        // remapped unconditionally.
+        mlir::Type dummyType = caller.getDummyArgumentType(arg);
+        mlir::Value isPresent = preparedActual.value().getIsPresent();
+        auto &argLambdaCapture = arg;
+        mlir::Value remapped =
+            builder
+                .genIfOp(loc, {dummyType}, isPresent,
+                         /*withElseRegion=*/true)
+                .genThen([&]() {
+                  mlir::Value newBox = remapActualToDummyDescriptor(
+                      loc, converter, symMap, argLambdaCapture, caller,
+                      isBindcCall);
+                  builder.create<fir::ResultOp>(loc, newBox);
+                })
+                .genElse([&]() {
+                  mlir::Value absent =
+                      builder.create<fir::AbsentOp>(loc, dummyType);
+                  builder.create<fir::ResultOp>(loc, absent);
+                })
+                .getResults()[0];
+        caller.placeInput(arg, remapped);
+      }
+    }
+  }
+}
+
 std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     mlir::Location loc, Fortran::lower::AbstractConverter &converter,
     Fortran::lower::SymMap &symMap, Fortran::lower::StatementContext &stmtCtx,
@@ -171,12 +290,11 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     std::optional<mlir::Type> resultType, bool isElemental) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   using PassBy = Fortran::lower::CallerInterface::PassEntityBy;
-  // Handle cases where caller must allocate the result or a fir.box for it.
   bool mustPopSymMap = false;
-  if (caller.mustMapInterfaceSymbols()) {
+  if (caller.mustMapInterfaceSymbolsForResult()) {
     symMap.pushScope();
     mustPopSymMap = true;
-    Fortran::lower::mapCallInterfaceSymbols(converter, caller, symMap);
+    Fortran::lower::mapCallInterfaceSymbolsForResult(converter, caller, symMap);
   }
   // If this is an indirect call, retrieve the function address. Also retrieve
   // the result length if this is a character function (note that this length
@@ -221,12 +339,16 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
       return {};
     mlir::Type type = caller.getResultStorageType();
     if (type.isa<fir::SequenceType>())
-      caller.walkResultExtents([&](const Fortran::lower::SomeExpr &e) {
-        extents.emplace_back(lowerSpecExpr(e));
-      });
-    caller.walkResultLengths([&](const Fortran::lower::SomeExpr &e) {
-      lengths.emplace_back(lowerSpecExpr(e));
-    });
+      caller.walkResultExtents(
+          [&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
+            assert(!isAssumedSizeExtent && "result cannot be assumed-size");
+            extents.emplace_back(lowerSpecExpr(e));
+          });
+    caller.walkResultLengths(
+        [&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
+          assert(!isAssumedSizeExtent && "result cannot be assumed-size");
+          lengths.emplace_back(lowerSpecExpr(e));
+        });
 
     // Result length parameters should not be provided to box storage
     // allocation and save_results, but they are still useful information to
@@ -416,7 +538,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     mlir::Type i32Ty = builder.getI32Type();
     mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
 
-    mlir::Value grid_x, grid_y;
+    mlir::Value grid_x, grid_y, grid_z;
     if (caller.getCallDescription().chevrons()[0].GetType()->category() ==
         Fortran::common::TypeCategory::Integer) {
       // If grid is an integer, it is converted to dim3(grid,1,1). Since z is
@@ -426,11 +548,13 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
           fir::getBase(converter.genExprValue(
               caller.getCallDescription().chevrons()[0], stmtCtx)));
       grid_y = one;
+      grid_z = one;
     } else {
       auto dim3Addr = converter.genExprAddr(
           caller.getCallDescription().chevrons()[0], stmtCtx);
       grid_x = readDim3Value(builder, loc, fir::getBase(dim3Addr), "x");
       grid_y = readDim3Value(builder, loc, fir::getBase(dim3Addr), "y");
+      grid_z = readDim3Value(builder, loc, fir::getBase(dim3Addr), "z");
     }
 
     mlir::Value block_x, block_y, block_z;
@@ -466,8 +590,8 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
               caller.getCallDescription().chevrons()[3], stmtCtx)));
 
     builder.create<fir::CUDAKernelLaunch>(
-        loc, funcType.getResults(), funcSymbolAttr, grid_x, grid_y, block_x,
-        block_y, block_z, bytes, stream, operands);
+        loc, funcType.getResults(), funcSymbolAttr, grid_x, grid_y, grid_z,
+        block_x, block_y, block_z, bytes, stream, operands);
     callNumResults = 0;
   } else if (caller.requireDispatchCall()) {
     // Procedure call requiring a dynamic dispatch. Call is created with
@@ -1054,10 +1178,16 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
   // Create dummy type with actual argument rank when the dummy is an assumed
   // rank. That way, all the operation to create dummy descriptors are ranked if
   // the actual argument is ranked, which allows simple code generation.
+  // Also do the same when the dummy is a sequence associated descriptor
+  // because the actual shape/rank may mismatch with the dummy, and the dummy
+  // may be an assumed-size array, so any descriptor manipulation should use the
+  // actual argument shape information. A descriptor with the dummy shape
+  // information will be created later when all actual arguments are ready.
   mlir::Type dummyTypeWithActualRank = dummyType;
   if (auto baseBoxDummy = mlir::dyn_cast<fir::BaseBoxType>(dummyType))
     if (baseBoxDummy.isAssumedRank() ||
-        arg.testTKR(Fortran::common::IgnoreTKR::Rank))
+        arg.testTKR(Fortran::common::IgnoreTKR::Rank) ||
+        arg.isSequenceAssociatedDescriptor())
       dummyTypeWithActualRank =
           baseBoxDummy.getBoxTypeWithNewShape(actual.getType());
   // Preserve the actual type in the argument preparation in case IgnoreTKR(t)
@@ -1340,6 +1470,7 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
             mlir::FunctionType callSiteType, CallContext &callContext) {
   using PassBy = Fortran::lower::CallerInterface::PassEntityBy;
   mlir::Location loc = callContext.loc;
+  bool mustRemapActualToDummyDescriptors = false;
   fir::FirOpBuilder &builder = callContext.getBuilder();
   llvm::SmallVector<CallCleanUp> callCleanUps;
   for (auto [preparedActual, arg] :
@@ -1396,6 +1527,9 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
       callCleanUps.append(preparedDummy.cleanups.rbegin(),
                           preparedDummy.cleanups.rend());
       caller.placeInput(arg, preparedDummy.dummy);
+      if (arg.passBy == PassBy::Box)
+        mustRemapActualToDummyDescriptors |=
+            arg.isSequenceAssociatedDescriptor();
     } break;
     case PassBy::BoxProcRef: {
       PreparedDummyArgument preparedDummy =
@@ -1488,6 +1622,12 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
     } break;
     }
   }
+  // Handle cases where caller must allocate the result or a fir.box for it.
+  if (mustRemapActualToDummyDescriptors)
+    remapActualToDummyDescriptors(loc, callContext.converter,
+                                  callContext.symMap, loweredActuals, caller,
+                                  callContext.isBindcCall());
+
   // Prepare lowered arguments according to the interface
   // and map the lowered values to the dummy
   // arguments.
diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp
index 21564e8b81d70..e6557d7f0b767 100644
--- a/flang/lib/Lower/ConvertType.cpp
+++ b/flang/lib/Lower/ConvertType.cpp
@@ -263,10 +263,6 @@ struct TypeBuilderImpl {
         llvm::SmallVector<Fortran::lower::LenParameterTy> params;
         translateLenParameters(params, tySpec->category(), ultimate);
         ty = genFIRType(context, tySpec->category(), kind, params);
-      } else if (type->IsPolymorphic() &&
-                 !converter.getLoweringOptions().getPolymorphicTypeImpl()) {
-        // TODO is kept under experimental flag until feature is complete.
-        TODO(loc, "support for polymorphic types");
       } else if (type->IsUnlimitedPolymorphic()) {
         ty = mlir::NoneType::get(context);
       } else if (const Fortran::semantics::DerivedTypeSpec *tySpec =
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index a673a18cd20d9..94d849862099e 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -2260,19 +2260,20 @@ void Fortran::lower::instantiateVariable(AbstractConverter &converter,
     instantiateLocal(converter, var, symMap);
 }
 
-void Fortran::lower::mapCallInterfaceSymbols(
-    AbstractConverter &converter, const Fortran::lower::CallerInterface &caller,
-    SymMap &symMap) {
+static void
+mapCallInterfaceSymbol(const Fortran::semantics::Symbol &interfaceSymbol,
+                       Fortran::lower::AbstractConverter &converter,
+                       const Fortran::lower::CallerInterface &caller,
+                       Fortran::lower::SymMap &symMap) {
   Fortran::lower::AggregateStoreMap storeMap;
-  const Fortran::semantics::Symbol &result = caller.getResultSymbol();
   for (Fortran::lower::pft::Variable var :
-       Fortran::lower::pft::getDependentVariableList(result)) {
+       Fortran::lower::pft::getDependentVariableList(interfaceSymbol)) {
     if (var.isAggregateStore()) {
       instantiateVariable(converter, var, symMap, storeMap);
       continue;
     }
     const Fortran::semantics::Symbol &sym = var.getSymbol();
-    if (&sym == &result)
+    if (&sym == &interfaceSymbol)
       continue;
     const auto *hostDetails =
         sym.detailsIf<Fortran::semantics::HostAssocDetails>();
@@ -2293,7 +2294,8 @@ void Fortran::lower::mapCallInterfaceSymbols(
       // instantiateVariable that would try to allocate a new storage.
       continue;
     }
-    if (Fortran::semantics::IsDummy(sym) && sym.owner() == result.owner()) {
+    if (Fortran::semantics::IsDummy(sym) &&
+        sym.owner() == interfaceSymbol.owner()) {
       // Get the argument for the dummy argument symbols of the current call.
       symMap.addSymbol(sym, caller.getArgumentValue(sym));
       // All the properties of the dummy variable may not come from the actual
@@ -2307,6 +2309,19 @@ void Fortran::lower::mapCallInterfaceSymbols(
   }
 }
 
+void Fortran::lower::mapCallInterfaceSymbolsForResult(
+    AbstractConverter &converter, const Fortran::lower::CallerInterface &caller,
+    SymMap &symMap) {
+  const Fortran::semantics::Symbol &result = caller.getResultSymbol();
+  mapCallInterfaceSymbol(result, converter, caller, symMap);
+}
+
+void Fortran::lower::mapCallInterfaceSymbolsForDummyArgument(
+    AbstractConverter &converter, const Fortran::lower::CallerInterface &caller,
+    SymMap &symMap, const Fortran::semantics::Symbol &dummySymbol) {
+  mapCallInterfaceSymbol(dummySymbol, converter, caller, symMap);
+}
+
 void Fortran::lower::mapSymbolAttributes(
     AbstractConverter &converter, const Fortran::semantics::SymbolRef &symbol,
     Fortran::lower::SymMap &symMap, Fortran::lower::StatementContext &stmtCtx,
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index a41f8312a28c9..13347c8cf7b65 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ClauseProcessor.h"
+#include "Clauses.h"
 
 #include "flang/Lower/PFTBuilder.h"
 #include "flang/Parser/tools.h"
@@ -30,64 +31,55 @@ static void checkMapType(mlir::Location location, mlir::Type type) {
 }
 
 static mlir::omp::ScheduleModifier
-translateScheduleModifier(const Fortran::parser::OmpScheduleModifierType &m) {
-  switch (m.v) {
-  case Fortran::parser::OmpScheduleModifierType::ModType::Monotonic:
+translateScheduleModifier(const omp::clause::Schedule::ModType &m) {
+  switch (m) {
+  case omp::clause::Schedule::ModType::Monotonic:
     return mlir::omp::ScheduleModifier::monotonic;
-  case Fortran::parser::OmpScheduleModifierType::ModType::Nonmonotonic:
+  case omp::clause::Schedule::ModType::Nonmonotonic:
     return mlir::omp::ScheduleModifier::nonmonotonic;
-  case Fortran::parser::OmpScheduleModifierType::ModType::Simd:
+  case omp::clause::Schedule::ModType::Simd:
     return mlir::omp::ScheduleModifier::simd;
   }
   return mlir::omp::ScheduleModifier::none;
 }
 
 static mlir::omp::ScheduleModifier
-getScheduleModifier(const Fortran::parser::OmpScheduleClause &x) {
-  const auto &modifier =
-      std::get<std::optional<Fortran::parser::OmpScheduleModifier>>(x.t);
+getScheduleModifier(const omp::clause::Schedule &clause) {
+  using ScheduleModifier = omp::clause::Schedule::ScheduleModifier;
+  const auto &modifier = std::get<std::optional<ScheduleModifier>>(clause.t);
   // The input may have the modifier any order, so we look for one that isn't
   // SIMD. If modifier is not set at all, fall down to the bottom and return
   // "none".
   if (modifier) {
-    const auto &modType1 =
-        std::get<Fortran::parser::OmpScheduleModifier::Modifier1>(modifier->t);
-    if (modType1.v.v ==
-        Fortran::parser::OmpScheduleModifierType::ModType::Simd) {
-      const auto &modType2 = std::get<
-          std::optional<Fortran::parser::OmpScheduleModifier::Modifier2>>(
-          modifier->t);
-      if (modType2 &&
-          modType2->v.v !=
-              Fortran::parser::OmpScheduleModifierType::ModType::Simd)
-        return translateScheduleModifier(modType2->v);
-
+    using ModType = omp::clause::Schedule::ModType;
+    const auto &modType1 = std::get<ModType>(modifier->t);
+    if (modType1 == ModType::Simd) {
+      const auto &modType2 = std::get<std::optional<ModType>>(modifier->t);
+      if (modType2 && *modType2 != ModType::Simd)
+        return translateScheduleModifier(*modType2);
       return mlir::omp::ScheduleModifier::none;
     }
 
-    return translateScheduleModifier(modType1.v);
+    return translateScheduleModifier(modType1);
   }
   return mlir::omp::ScheduleModifier::none;
 }
 
 static mlir::omp::ScheduleModifier
-getSimdModifier(const Fortran::parser::OmpScheduleClause &x) {
-  const auto &modifier =
-      std::get<std::optional<Fortran::parser::OmpScheduleModifier>>(x.t);
+getSimdModifier(const omp::clause::Schedule &clause) {
+  using ScheduleModifier = omp::clause::Schedule::ScheduleModifier;
+  const auto &modifier = std::get<std::optional<ScheduleModifier>>(clause.t);
   // Either of the two possible modifiers in the input can be the SIMD modifier,
   // so look in either one, and return simd if we find one. Not found = return
   // "none".
   if (modifier) {
-    const auto &modType1 =
-        std::get<Fortran::parser::OmpScheduleModifier::Modifier1>(modifier->t);
-    if (modType1.v.v == Fortran::parser::OmpScheduleModifierType::ModType::Simd)
+    using ModType = omp::clause::Schedule::ModType;
+    const auto &modType1 = std::get<ModType>(modifier->t);
+    if (modType1 == ModType::Simd)
       return mlir::omp::ScheduleModifier::simd;
 
-    const auto &modType2 = std::get<
-        std::optional<Fortran::parser::OmpScheduleModifier::Modifier2>>(
-        modifier->t);
-    if (modType2 && modType2->v.v ==
-                        Fortran::parser::OmpScheduleModifierType::ModType::Simd)
+    const auto &modType2 = std::get<std::optional<ModType>>(modifier->t);
+    if (modType2 && *modType2 == ModType::Simd)
       return mlir::omp::ScheduleModifier::simd;
   }
   return mlir::omp::ScheduleModifier::none;
@@ -95,29 +87,25 @@ getSimdModifier(const Fortran::parser::OmpScheduleClause &x) {
 
 static void
 genAllocateClause(Fortran::lower::AbstractConverter &converter,
-                  const Fortran::parser::OmpAllocateClause &ompAllocateClause,
+                  const omp::clause::Allocate &clause,
                   llvm::SmallVectorImpl<mlir::Value> &allocatorOperands,
                   llvm::SmallVectorImpl<mlir::Value> &allocateOperands) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   mlir::Location currentLocation = converter.getCurrentLocation();
   Fortran::lower::StatementContext stmtCtx;
 
-  mlir::Value allocatorOperand;
-  const Fortran::parser::OmpObjectList &ompObjectList =
-      std::get<Fortran::parser::OmpObjectList>(ompAllocateClause.t);
-  const auto &allocateModifier = std::get<
-      std::optional<Fortran::parser::OmpAllocateClause::AllocateModifier>>(
-      ompAllocateClause.t);
+  const omp::ObjectList &objectList = std::get<omp::ObjectList>(clause.t);
+  const auto &modifier =
+      std::get<std::optional<omp::clause::Allocate::Modifier>>(clause.t);
 
   // If the allocate modifier is present, check if we only use the allocator
   // submodifier.  ALIGN in this context is unimplemented
   const bool onlyAllocator =
-      allocateModifier &&
-      std::holds_alternative<
-          Fortran::parser::OmpAllocateClause::AllocateModifier::Allocator>(
-          allocateModifier->u);
+      modifier &&
+      std::holds_alternative<omp::clause::Allocate::Modifier::Allocator>(
+          modifier->u);
 
-  if (allocateModifier && !onlyAllocator) {
+  if (modifier && !onlyAllocator) {
     TODO(currentLocation, "OmpAllocateClause ALIGN modifier");
   }
 
@@ -125,37 +113,34 @@ genAllocateClause(Fortran::lower::AbstractConverter &converter,
   // to list of allocators, otherwise, add default allocator to
   // list of allocators.
   if (onlyAllocator) {
-    const auto &allocatorValue = std::get<
-        Fortran::parser::OmpAllocateClause::AllocateModifier::Allocator>(
-        allocateModifier->u);
-    allocatorOperand = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(allocatorValue.v), stmtCtx));
-    allocatorOperands.insert(allocatorOperands.end(), ompObjectList.v.size(),
-                             allocatorOperand);
+    const auto &value =
+        std::get<omp::clause::Allocate::Modifier::Allocator>(modifier->u);
+    mlir::Value operand =
+        fir::getBase(converter.genExprValue(value.v, stmtCtx));
+    allocatorOperands.append(objectList.size(), operand);
   } else {
-    allocatorOperand = firOpBuilder.createIntegerConstant(
+    mlir::Value operand = firOpBuilder.createIntegerConstant(
         currentLocation, firOpBuilder.getI32Type(), 1);
-    allocatorOperands.insert(allocatorOperands.end(), ompObjectList.v.size(),
-                             allocatorOperand);
+    allocatorOperands.append(objectList.size(), operand);
   }
-  genObjectList(ompObjectList, converter, allocateOperands);
+  genObjectList(objectList, converter, allocateOperands);
 }
 
-static mlir::omp::ClauseProcBindKindAttr genProcBindKindAttr(
-    fir::FirOpBuilder &firOpBuilder,
-    const Fortran::parser::OmpClause::ProcBind *procBindClause) {
+static mlir::omp::ClauseProcBindKindAttr
+genProcBindKindAttr(fir::FirOpBuilder &firOpBuilder,
+                    const omp::clause::ProcBind &clause) {
   mlir::omp::ClauseProcBindKind procBindKind;
-  switch (procBindClause->v.v) {
-  case Fortran::parser::OmpProcBindClause::Type::Master:
+  switch (clause.v) {
+  case omp::clause::ProcBind::Type::Master:
     procBindKind = mlir::omp::ClauseProcBindKind::Master;
     break;
-  case Fortran::parser::OmpProcBindClause::Type::Close:
+  case omp::clause::ProcBind::Type::Close:
     procBindKind = mlir::omp::ClauseProcBindKind::Close;
     break;
-  case Fortran::parser::OmpProcBindClause::Type::Spread:
+  case omp::clause::ProcBind::Type::Spread:
     procBindKind = mlir::omp::ClauseProcBindKind::Spread;
     break;
-  case Fortran::parser::OmpProcBindClause::Type::Primary:
+  case omp::clause::ProcBind::Type::Primary:
     procBindKind = mlir::omp::ClauseProcBindKind::Primary;
     break;
   }
@@ -165,20 +150,17 @@ static mlir::omp::ClauseProcBindKindAttr genProcBindKindAttr(
 
 static mlir::omp::ClauseTaskDependAttr
 genDependKindAttr(fir::FirOpBuilder &firOpBuilder,
-                  const Fortran::parser::OmpClause::Depend *dependClause) {
+                  const omp::clause::Depend &clause) {
   mlir::omp::ClauseTaskDepend pbKind;
-  switch (
-      std::get<Fortran::parser::OmpDependenceType>(
-          std::get<Fortran::parser::OmpDependClause::InOut>(dependClause->v.u)
-              .t)
-          .v) {
-  case Fortran::parser::OmpDependenceType::Type::In:
+  const auto &inOut = std::get<omp::clause::Depend::InOut>(clause.u);
+  switch (std::get<omp::clause::Depend::Type>(inOut.t)) {
+  case omp::clause::Depend::Type::In:
     pbKind = mlir::omp::ClauseTaskDepend::taskdependin;
     break;
-  case Fortran::parser::OmpDependenceType::Type::Out:
+  case omp::clause::Depend::Type::Out:
     pbKind = mlir::omp::ClauseTaskDepend::taskdependout;
     break;
-  case Fortran::parser::OmpDependenceType::Type::Inout:
+  case omp::clause::Depend::Type::Inout:
     pbKind = mlir::omp::ClauseTaskDepend::taskdependinout;
     break;
   default:
@@ -189,45 +171,41 @@ genDependKindAttr(fir::FirOpBuilder &firOpBuilder,
                                               pbKind);
 }
 
-static mlir::Value getIfClauseOperand(
-    Fortran::lower::AbstractConverter &converter,
-    const Fortran::parser::OmpClause::If *ifClause,
-    Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName,
-    mlir::Location clauseLocation) {
+static mlir::Value
+getIfClauseOperand(Fortran::lower::AbstractConverter &converter,
+                   const omp::clause::If &clause,
+                   omp::clause::If::DirectiveNameModifier directiveName,
+                   mlir::Location clauseLocation) {
   // Only consider the clause if it's intended for the given directive.
-  auto &directive = std::get<
-      std::optional<Fortran::parser::OmpIfClause::DirectiveNameModifier>>(
-      ifClause->v.t);
+  auto &directive =
+      std::get<std::optional<omp::clause::If::DirectiveNameModifier>>(clause.t);
   if (directive && directive.value() != directiveName)
     return nullptr;
 
   Fortran::lower::StatementContext stmtCtx;
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  auto &expr = std::get<Fortran::parser::ScalarLogicalExpr>(ifClause->v.t);
   mlir::Value ifVal = fir::getBase(
-      converter.genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx));
+      converter.genExprValue(std::get<omp::SomeExpr>(clause.t), stmtCtx));
   return firOpBuilder.createConvert(clauseLocation, firOpBuilder.getI1Type(),
                                     ifVal);
 }
 
 static void
 addUseDeviceClause(Fortran::lower::AbstractConverter &converter,
-                   const Fortran::parser::OmpObjectList &useDeviceClause,
+                   const omp::ObjectList &objects,
                    llvm::SmallVectorImpl<mlir::Value> &operands,
                    llvm::SmallVectorImpl<mlir::Type> &useDeviceTypes,
                    llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
                    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
                        &useDeviceSymbols) {
-  genObjectList(useDeviceClause, converter, operands);
+  genObjectList(objects, converter, operands);
   for (mlir::Value &operand : operands) {
     checkMapType(operand.getLoc(), operand.getType());
     useDeviceTypes.push_back(operand.getType());
     useDeviceLocs.push_back(operand.getLoc());
   }
-  for (const Fortran::parser::OmpObject &ompObject : useDeviceClause.v) {
-    Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject);
-    useDeviceSymbols.push_back(sym);
-  }
+  for (const omp::Object &object : objects)
+    useDeviceSymbols.push_back(object.id());
 }
 
 //===----------------------------------------------------------------------===//
@@ -253,9 +231,8 @@ bool ClauseProcessor::processCollapse(
   }
 
   std::int64_t collapseValue = 1l;
-  if (auto *collapseClause = findUniqueClause<ClauseTy::Collapse>()) {
-    const auto *expr = Fortran::semantics::GetExpr(collapseClause->v);
-    collapseValue = Fortran::evaluate::ToInt64(*expr).value();
+  if (auto *clause = findUniqueClause<omp::clause::Collapse>()) {
+    collapseValue = Fortran::evaluate::ToInt64(clause->v).value();
     found = true;
   }
 
@@ -294,19 +271,19 @@ bool ClauseProcessor::processCollapse(
 }
 
 bool ClauseProcessor::processDefault() const {
-  if (auto *defaultClause = findUniqueClause<ClauseTy::Default>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Default>()) {
     // Private, Firstprivate, Shared, None
-    switch (defaultClause->v.v) {
-    case Fortran::parser::OmpDefaultClause::Type::Shared:
-    case Fortran::parser::OmpDefaultClause::Type::None:
+    switch (clause->v) {
+    case omp::clause::Default::Type::Shared:
+    case omp::clause::Default::Type::None:
       // Default clause with shared or none do not require any handling since
       // Shared is the default behavior in the IR and None is only required
       // for semantic checks.
       break;
-    case Fortran::parser::OmpDefaultClause::Type::Private:
+    case omp::clause::Default::Type::Private:
       // TODO Support default(private)
       break;
-    case Fortran::parser::OmpDefaultClause::Type::Firstprivate:
+    case omp::clause::Default::Type::Firstprivate:
       // TODO Support default(firstprivate)
       break;
     }
@@ -318,20 +295,17 @@ bool ClauseProcessor::processDefault() const {
 bool ClauseProcessor::processDevice(Fortran::lower::StatementContext &stmtCtx,
                                     mlir::Value &result) const {
   const Fortran::parser::CharBlock *source = nullptr;
-  if (auto *deviceClause = findUniqueClause<ClauseTy::Device>(&source)) {
+  if (auto *clause = findUniqueClause<omp::clause::Device>(&source)) {
     mlir::Location clauseLocation = converter.genLocation(*source);
-    if (auto deviceModifier = std::get<
-            std::optional<Fortran::parser::OmpDeviceClause::DeviceModifier>>(
-            deviceClause->v.t)) {
-      if (deviceModifier ==
-          Fortran::parser::OmpDeviceClause::DeviceModifier::Ancestor) {
+    if (auto deviceModifier =
+            std::get<std::optional<omp::clause::Device::DeviceModifier>>(
+                clause->t)) {
+      if (deviceModifier == omp::clause::Device::DeviceModifier::Ancestor) {
         TODO(clauseLocation, "OMPD_target Device Modifier Ancestor");
       }
     }
-    if (const auto *deviceExpr = Fortran::semantics::GetExpr(
-            std::get<Fortran::parser::ScalarIntExpr>(deviceClause->v.t))) {
-      result = fir::getBase(converter.genExprValue(*deviceExpr, stmtCtx));
-    }
+    const auto &deviceExpr = std::get<omp::SomeExpr>(clause->t);
+    result = fir::getBase(converter.genExprValue(deviceExpr, stmtCtx));
     return true;
   }
   return false;
@@ -339,16 +313,16 @@ bool ClauseProcessor::processDevice(Fortran::lower::StatementContext &stmtCtx,
 
 bool ClauseProcessor::processDeviceType(
     mlir::omp::DeclareTargetDeviceType &result) const {
-  if (auto *deviceTypeClause = findUniqueClause<ClauseTy::DeviceType>()) {
+  if (auto *clause = findUniqueClause<omp::clause::DeviceType>()) {
     // Case: declare target ... device_type(any | host | nohost)
-    switch (deviceTypeClause->v.v) {
-    case Fortran::parser::OmpDeviceTypeClause::Type::Nohost:
+    switch (clause->v) {
+    case omp::clause::DeviceType::Type::Nohost:
       result = mlir::omp::DeclareTargetDeviceType::nohost;
       break;
-    case Fortran::parser::OmpDeviceTypeClause::Type::Host:
+    case omp::clause::DeviceType::Type::Host:
       result = mlir::omp::DeclareTargetDeviceType::host;
       break;
-    case Fortran::parser::OmpDeviceTypeClause::Type::Any:
+    case omp::clause::DeviceType::Type::Any:
       result = mlir::omp::DeclareTargetDeviceType::any;
       break;
     }
@@ -360,12 +334,12 @@ bool ClauseProcessor::processDeviceType(
 bool ClauseProcessor::processFinal(Fortran::lower::StatementContext &stmtCtx,
                                    mlir::Value &result) const {
   const Fortran::parser::CharBlock *source = nullptr;
-  if (auto *finalClause = findUniqueClause<ClauseTy::Final>(&source)) {
+  if (auto *clause = findUniqueClause<omp::clause::Final>(&source)) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
     mlir::Location clauseLocation = converter.genLocation(*source);
 
-    mlir::Value finalVal = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(finalClause->v), stmtCtx));
+    mlir::Value finalVal =
+        fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     result = firOpBuilder.createConvert(clauseLocation,
                                         firOpBuilder.getI1Type(), finalVal);
     return true;
@@ -374,10 +348,9 @@ bool ClauseProcessor::processFinal(Fortran::lower::StatementContext &stmtCtx,
 }
 
 bool ClauseProcessor::processHint(mlir::IntegerAttr &result) const {
-  if (auto *hintClause = findUniqueClause<ClauseTy::Hint>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Hint>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    const auto *expr = Fortran::semantics::GetExpr(hintClause->v);
-    int64_t hintValue = *Fortran::evaluate::ToInt64(*expr);
+    int64_t hintValue = *Fortran::evaluate::ToInt64(clause->v);
     result = firOpBuilder.getI64IntegerAttr(hintValue);
     return true;
   }
@@ -385,20 +358,19 @@ bool ClauseProcessor::processHint(mlir::IntegerAttr &result) const {
 }
 
 bool ClauseProcessor::processMergeable(mlir::UnitAttr &result) const {
-  return markClauseOccurrence<ClauseTy::Mergeable>(result);
+  return markClauseOccurrence<omp::clause::Mergeable>(result);
 }
 
 bool ClauseProcessor::processNowait(mlir::UnitAttr &result) const {
-  return markClauseOccurrence<ClauseTy::Nowait>(result);
+  return markClauseOccurrence<omp::clause::Nowait>(result);
 }
 
 bool ClauseProcessor::processNumTeams(Fortran::lower::StatementContext &stmtCtx,
                                       mlir::Value &result) const {
   // TODO Get lower and upper bounds for num_teams when parser is updated to
   // accept both.
-  if (auto *numTeamsClause = findUniqueClause<ClauseTy::NumTeams>()) {
-    result = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(numTeamsClause->v), stmtCtx));
+  if (auto *clause = findUniqueClause<omp::clause::NumTeams>()) {
+    result = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
   return false;
@@ -406,23 +378,20 @@ bool ClauseProcessor::processNumTeams(Fortran::lower::StatementContext &stmtCtx,
 
 bool ClauseProcessor::processNumThreads(
     Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const {
-  if (auto *numThreadsClause = findUniqueClause<ClauseTy::NumThreads>()) {
+  if (auto *clause = findUniqueClause<omp::clause::NumThreads>()) {
     // OMPIRBuilder expects `NUM_THREADS` clause as a `Value`.
-    result = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(numThreadsClause->v), stmtCtx));
+    result = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
   return false;
 }
 
 bool ClauseProcessor::processOrdered(mlir::IntegerAttr &result) const {
-  if (auto *orderedClause = findUniqueClause<ClauseTy::Ordered>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Ordered>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
     int64_t orderedClauseValue = 0l;
-    if (orderedClause->v.has_value()) {
-      const auto *expr = Fortran::semantics::GetExpr(orderedClause->v);
-      orderedClauseValue = *Fortran::evaluate::ToInt64(*expr);
-    }
+    if (clause->v.has_value())
+      orderedClauseValue = *Fortran::evaluate::ToInt64(*clause->v);
     result = firOpBuilder.getI64IntegerAttr(orderedClauseValue);
     return true;
   }
@@ -431,9 +400,8 @@ bool ClauseProcessor::processOrdered(mlir::IntegerAttr &result) const {
 
 bool ClauseProcessor::processPriority(Fortran::lower::StatementContext &stmtCtx,
                                       mlir::Value &result) const {
-  if (auto *priorityClause = findUniqueClause<ClauseTy::Priority>()) {
-    result = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(priorityClause->v), stmtCtx));
+  if (auto *clause = findUniqueClause<omp::clause::Priority>()) {
+    result = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
   return false;
@@ -441,20 +409,19 @@ bool ClauseProcessor::processPriority(Fortran::lower::StatementContext &stmtCtx,
 
 bool ClauseProcessor::processProcBind(
     mlir::omp::ClauseProcBindKindAttr &result) const {
-  if (auto *procBindClause = findUniqueClause<ClauseTy::ProcBind>()) {
+  if (auto *clause = findUniqueClause<omp::clause::ProcBind>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    result = genProcBindKindAttr(firOpBuilder, procBindClause);
+    result = genProcBindKindAttr(firOpBuilder, *clause);
     return true;
   }
   return false;
 }
 
 bool ClauseProcessor::processSafelen(mlir::IntegerAttr &result) const {
-  if (auto *safelenClause = findUniqueClause<ClauseTy::Safelen>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Safelen>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    const auto *expr = Fortran::semantics::GetExpr(safelenClause->v);
     const std::optional<std::int64_t> safelenVal =
-        Fortran::evaluate::ToInt64(*expr);
+        Fortran::evaluate::ToInt64(clause->v);
     result = firOpBuilder.getI64IntegerAttr(*safelenVal);
     return true;
   }
@@ -465,41 +432,38 @@ bool ClauseProcessor::processSchedule(
     mlir::omp::ClauseScheduleKindAttr &valAttr,
     mlir::omp::ScheduleModifierAttr &modifierAttr,
     mlir::UnitAttr &simdModifierAttr) const {
-  if (auto *scheduleClause = findUniqueClause<ClauseTy::Schedule>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Schedule>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
     mlir::MLIRContext *context = firOpBuilder.getContext();
-    const Fortran::parser::OmpScheduleClause &scheduleType = scheduleClause->v;
-    const auto &scheduleClauseKind =
-        std::get<Fortran::parser::OmpScheduleClause::ScheduleType>(
-            scheduleType.t);
+    const auto &scheduleType =
+        std::get<omp::clause::Schedule::ScheduleType>(clause->t);
 
     mlir::omp::ClauseScheduleKind scheduleKind;
-    switch (scheduleClauseKind) {
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Static:
+    switch (scheduleType) {
+    case omp::clause::Schedule::ScheduleType::Static:
       scheduleKind = mlir::omp::ClauseScheduleKind::Static;
       break;
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Dynamic:
+    case omp::clause::Schedule::ScheduleType::Dynamic:
       scheduleKind = mlir::omp::ClauseScheduleKind::Dynamic;
       break;
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Guided:
+    case omp::clause::Schedule::ScheduleType::Guided:
       scheduleKind = mlir::omp::ClauseScheduleKind::Guided;
       break;
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Auto:
+    case omp::clause::Schedule::ScheduleType::Auto:
       scheduleKind = mlir::omp::ClauseScheduleKind::Auto;
       break;
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Runtime:
+    case omp::clause::Schedule::ScheduleType::Runtime:
       scheduleKind = mlir::omp::ClauseScheduleKind::Runtime;
       break;
     }
 
-    mlir::omp::ScheduleModifier scheduleModifier =
-        getScheduleModifier(scheduleClause->v);
+    mlir::omp::ScheduleModifier scheduleModifier = getScheduleModifier(*clause);
 
     if (scheduleModifier != mlir::omp::ScheduleModifier::none)
       modifierAttr =
           mlir::omp::ScheduleModifierAttr::get(context, scheduleModifier);
 
-    if (getSimdModifier(scheduleClause->v) != mlir::omp::ScheduleModifier::none)
+    if (getSimdModifier(*clause) != mlir::omp::ScheduleModifier::none)
       simdModifierAttr = firOpBuilder.getUnitAttr();
 
     valAttr = mlir::omp::ClauseScheduleKindAttr::get(context, scheduleKind);
@@ -510,25 +474,19 @@ bool ClauseProcessor::processSchedule(
 
 bool ClauseProcessor::processScheduleChunk(
     Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const {
-  if (auto *scheduleClause = findUniqueClause<ClauseTy::Schedule>()) {
-    if (const auto &chunkExpr =
-            std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
-                scheduleClause->v.t)) {
-      if (const auto *expr = Fortran::semantics::GetExpr(*chunkExpr)) {
-        result = fir::getBase(converter.genExprValue(*expr, stmtCtx));
-      }
-    }
+  if (auto *clause = findUniqueClause<omp::clause::Schedule>()) {
+    if (const auto &chunkExpr = std::get<omp::MaybeExpr>(clause->t))
+      result = fir::getBase(converter.genExprValue(*chunkExpr, stmtCtx));
     return true;
   }
   return false;
 }
 
 bool ClauseProcessor::processSimdlen(mlir::IntegerAttr &result) const {
-  if (auto *simdlenClause = findUniqueClause<ClauseTy::Simdlen>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Simdlen>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    const auto *expr = Fortran::semantics::GetExpr(simdlenClause->v);
     const std::optional<std::int64_t> simdlenVal =
-        Fortran::evaluate::ToInt64(*expr);
+        Fortran::evaluate::ToInt64(clause->v);
     result = firOpBuilder.getI64IntegerAttr(*simdlenVal);
     return true;
   }
@@ -537,16 +495,15 @@ bool ClauseProcessor::processSimdlen(mlir::IntegerAttr &result) const {
 
 bool ClauseProcessor::processThreadLimit(
     Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const {
-  if (auto *threadLmtClause = findUniqueClause<ClauseTy::ThreadLimit>()) {
-    result = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(threadLmtClause->v), stmtCtx));
+  if (auto *clause = findUniqueClause<omp::clause::ThreadLimit>()) {
+    result = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
   return false;
 }
 
 bool ClauseProcessor::processUntied(mlir::UnitAttr &result) const {
-  return markClauseOccurrence<ClauseTy::Untied>(result);
+  return markClauseOccurrence<omp::clause::Untied>(result);
 }
 
 //===----------------------------------------------------------------------===//
@@ -556,10 +513,10 @@ bool ClauseProcessor::processUntied(mlir::UnitAttr &result) const {
 bool ClauseProcessor::processAllocate(
     llvm::SmallVectorImpl<mlir::Value> &allocatorOperands,
     llvm::SmallVectorImpl<mlir::Value> &allocateOperands) const {
-  return findRepeatableClause<ClauseTy::Allocate>(
-      [&](const ClauseTy::Allocate *allocateClause,
+  return findRepeatableClause<omp::clause::Allocate>(
+      [&](const omp::clause::Allocate &clause,
           const Fortran::parser::CharBlock &) {
-        genAllocateClause(converter, allocateClause->v, allocatorOperands,
+        genAllocateClause(converter, clause, allocatorOperands,
                           allocateOperands);
       });
 }
@@ -576,12 +533,12 @@ bool ClauseProcessor::processCopyin() const {
         if (converter.isPresentShallowLookup(*sym))
           converter.copyHostAssociateVar(*sym, copyAssignIP);
       };
-  bool hasCopyin = findRepeatableClause<ClauseTy::Copyin>(
-      [&](const ClauseTy::Copyin *copyinClause,
+  bool hasCopyin = findRepeatableClause<omp::clause::Copyin>(
+      [&](const omp::clause::Copyin &clause,
           const Fortran::parser::CharBlock &) {
-        const Fortran::parser::OmpObjectList &ompObjectList = copyinClause->v;
-        for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) {
-          Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject);
+        for (const omp::Object &object : clause.v) {
+          Fortran::semantics::Symbol *sym = object.id();
+          assert(sym && "Expecting symbol");
           if (const auto *commonDetails =
                   sym->detailsIf<Fortran::semantics::CommonBlockDetails>()) {
             for (const auto &mem : commonDetails->objects())
@@ -620,7 +577,7 @@ class TypeInfo {
   }
 
   // Returns the shape of array types.
-  const llvm::SmallVector<int64_t> &getShape() const { return shape; }
+  llvm::ArrayRef<int64_t> getShape() const { return shape; }
 
   // Is the type inside a box?
   bool isBox() const { return inBox; }
@@ -745,13 +702,11 @@ bool ClauseProcessor::processCopyPrivate(
     copyPrivateFuncs.push_back(mlir::SymbolRefAttr::get(funcOp));
   };
 
-  bool hasCopyPrivate = findRepeatableClause<ClauseTy::Copyprivate>(
-      [&](const ClauseTy::Copyprivate *copyPrivateClause,
+  bool hasCopyPrivate = findRepeatableClause<clause::Copyprivate>(
+      [&](const clause::Copyprivate &clause,
           const Fortran::parser::CharBlock &) {
-        const Fortran::parser::OmpObjectList &ompObjectList =
-            copyPrivateClause->v;
-        for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) {
-          Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject);
+        for (const Object &object : clause.v) {
+          Fortran::semantics::Symbol *sym = object.id();
           if (const auto *commonDetails =
                   sym->detailsIf<Fortran::semantics::CommonBlockDetails>()) {
             for (const auto &mem : commonDetails->objects())
@@ -770,38 +725,30 @@ bool ClauseProcessor::processDepend(
     llvm::SmallVectorImpl<mlir::Value> &dependOperands) const {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  return findRepeatableClause<ClauseTy::Depend>(
-      [&](const ClauseTy::Depend *dependClause,
+  return findRepeatableClause<omp::clause::Depend>(
+      [&](const omp::clause::Depend &clause,
           const Fortran::parser::CharBlock &) {
-        const std::list<Fortran::parser::Designator> &depVal =
-            std::get<std::list<Fortran::parser::Designator>>(
-                std::get<Fortran::parser::OmpDependClause::InOut>(
-                    dependClause->v.u)
-                    .t);
+        assert(std::holds_alternative<omp::clause::Depend::InOut>(clause.u) &&
+               "Only InOut is handled at the moment");
+        const auto &inOut = std::get<omp::clause::Depend::InOut>(clause.u);
+        const auto &objects = std::get<omp::ObjectList>(inOut.t);
+
         mlir::omp::ClauseTaskDependAttr dependTypeOperand =
-            genDependKindAttr(firOpBuilder, dependClause);
-        dependTypeOperands.insert(dependTypeOperands.end(), depVal.size(),
-                                  dependTypeOperand);
-        for (const Fortran::parser::Designator &ompObject : depVal) {
-          Fortran::semantics::Symbol *sym = nullptr;
-          std::visit(
-              Fortran::common::visitors{
-                  [&](const Fortran::parser::DataRef &designator) {
-                    if (const Fortran::parser::Name *name =
-                            std::get_if<Fortran::parser::Name>(&designator.u)) {
-                      sym = name->symbol;
-                    } else if (std::get_if<Fortran::common::Indirection<
-                                   Fortran::parser::ArrayElement>>(
-                                   &designator.u)) {
-                      TODO(converter.getCurrentLocation(),
-                           "array sections not supported for task depend");
-                    }
-                  },
-                  [&](const Fortran::parser::Substring &designator) {
-                    TODO(converter.getCurrentLocation(),
-                         "substring not supported for task depend");
-                  }},
-              (ompObject).u);
+            genDependKindAttr(firOpBuilder, clause);
+        dependTypeOperands.append(objects.size(), dependTypeOperand);
+
+        for (const omp::Object &object : objects) {
+          assert(object.ref() && "Expecting designator");
+
+          if (Fortran::evaluate::ExtractSubstring(*object.ref())) {
+            TODO(converter.getCurrentLocation(),
+                 "substring not supported for task depend");
+          } else if (Fortran::evaluate::IsArrayElement(*object.ref())) {
+            TODO(converter.getCurrentLocation(),
+                 "array sections not supported for task depend");
+          }
+
+          Fortran::semantics::Symbol *sym = object.id();
           const mlir::Value variable = converter.getSymbolAddress(*sym);
           dependOperands.push_back(variable);
         }
@@ -809,14 +756,14 @@ bool ClauseProcessor::processDepend(
 }
 
 bool ClauseProcessor::processIf(
-    Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName,
+    omp::clause::If::DirectiveNameModifier directiveName,
     mlir::Value &result) const {
   bool found = false;
-  findRepeatableClause<ClauseTy::If>(
-      [&](const ClauseTy::If *ifClause,
+  findRepeatableClause<omp::clause::If>(
+      [&](const omp::clause::If &clause,
           const Fortran::parser::CharBlock &source) {
         mlir::Location clauseLocation = converter.genLocation(source);
-        mlir::Value operand = getIfClauseOperand(converter, ifClause,
+        mlir::Value operand = getIfClauseOperand(converter, clause,
                                                  directiveName, clauseLocation);
         // Assume that, at most, a single 'if' clause will be applicable to the
         // given directive.
@@ -830,20 +777,19 @@ bool ClauseProcessor::processIf(
 
 bool ClauseProcessor::processLink(
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const {
-  return findRepeatableClause<ClauseTy::Link>(
-      [&](const ClauseTy::Link *linkClause,
-          const Fortran::parser::CharBlock &) {
+  return findRepeatableClause<omp::clause::Link>(
+      [&](const omp::clause::Link &clause, const Fortran::parser::CharBlock &) {
         // Case: declare target link(var1, var2)...
         gatherFuncAndVarSyms(
-            linkClause->v, mlir::omp::DeclareTargetCaptureClause::link, result);
+            clause.v, mlir::omp::DeclareTargetCaptureClause::link, result);
       });
 }
 
 mlir::omp::MapInfoOp
 createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
                 mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name,
-                mlir::SmallVector<mlir::Value> bounds,
-                mlir::SmallVector<mlir::Value> members, uint64_t mapType,
+                llvm::ArrayRef<mlir::Value> bounds,
+                llvm::ArrayRef<mlir::Value> members, uint64_t mapType,
                 mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
                 bool isVal) {
   if (auto boxTy = baseAddr.getType().dyn_cast<fir::BaseBoxType>()) {
@@ -872,7 +818,7 @@ bool ClauseProcessor::processMap(
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *mapSymbols)
     const {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  return findRepeatableClause<ClauseTy::Map>(
+  return findRepeatableClause2<ClauseTy::Map>(
       [&](const ClauseTy::Map *mapClause,
           const Fortran::parser::CharBlock &source) {
         mlir::Location clauseLocation = converter.genLocation(source);
@@ -964,43 +910,41 @@ bool ClauseProcessor::processReduction(
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *reductionSymbols)
     const {
-  return findRepeatableClause<ClauseTy::Reduction>(
-      [&](const ClauseTy::Reduction *reductionClause,
+  return findRepeatableClause<omp::clause::Reduction>(
+      [&](const omp::clause::Reduction &clause,
           const Fortran::parser::CharBlock &) {
         ReductionProcessor rp;
-        rp.addReductionDecl(currentLocation, converter, reductionClause->v,
-                            reductionVars, reductionDeclSymbols,
-                            reductionSymbols);
+        rp.addReductionDecl(currentLocation, converter, clause, reductionVars,
+                            reductionDeclSymbols, reductionSymbols);
       });
 }
 
 bool ClauseProcessor::processSectionsReduction(
     mlir::Location currentLocation) const {
-  return findRepeatableClause<ClauseTy::Reduction>(
-      [&](const ClauseTy::Reduction *, const Fortran::parser::CharBlock &) {
+  return findRepeatableClause<omp::clause::Reduction>(
+      [&](const omp::clause::Reduction &, const Fortran::parser::CharBlock &) {
         TODO(currentLocation, "OMPC_Reduction");
       });
 }
 
 bool ClauseProcessor::processTo(
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const {
-  return findRepeatableClause<ClauseTy::To>(
-      [&](const ClauseTy::To *toClause, const Fortran::parser::CharBlock &) {
+  return findRepeatableClause<omp::clause::To>(
+      [&](const omp::clause::To &clause, const Fortran::parser::CharBlock &) {
         // Case: declare target to(func, var1, var2)...
-        gatherFuncAndVarSyms(toClause->v,
+        gatherFuncAndVarSyms(clause.v,
                              mlir::omp::DeclareTargetCaptureClause::to, result);
       });
 }
 
 bool ClauseProcessor::processEnter(
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const {
-  return findRepeatableClause<ClauseTy::Enter>(
-      [&](const ClauseTy::Enter *enterClause,
+  return findRepeatableClause<omp::clause::Enter>(
+      [&](const omp::clause::Enter &clause,
           const Fortran::parser::CharBlock &) {
         // Case: declare target enter(func, var1, var2)...
-        gatherFuncAndVarSyms(enterClause->v,
-                             mlir::omp::DeclareTargetCaptureClause::enter,
-                             result);
+        gatherFuncAndVarSyms(
+            clause.v, mlir::omp::DeclareTargetCaptureClause::enter, result);
       });
 }
 
@@ -1010,11 +954,11 @@ bool ClauseProcessor::processUseDeviceAddr(
     llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &useDeviceSymbols)
     const {
-  return findRepeatableClause<ClauseTy::UseDeviceAddr>(
-      [&](const ClauseTy::UseDeviceAddr *devAddrClause,
+  return findRepeatableClause<omp::clause::UseDeviceAddr>(
+      [&](const omp::clause::UseDeviceAddr &clause,
           const Fortran::parser::CharBlock &) {
-        addUseDeviceClause(converter, devAddrClause->v, operands,
-                           useDeviceTypes, useDeviceLocs, useDeviceSymbols);
+        addUseDeviceClause(converter, clause.v, operands, useDeviceTypes,
+                           useDeviceLocs, useDeviceSymbols);
       });
 }
 
@@ -1024,10 +968,10 @@ bool ClauseProcessor::processUseDevicePtr(
     llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &useDeviceSymbols)
     const {
-  return findRepeatableClause<ClauseTy::UseDevicePtr>(
-      [&](const ClauseTy::UseDevicePtr *devPtrClause,
+  return findRepeatableClause<omp::clause::UseDevicePtr>(
+      [&](const omp::clause::UseDevicePtr &clause,
           const Fortran::parser::CharBlock &) {
-        addUseDeviceClause(converter, devPtrClause->v, operands, useDeviceTypes,
+        addUseDeviceClause(converter, clause.v, operands, useDeviceTypes,
                            useDeviceLocs, useDeviceSymbols);
       });
 }
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 11aff0be25053..3f6adcce8ae87 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -12,6 +12,7 @@
 #ifndef FORTRAN_LOWER_CLAUASEPROCESSOR_H
 #define FORTRAN_LOWER_CLAUASEPROCESSOR_H
 
+#include "Clauses.h"
 #include "DirectivesCommon.h"
 #include "ReductionProcessor.h"
 #include "Utils.h"
@@ -51,7 +52,8 @@ class ClauseProcessor {
   ClauseProcessor(Fortran::lower::AbstractConverter &converter,
                   Fortran::semantics::SemanticsContext &semaCtx,
                   const Fortran::parser::OmpClauseList &clauses)
-      : converter(converter), semaCtx(semaCtx), clauses(clauses) {}
+      : converter(converter), semaCtx(semaCtx), clauses2(clauses),
+        clauses(makeList(clauses, semaCtx)) {}
 
   // 'Unique' clauses: They can appear at most once in the clause list.
   bool
@@ -103,9 +105,8 @@ class ClauseProcessor {
                      llvm::SmallVectorImpl<mlir::Value> &dependOperands) const;
   bool
   processEnter(llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const;
-  bool
-  processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName,
-            mlir::Value &result) const;
+  bool processIf(omp::clause::If::DirectiveNameModifier directiveName,
+                 mlir::Value &result) const;
   bool
   processLink(llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const;
 
@@ -155,11 +156,15 @@ class ClauseProcessor {
                    llvm::omp::Directive directive) const;
 
 private:
-  using ClauseIterator = std::list<ClauseTy>::const_iterator;
+  using ClauseIterator = List<Clause>::const_iterator;
+  using ClauseIterator2 = std::list<ClauseTy>::const_iterator;
 
   /// Utility to find a clause within a range in the clause list.
   template <typename T>
   static ClauseIterator findClause(ClauseIterator begin, ClauseIterator end);
+  template <typename T>
+  static ClauseIterator2 findClause2(ClauseIterator2 begin,
+                                     ClauseIterator2 end);
 
   /// Return the first instance of the given clause found in the clause list or
   /// `nullptr` if not present. If more than one instance is expected, use
@@ -172,6 +177,10 @@ class ClauseProcessor {
   /// if at least one instance was found.
   template <typename T>
   bool findRepeatableClause(
+      std::function<void(const T &, const Fortran::parser::CharBlock &source)>
+          callbackFn) const;
+  template <typename T>
+  bool findRepeatableClause2(
       std::function<void(const T *, const Fortran::parser::CharBlock &source)>
           callbackFn) const;
 
@@ -181,14 +190,15 @@ class ClauseProcessor {
 
   Fortran::lower::AbstractConverter &converter;
   Fortran::semantics::SemanticsContext &semaCtx;
-  const Fortran::parser::OmpClauseList &clauses;
+  const Fortran::parser::OmpClauseList &clauses2;
+  List<Clause> clauses;
 };
 
 template <typename T>
 bool ClauseProcessor::processMotionClauses(
     Fortran::lower::StatementContext &stmtCtx,
     llvm::SmallVectorImpl<mlir::Value> &mapOperands) {
-  return findRepeatableClause<T>(
+  return findRepeatableClause2<T>(
       [&](const T *motionClause, const Fortran::parser::CharBlock &source) {
         mlir::Location clauseLocation = converter.genLocation(source);
         fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -248,7 +258,7 @@ void ClauseProcessor::processTODO(mlir::Location currentLocation,
              " construct");
   };
 
-  for (ClauseIterator it = clauses.v.begin(); it != clauses.v.end(); ++it)
+  for (ClauseIterator2 it = clauses2.v.begin(); it != clauses2.v.end(); ++it)
     (checkUnhandledClause(std::get_if<Ts>(&it->u)), ...);
 }
 
@@ -263,11 +273,22 @@ ClauseProcessor::findClause(ClauseIterator begin, ClauseIterator end) {
   return end;
 }
 
+template <typename T>
+ClauseProcessor::ClauseIterator2
+ClauseProcessor::findClause2(ClauseIterator2 begin, ClauseIterator2 end) {
+  for (ClauseIterator2 it = begin; it != end; ++it) {
+    if (std::get_if<T>(&it->u))
+      return it;
+  }
+
+  return end;
+}
+
 template <typename T>
 const T *ClauseProcessor::findUniqueClause(
     const Fortran::parser::CharBlock **source) const {
-  ClauseIterator it = findClause<T>(clauses.v.begin(), clauses.v.end());
-  if (it != clauses.v.end()) {
+  ClauseIterator it = findClause<T>(clauses.begin(), clauses.end());
+  if (it != clauses.end()) {
     if (source)
       *source = &it->source;
     return &std::get<T>(it->u);
@@ -277,13 +298,31 @@ const T *ClauseProcessor::findUniqueClause(
 
 template <typename T>
 bool ClauseProcessor::findRepeatableClause(
-    std::function<void(const T *, const Fortran::parser::CharBlock &source)>
+    std::function<void(const T &, const Fortran::parser::CharBlock &source)>
         callbackFn) const {
   bool found = false;
-  ClauseIterator nextIt, endIt = clauses.v.end();
-  for (ClauseIterator it = clauses.v.begin(); it != endIt; it = nextIt) {
+  ClauseIterator nextIt, endIt = clauses.end();
+  for (ClauseIterator it = clauses.begin(); it != endIt; it = nextIt) {
     nextIt = findClause<T>(it, endIt);
 
+    if (nextIt != endIt) {
+      callbackFn(std::get<T>(nextIt->u), nextIt->source);
+      found = true;
+      ++nextIt;
+    }
+  }
+  return found;
+}
+
+template <typename T>
+bool ClauseProcessor::findRepeatableClause2(
+    std::function<void(const T *, const Fortran::parser::CharBlock &source)>
+        callbackFn) const {
+  bool found = false;
+  ClauseIterator2 nextIt, endIt = clauses2.v.end();
+  for (ClauseIterator2 it = clauses2.v.begin(); it != endIt; it = nextIt) {
+    nextIt = findClause2<T>(it, endIt);
+
     if (nextIt != endIt) {
       callbackFn(&std::get<T>(nextIt->u), nextIt->source);
       found = true;
diff --git a/flang/lib/Lower/OpenMP/ClauseT.h b/flang/lib/Lower/OpenMP/ClauseT.h
new file mode 100644
index 0000000000000..2aae29af29214
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/ClauseT.h
@@ -0,0 +1,714 @@
+//===- ClauseT -- clause template definitions -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_LOWER_OPENMP_CLAUSET_H
+#define FORTRAN_LOWER_OPENMP_CLAUSET_H
+
+#include "flang/Parser/parse-tree.h" // For enum reuse
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <iterator>
+#include <optional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <variant>
+
+#include "llvm/Frontend/OpenMP/OMP.h.inc"
+
+namespace tomp {
+
+template <typename T>
+using ListT = llvm::SmallVector<T, 0>;
+
+// A specialization of ObjectT<Id, Expr> must provide the following definitions:
+// {
+//    using IdType = Id;
+//    using ExprType = Expr;
+//
+//    auto id() const -> Id {
+//      return the identifier of the object (for use in tests for
+//         presence/absence of the object)
+//    }
+//
+//    auto ref() const -> const Expr& {
+//      return the expression accessing (referencing) the object
+//    }
+// }
+//
+// For example, the ObjectT instance created for "var[x+1]" would have
+// the `id()` return the identifier for `var`, and the `ref()` return the
+// representation of the array-access `var[x+1]`.
+template <typename Id, typename Expr>
+struct ObjectT;
+
+template <typename I, typename E>
+using ObjectListT = ListT<ObjectT<I, E>>;
+
+namespace clause {
+// Helper objects
+
+template <typename I, typename E>
+struct DefinedOperatorT {
+  struct DefinedOpName {
+    using WrapperTrait = std::true_type;
+    ObjectT<I, E> v;
+  };
+  using IntrinsicOperator = Fortran::parser::DefinedOperator::IntrinsicOperator;
+  using UnionTrait = std::true_type;
+  std::variant<DefinedOpName, IntrinsicOperator> u;
+};
+
+template <typename I, typename E>
+struct ProcedureDesignatorT {
+  using WrapperTrait = std::true_type;
+  ObjectT<I, E> v;
+};
+
+template <typename I, typename E>
+struct ReductionOperatorT {
+  using UnionTrait = std::true_type;
+  std::variant<DefinedOperatorT<I, E>, ProcedureDesignatorT<I, E>> u;
+};
+
+template <typename I, typename E>
+struct AcqRelT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AcquireT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AdjustArgsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AffinityT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AlignT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AppendArgsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AtT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct BindT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct CancellationConstructTypeT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct CaptureT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct CompareT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DepobjT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DestroyT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DetachT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DoacrossT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DynamicAllocatorsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ExclusiveT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct FailT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct FlushT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct FullT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct InbranchT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct InclusiveT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct IndirectT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct InitT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct MatchT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct MemoryOrderT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct MergeableT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct MessageT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct NogroupT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct NotinbranchT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct NowaitT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct OmpxAttributeT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct OmpxBareT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ReadT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct RelaxedT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ReleaseT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ReverseOffloadT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct SeqCstT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct SeverityT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct SimdT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ThreadprivateT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ThreadsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UnifiedAddressT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UnifiedSharedMemoryT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UnknownT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UntiedT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UpdateT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UseT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UsesAllocatorsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct WeakT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct WhenT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct WriteT {
+  using EmptyTrait = std::true_type;
+};
+
+template <typename I, typename E>
+struct AlignedT {
+  using TupleTrait = std::true_type;
+  std::tuple<ObjectListT<I, E>, std::optional<E>> t;
+};
+
+template <typename I, typename E>
+struct AllocateT {
+  struct Modifier {
+    struct Allocator {
+      using WrapperTrait = std::true_type;
+      E v;
+    };
+    struct Align {
+      using WrapperTrait = std::true_type;
+      E v;
+    };
+    struct ComplexModifier {
+      using TupleTrait = std::true_type;
+      std::tuple<Allocator, Align> t;
+    };
+    using UnionTrait = std::true_type;
+    std::variant<Allocator, ComplexModifier, Align> u;
+  };
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<Modifier>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct AllocatorT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct AtomicDefaultMemOrderT {
+  using WrapperTrait = std::true_type;
+  using OmpAtomicDefaultMemOrderType =
+      Fortran::common::OmpAtomicDefaultMemOrderType;
+  OmpAtomicDefaultMemOrderType v;
+};
+
+template <typename I, typename E>
+struct CollapseT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct CopyinT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct CopyprivateT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct DefaultmapT {
+  using ImplicitBehavior =
+      Fortran::parser::OmpDefaultmapClause::ImplicitBehavior;
+  using VariableCategory =
+      Fortran::parser::OmpDefaultmapClause::VariableCategory;
+  using TupleTrait = std::true_type;
+  std::tuple<ImplicitBehavior, std::optional<VariableCategory>> t;
+};
+
+template <typename I, typename E>
+struct DefaultT {
+  using Type = Fortran::parser::OmpDefaultClause::Type;
+  using WrapperTrait = std::true_type;
+  Type v;
+};
+
+template <typename I, typename E>
+struct DependT {
+  struct Source {
+    using EmptyTrait = std::true_type;
+  };
+  struct Sink {
+    using Length = std::tuple<DefinedOperatorT<I, E>, E>;
+    using Vec = std::tuple<ObjectT<I, E>, std::optional<Length>>;
+    using WrapperTrait = std::true_type;
+    ListT<Vec> v;
+  };
+  using Type = Fortran::parser::OmpDependenceType::Type;
+  struct InOut {
+    using TupleTrait = std::true_type;
+    std::tuple<Type, ObjectListT<I, E>> t;
+  };
+  using UnionTrait = std::true_type;
+  std::variant<Source, Sink, InOut> u;
+};
+
+template <typename I, typename E>
+struct DeviceT {
+  using DeviceModifier = Fortran::parser::OmpDeviceClause::DeviceModifier;
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<DeviceModifier>, E> t;
+};
+
+template <typename I, typename E>
+struct DeviceTypeT {
+  using Type = Fortran::parser::OmpDeviceTypeClause::Type;
+  using WrapperTrait = std::true_type;
+  Type v;
+};
+
+template <typename I, typename E>
+struct DistScheduleT {
+  using WrapperTrait = std::true_type;
+  std::optional<E> v;
+};
+
+template <typename I, typename E>
+struct EnterT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct FilterT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct FinalT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct FirstprivateT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct FromT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct GrainsizeT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct HasDeviceAddrT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct HintT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct IfT {
+  using DirectiveNameModifier =
+      Fortran::parser::OmpIfClause::DirectiveNameModifier;
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<DirectiveNameModifier>, E> t;
+};
+
+template <typename I, typename E>
+struct InReductionT {
+  using TupleTrait = std::true_type;
+  std::tuple<ReductionOperatorT<I, E>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct IsDevicePtrT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct LastprivateT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct LinearT {
+  struct Modifier {
+    using Type = Fortran::parser::OmpLinearModifier::Type;
+    using WrapperTrait = std::true_type;
+    Type v;
+  };
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<Modifier>, ObjectListT<I, E>, std::optional<E>> t;
+};
+
+template <typename I, typename E>
+struct LinkT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct MapT {
+  struct MapType {
+    struct Always {
+      using EmptyTrait = std::true_type;
+    };
+    using Type = Fortran::parser::OmpMapType::Type;
+    using TupleTrait = std::true_type;
+    std::tuple<std::optional<Always>, Type> t;
+  };
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<MapType>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct NocontextT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct NontemporalT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct NovariantsT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct NumTasksT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct NumTeamsT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct NumThreadsT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct OmpxDynCgroupMemT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct OrderedT {
+  using WrapperTrait = std::true_type;
+  std::optional<E> v;
+};
+
+template <typename I, typename E>
+struct OrderT {
+  using Kind = Fortran::parser::OmpOrderModifier::Kind;
+  using Type = Fortran::parser::OmpOrderClause::Type;
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<Kind>, Type> t;
+};
+
+template <typename I, typename E>
+struct PartialT {
+  using WrapperTrait = std::true_type;
+  std::optional<E> v;
+};
+
+template <typename I, typename E>
+struct PriorityT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct PrivateT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct ProcBindT {
+  using Type = Fortran::parser::OmpProcBindClause::Type;
+  using WrapperTrait = std::true_type;
+  Type v;
+};
+
+template <typename I, typename E>
+struct ReductionT {
+  using TupleTrait = std::true_type;
+  std::tuple<ReductionOperatorT<I, E>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct SafelenT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct ScheduleT {
+  using ModType = Fortran::parser::OmpScheduleModifierType::ModType;
+  struct ScheduleModifier {
+    using TupleTrait = std::true_type;
+    std::tuple<ModType, std::optional<ModType>> t;
+  };
+  using ScheduleType = Fortran::parser::OmpScheduleClause::ScheduleType;
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<ScheduleModifier>, ScheduleType, std::optional<E>> t;
+};
+
+template <typename I, typename E>
+struct SharedT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct SimdlenT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct SizesT {
+  using WrapperTrait = std::true_type;
+  ListT<E> v;
+};
+
+template <typename I, typename E>
+struct TaskReductionT {
+  using TupleTrait = std::true_type;
+  std::tuple<ReductionOperatorT<I, E>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct ThreadLimitT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct ToT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct UniformT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct UseDeviceAddrT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct UseDevicePtrT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+using UnionOfAllClausesT = std::variant<
+    AcqRelT<I, E>, AcquireT<I, E>, AdjustArgsT<I, E>, AffinityT<I, E>,
+    AlignT<I, E>, AlignedT<I, E>, AllocateT<I, E>, AllocatorT<I, E>,
+    AppendArgsT<I, E>, AtT<I, E>, AtomicDefaultMemOrderT<I, E>, BindT<I, E>,
+    CancellationConstructTypeT<I, E>, CaptureT<I, E>, CollapseT<I, E>,
+    CompareT<I, E>, CopyprivateT<I, E>, CopyinT<I, E>, DefaultT<I, E>,
+    DefaultmapT<I, E>, DependT<I, E>, DepobjT<I, E>, DestroyT<I, E>,
+    DetachT<I, E>, DeviceT<I, E>, DeviceTypeT<I, E>, DistScheduleT<I, E>,
+    DoacrossT<I, E>, DynamicAllocatorsT<I, E>, EnterT<I, E>, ExclusiveT<I, E>,
+    FailT<I, E>, FilterT<I, E>, FinalT<I, E>, FirstprivateT<I, E>, FlushT<I, E>,
+    FromT<I, E>, FullT<I, E>, GrainsizeT<I, E>, HasDeviceAddrT<I, E>,
+    HintT<I, E>, IfT<I, E>, InReductionT<I, E>, InbranchT<I, E>,
+    InclusiveT<I, E>, IndirectT<I, E>, InitT<I, E>, IsDevicePtrT<I, E>,
+    LastprivateT<I, E>, LinearT<I, E>, LinkT<I, E>, MapT<I, E>, MatchT<I, E>,
+    MemoryOrderT<I, E>, MergeableT<I, E>, MessageT<I, E>, NogroupT<I, E>,
+    NowaitT<I, E>, NocontextT<I, E>, NontemporalT<I, E>, NotinbranchT<I, E>,
+    NovariantsT<I, E>, NumTasksT<I, E>, NumTeamsT<I, E>, NumThreadsT<I, E>,
+    OmpxAttributeT<I, E>, OmpxDynCgroupMemT<I, E>, OmpxBareT<I, E>,
+    OrderT<I, E>, OrderedT<I, E>, PartialT<I, E>, PriorityT<I, E>,
+    PrivateT<I, E>, ProcBindT<I, E>, ReadT<I, E>, ReductionT<I, E>,
+    RelaxedT<I, E>, ReleaseT<I, E>, ReverseOffloadT<I, E>, SafelenT<I, E>,
+    ScheduleT<I, E>, SeqCstT<I, E>, SeverityT<I, E>, SharedT<I, E>, SimdT<I, E>,
+    SimdlenT<I, E>, SizesT<I, E>, TaskReductionT<I, E>, ThreadLimitT<I, E>,
+    ThreadprivateT<I, E>, ThreadsT<I, E>, ToT<I, E>, UnifiedAddressT<I, E>,
+    UnifiedSharedMemoryT<I, E>, UniformT<I, E>, UnknownT<I, E>, UntiedT<I, E>,
+    UpdateT<I, E>, UseT<I, E>, UseDeviceAddrT<I, E>, UseDevicePtrT<I, E>,
+    UsesAllocatorsT<I, E>, WeakT<I, E>, WhenT<I, E>, WriteT<I, E>>;
+} // namespace clause
+
+template <typename Id, typename Expr>
+struct ClauseT {
+  llvm::omp::Clause id; // The numeric id of the clause
+  using UnionTrait = std::true_type;
+  clause::UnionOfAllClausesT<Id, Expr> u;
+};
+
+} // namespace tomp
+
+#endif // FORTRAN_LOWER_OPENMP_CLAUSET_H
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
new file mode 100644
index 0000000000000..70f232a4858e1
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -0,0 +1,728 @@
+//===-- Clauses.cpp -- OpenMP clause handling -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Clauses.h"
+
+#include "flang/Common/idioms.h"
+#include "flang/Evaluate/expression.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/expression.h"
+#include "flang/Semantics/symbol.h"
+
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+
+#include <list>
+#include <optional>
+#include <tuple>
+#include <utility>
+#include <variant>
+
+namespace detail {
+template <typename C>
+llvm::omp::Clause getClauseIdForClass(C &&) {
+  using namespace Fortran;
+  using A = llvm::remove_cvref_t<C>; // A is referenced in OMP.inc
+  // The code included below contains a sequence of checks like the following
+  // for each OpenMP clause
+  //   if constexpr (std::is_same_v<A, parser::OmpClause::AcqRel>)
+  //     return llvm::omp::Clause::OMPC_acq_rel;
+  //   [...]
+#define GEN_FLANG_CLAUSE_PARSER_KIND_MAP
+#include "llvm/Frontend/OpenMP/OMP.inc"
+}
+} // namespace detail
+
+static llvm::omp::Clause getClauseId(const Fortran::parser::OmpClause &clause) {
+  return std::visit([](auto &&s) { return detail::getClauseIdForClass(s); },
+                    clause.u);
+}
+
+namespace Fortran::lower::omp {
+using SymbolWithDesignator = std::tuple<semantics::Symbol *, MaybeExpr>;
+
+struct SymbolAndDesignatorExtractor {
+  template <typename T>
+  static T &&AsRvalueRef(T &&t) {
+    return std::move(t);
+  }
+  template <typename T>
+  static T AsRvalueRef(const T &t) {
+    return t;
+  }
+
+  static semantics::Symbol *symbol_addr(const evaluate::SymbolRef &ref) {
+    // Symbols cannot be created after semantic checks, so all symbol
+    // pointers that are non-null must point to one of those pre-existing
+    // objects. Throughout the code, symbols are often pointed to by
+    // non-const pointers, so there is no harm in casting the constness
+    // away.
+    return const_cast<semantics::Symbol *>(&ref.get());
+  }
+
+  template <typename T>
+  static SymbolWithDesignator visit(T &&) {
+    // Use this to see missing overloads:
+    // llvm::errs() << "NULL: " << __PRETTY_FUNCTION__ << '\n';
+    return SymbolWithDesignator{};
+  }
+
+  template <typename T>
+  static SymbolWithDesignator visit(const evaluate::Designator<T> &e) {
+    return std::make_tuple(symbol_addr(*e.GetLastSymbol()),
+                           evaluate::AsGenericExpr(AsRvalueRef(e)));
+  }
+
+  static SymbolWithDesignator visit(const evaluate::ProcedureDesignator &e) {
+    return std::make_tuple(symbol_addr(*e.GetSymbol()), std::nullopt);
+  }
+
+  template <typename T>
+  static SymbolWithDesignator visit(const evaluate::Expr<T> &e) {
+    return std::visit([](auto &&s) { return visit(s); }, e.u);
+  }
+
+  static void verify(const SymbolWithDesignator &sd) {
+    const semantics::Symbol *symbol = std::get<0>(sd);
+    assert(symbol && "Expecting symbol");
+    auto &maybeDsg = std::get<1>(sd);
+    if (!maybeDsg)
+      return; // Symbol with no designator -> OK
+    std::optional<evaluate::DataRef> maybeRef =
+        evaluate::ExtractDataRef(*maybeDsg);
+    if (maybeRef) {
+      if (&maybeRef->GetLastSymbol() == symbol)
+        return; // Symbol with a designator for it -> OK
+      llvm_unreachable("Expecting designator for given symbol");
+    } else {
+      // This could still be a Substring or ComplexPart, but at least Substring
+      // is not allowed in OpenMP.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+      maybeDsg->dump();
+#endif
+      llvm_unreachable("Expecting DataRef designator");
+    }
+  }
+};
+
+SymbolWithDesignator getSymbolAndDesignator(const MaybeExpr &expr) {
+  if (!expr)
+    return SymbolWithDesignator{};
+  return std::visit(
+      [](auto &&s) { return SymbolAndDesignatorExtractor::visit(s); }, expr->u);
+}
+
+Object makeObject(const parser::Name &name,
+                  semantics::SemanticsContext &semaCtx) {
+  assert(name.symbol && "Expecting Symbol");
+  return Object{name.symbol, std::nullopt};
+}
+
+Object makeObject(const parser::Designator &dsg,
+                  semantics::SemanticsContext &semaCtx) {
+  evaluate::ExpressionAnalyzer ea{semaCtx};
+  SymbolWithDesignator sd = getSymbolAndDesignator(ea.Analyze(dsg));
+  SymbolAndDesignatorExtractor::verify(sd);
+  return Object{std::get<0>(sd), std::move(std::get<1>(sd))};
+}
+
+Object makeObject(const parser::StructureComponent &comp,
+                  semantics::SemanticsContext &semaCtx) {
+  evaluate::ExpressionAnalyzer ea{semaCtx};
+  SymbolWithDesignator sd = getSymbolAndDesignator(ea.Analyze(comp));
+  SymbolAndDesignatorExtractor::verify(sd);
+  return Object{std::get<0>(sd), std::move(std::get<1>(sd))};
+}
+
+Object makeObject(const parser::OmpObject &object,
+                  semantics::SemanticsContext &semaCtx) {
+  // If object is a common block, expression analyzer won't be able to
+  // do anything.
+  if (const auto *name = std::get_if<parser::Name>(&object.u)) {
+    assert(name->symbol && "Expecting Symbol");
+    return Object{name->symbol, std::nullopt};
+  }
+  // OmpObject is std::variant<Designator, /*common block*/ Name>;
+  return makeObject(std::get<parser::Designator>(object.u), semaCtx);
+}
+
+std::optional<Object>
+getBaseObject(const Object &object,
+              Fortran::semantics::SemanticsContext &semaCtx) {
+  // If it's just the symbol, then there is no base.
+  if (!object.id())
+    return std::nullopt;
+
+  auto maybeRef = evaluate::ExtractDataRef(*object.ref());
+  if (!maybeRef)
+    return std::nullopt;
+
+  evaluate::DataRef ref = *maybeRef;
+
+  if (std::get_if<evaluate::SymbolRef>(&ref.u)) {
+    return std::nullopt;
+  } else if (auto *comp = std::get_if<evaluate::Component>(&ref.u)) {
+    const evaluate::DataRef &base = comp->base();
+    return Object{
+        SymbolAndDesignatorExtractor::symbol_addr(base.GetLastSymbol()),
+        evaluate::AsGenericExpr(
+            SymbolAndDesignatorExtractor::AsRvalueRef(base))};
+  } else if (auto *arr = std::get_if<evaluate::ArrayRef>(&ref.u)) {
+    const evaluate::NamedEntity &base = arr->base();
+    evaluate::ExpressionAnalyzer ea{semaCtx};
+    if (auto *comp = base.UnwrapComponent()) {
+      return Object{SymbolAndDesignatorExtractor::symbol_addr(comp->symbol()),
+                    ea.Designate(evaluate::DataRef{
+                        SymbolAndDesignatorExtractor::AsRvalueRef(*comp)})};
+    } else if (base.UnwrapSymbolRef()) {
+      return std::nullopt;
+    }
+  } else {
+    assert(std::holds_alternative<evaluate::CoarrayRef>(ref.u) &&
+           "Unexpected variant alternative");
+    llvm_unreachable("Coarray reference not supported at the moment");
+  }
+  return std::nullopt;
+}
+
+namespace clause {
+// Helper objects
+#ifdef EMPTY_CLASS
+#undef EMPTY_CLASS
+#endif
+#define EMPTY_CLASS(cls)                                                       \
+  cls make(const parser::OmpClause::cls &, semantics::SemanticsContext &) {    \
+    return cls{};                                                              \
+  }                                                                            \
+  [[maybe_unused]] extern int xyzzy_semicolon_absorber
+
+#ifdef WRAPPER_CLASS
+#undef WRAPPER_CLASS
+#endif
+#define WRAPPER_CLASS(cls, content)                                            \
+  [[maybe_unused]] extern int xyzzy_semicolon_absorber
+#define GEN_FLANG_CLAUSE_PARSER_CLASSES
+#include "llvm/Frontend/OpenMP/OMP.inc"
+#undef EMPTY_CLASS
+#undef WRAPPER_CLASS
+
+DefinedOperator makeDefinedOperator(const parser::DefinedOperator &inp,
+                                    semantics::SemanticsContext &semaCtx) {
+  return std::visit(
+      common::visitors{
+          [&](const parser::DefinedOpName &s) {
+            return DefinedOperator{
+                DefinedOperator::DefinedOpName{makeObject(s.v, semaCtx)}};
+          },
+          [&](const parser::DefinedOperator::IntrinsicOperator &s) {
+            return DefinedOperator{s};
+          },
+      },
+      inp.u);
+}
+
+ProcedureDesignator
+makeProcedureDesignator(const parser::ProcedureDesignator &inp,
+                        semantics::SemanticsContext &semaCtx) {
+  return ProcedureDesignator{std::visit(
+      common::visitors{
+          [&](const parser::Name &t) { return makeObject(t, semaCtx); },
+          [&](const parser::ProcComponentRef &t) {
+            return makeObject(t.v.thing, semaCtx);
+          },
+      },
+      inp.u)};
+}
+
+ReductionOperator makeReductionOperator(const parser::OmpReductionOperator &inp,
+                                        semantics::SemanticsContext &semaCtx) {
+  return std::visit(
+      common::visitors{
+          [&](const parser::DefinedOperator &s) {
+            return ReductionOperator{makeDefinedOperator(s, semaCtx)};
+          },
+          [&](const parser::ProcedureDesignator &s) {
+            return ReductionOperator{makeProcedureDesignator(s, semaCtx)};
+          },
+      },
+      inp.u);
+}
+
+// Actual clauses. Each T (where OmpClause::T exists) has its "make".
+Aligned make(const parser::OmpClause::Aligned &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpAlignedClause
+  auto &t0 = std::get<parser::OmpObjectList>(inp.v.t);
+  auto &t1 = std::get<std::optional<parser::ScalarIntConstantExpr>>(inp.v.t);
+
+  return Aligned{{
+      makeList(t0, semaCtx),
+      maybeApply(makeExprFn(semaCtx), t1),
+  }};
+}
+
+Allocate make(const parser::OmpClause::Allocate &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpAllocateClause
+  using wrapped = parser::OmpAllocateClause;
+  auto &t0 = std::get<std::optional<wrapped::AllocateModifier>>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+
+  auto convert = [&](auto &&s) -> Allocate::Modifier {
+    using Modifier = Allocate::Modifier;
+    using Allocator = Modifier::Allocator;
+    using Align = Modifier::Align;
+    using ComplexModifier = Modifier::ComplexModifier;
+
+    return std::visit(
+        common::visitors{
+            [&](const wrapped::AllocateModifier::Allocator &v) {
+              return Modifier{Allocator{makeExpr(v.v, semaCtx)}};
+            },
+            [&](const wrapped::AllocateModifier::ComplexModifier &v) {
+              auto &s0 = std::get<wrapped::AllocateModifier::Allocator>(v.t);
+              auto &s1 = std::get<wrapped::AllocateModifier::Align>(v.t);
+              return Modifier{ComplexModifier{{
+                  Allocator{makeExpr(s0.v, semaCtx)},
+                  Align{makeExpr(s1.v, semaCtx)},
+              }}};
+            },
+            [&](const wrapped::AllocateModifier::Align &v) {
+              return Modifier{Align{makeExpr(v.v, semaCtx)}};
+            },
+        },
+        s.u);
+  };
+
+  return Allocate{{maybeApply(convert, t0), makeList(t1, semaCtx)}};
+}
+
+Allocator make(const parser::OmpClause::Allocator &inp,
+               semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return Allocator{makeExpr(inp.v, semaCtx)};
+}
+
+// Never called, but needed for using "make" as a Clause visitor.
+// See comment about "requires" clauses in Clauses.h.
+AtomicDefaultMemOrder make(const parser::OmpClause::AtomicDefaultMemOrder &inp,
+                           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpAtomicDefaultMemOrderClause
+  return AtomicDefaultMemOrder{inp.v.v};
+}
+
+Collapse make(const parser::OmpClause::Collapse &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntConstantExpr
+  return Collapse{makeExpr(inp.v, semaCtx)};
+}
+
+Copyin make(const parser::OmpClause::Copyin &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Copyin{makeList(inp.v, semaCtx)};
+}
+
+Copyprivate make(const parser::OmpClause::Copyprivate &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Copyprivate{makeList(inp.v, semaCtx)};
+}
+
+Defaultmap make(const parser::OmpClause::Defaultmap &inp,
+                semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDefaultmapClause
+  using wrapped = parser::OmpDefaultmapClause;
+
+  auto &t0 = std::get<wrapped::ImplicitBehavior>(inp.v.t);
+  auto &t1 = std::get<std::optional<wrapped::VariableCategory>>(inp.v.t);
+  return Defaultmap{{t0, t1}};
+}
+
+Default make(const parser::OmpClause::Default &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDefaultClause
+  return Default{inp.v.v};
+}
+
+Depend make(const parser::OmpClause::Depend &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDependClause
+  using wrapped = parser::OmpDependClause;
+
+  return std::visit(
+      common::visitors{
+          [&](const wrapped::Source &s) { return Depend{Depend::Source{}}; },
+          [&](const wrapped::Sink &s) {
+            auto convert = [&](const parser::OmpDependSinkVec &v) {
+              auto &t0 = std::get<parser::Name>(v.t);
+              auto &t1 =
+                  std::get<std::optional<parser::OmpDependSinkVecLength>>(v.t);
+              auto convert1 = [&](const parser::OmpDependSinkVecLength &u) {
+                auto &s0 = std::get<parser::DefinedOperator>(u.t);
+                auto &s1 = std::get<parser::ScalarIntConstantExpr>(u.t);
+                return Depend::Sink::Length{makeDefinedOperator(s0, semaCtx),
+                                            makeExpr(s1, semaCtx)};
+              };
+              return Depend::Sink::Vec{makeObject(t0, semaCtx),
+                                       maybeApply(convert1, t1)};
+            };
+            return Depend{Depend::Sink{makeList(s.v, convert)}};
+          },
+          [&](const wrapped::InOut &s) {
+            auto &t0 = std::get<parser::OmpDependenceType>(s.t);
+            auto &t1 = std::get<std::list<parser::Designator>>(s.t);
+            auto convert = [&](const parser::Designator &t) {
+              return makeObject(t, semaCtx);
+            };
+            return Depend{Depend::InOut{{t0.v, makeList(t1, convert)}}};
+          },
+      },
+      inp.v.u);
+}
+
+Device make(const parser::OmpClause::Device &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDeviceClause
+  using wrapped = parser::OmpDeviceClause;
+
+  auto &t0 = std::get<std::optional<wrapped::DeviceModifier>>(inp.v.t);
+  auto &t1 = std::get<parser::ScalarIntExpr>(inp.v.t);
+  return Device{{t0, makeExpr(t1, semaCtx)}};
+}
+
+DeviceType make(const parser::OmpClause::DeviceType &inp,
+                semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDeviceTypeClause
+  return DeviceType{inp.v.v};
+}
+
+DistSchedule make(const parser::OmpClause::DistSchedule &inp,
+                  semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::optional<parser::ScalarIntExpr>
+  return DistSchedule{maybeApply(makeExprFn(semaCtx), inp.v)};
+}
+
+Enter make(const parser::OmpClause::Enter &inp,
+           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Enter{makeList(inp.v, semaCtx)};
+}
+
+Filter make(const parser::OmpClause::Filter &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return Filter{makeExpr(inp.v, semaCtx)};
+}
+
+Final make(const parser::OmpClause::Final &inp,
+           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarLogicalExpr
+  return Final{makeExpr(inp.v, semaCtx)};
+}
+
+Firstprivate make(const parser::OmpClause::Firstprivate &inp,
+                  semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Firstprivate{makeList(inp.v, semaCtx)};
+}
+
+From make(const parser::OmpClause::From &inp,
+          semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return From{makeList(inp.v, semaCtx)};
+}
+
+Grainsize make(const parser::OmpClause::Grainsize &inp,
+               semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return Grainsize{makeExpr(inp.v, semaCtx)};
+}
+
+HasDeviceAddr make(const parser::OmpClause::HasDeviceAddr &inp,
+                   semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return HasDeviceAddr{makeList(inp.v, semaCtx)};
+}
+
+Hint make(const parser::OmpClause::Hint &inp,
+          semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ConstantExpr
+  return Hint{makeExpr(inp.v, semaCtx)};
+}
+
+If make(const parser::OmpClause::If &inp,
+        semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpIfClause
+  using wrapped = parser::OmpIfClause;
+
+  auto &t0 = std::get<std::optional<wrapped::DirectiveNameModifier>>(inp.v.t);
+  auto &t1 = std::get<parser::ScalarLogicalExpr>(inp.v.t);
+  return If{{t0, makeExpr(t1, semaCtx)}};
+}
+
+InReduction make(const parser::OmpClause::InReduction &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpInReductionClause
+  auto &t0 = std::get<parser::OmpReductionOperator>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+  return InReduction{
+      {makeReductionOperator(t0, semaCtx), makeList(t1, semaCtx)}};
+}
+
+IsDevicePtr make(const parser::OmpClause::IsDevicePtr &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return IsDevicePtr{makeList(inp.v, semaCtx)};
+}
+
+Lastprivate make(const parser::OmpClause::Lastprivate &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Lastprivate{makeList(inp.v, semaCtx)};
+}
+
+Linear make(const parser::OmpClause::Linear &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpLinearClause
+  using wrapped = parser::OmpLinearClause;
+
+  return std::visit(
+      common::visitors{
+          [&](const wrapped::WithModifier &s) {
+            return Linear{{Linear::Modifier{s.modifier.v},
+                           makeList(s.names, makeObjectFn(semaCtx)),
+                           maybeApply(makeExprFn(semaCtx), s.step)}};
+          },
+          [&](const wrapped::WithoutModifier &s) {
+            return Linear{{std::nullopt,
+                           makeList(s.names, makeObjectFn(semaCtx)),
+                           maybeApply(makeExprFn(semaCtx), s.step)}};
+          },
+      },
+      inp.v.u);
+}
+
+Link make(const parser::OmpClause::Link &inp,
+          semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Link{makeList(inp.v, semaCtx)};
+}
+
+Map make(const parser::OmpClause::Map &inp,
+         semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpMapClause
+  auto &t0 = std::get<std::optional<parser::OmpMapType>>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+  auto convert = [](const parser::OmpMapType &s) {
+    auto &s0 = std::get<std::optional<parser::OmpMapType::Always>>(s.t);
+    auto &s1 = std::get<parser::OmpMapType::Type>(s.t);
+    auto convertT = [](parser::OmpMapType::Always) {
+      return Map::MapType::Always{};
+    };
+    return Map::MapType{{maybeApply(convertT, s0), s1}};
+  };
+  return Map{{maybeApply(convert, t0), makeList(t1, semaCtx)}};
+}
+
+Nocontext make(const parser::OmpClause::Nocontext &inp,
+               semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarLogicalExpr
+  return Nocontext{makeExpr(inp.v, semaCtx)};
+}
+
+Nontemporal make(const parser::OmpClause::Nontemporal &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::list<parser::Name>
+  return Nontemporal{makeList(inp.v, makeObjectFn(semaCtx))};
+}
+
+Novariants make(const parser::OmpClause::Novariants &inp,
+                semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarLogicalExpr
+  return Novariants{makeExpr(inp.v, semaCtx)};
+}
+
+NumTasks make(const parser::OmpClause::NumTasks &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return NumTasks{makeExpr(inp.v, semaCtx)};
+}
+
+NumTeams make(const parser::OmpClause::NumTeams &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return NumTeams{makeExpr(inp.v, semaCtx)};
+}
+
+NumThreads make(const parser::OmpClause::NumThreads &inp,
+                semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return NumThreads{makeExpr(inp.v, semaCtx)};
+}
+
+OmpxDynCgroupMem make(const parser::OmpClause::OmpxDynCgroupMem &inp,
+                      semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return OmpxDynCgroupMem{makeExpr(inp.v, semaCtx)};
+}
+
+Ordered make(const parser::OmpClause::Ordered &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::optional<parser::ScalarIntConstantExpr>
+  return Ordered{maybeApply(makeExprFn(semaCtx), inp.v)};
+}
+
+Order make(const parser::OmpClause::Order &inp,
+           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpOrderClause
+  using wrapped = parser::OmpOrderClause;
+  auto &t0 = std::get<std::optional<parser::OmpOrderModifier>>(inp.v.t);
+  auto &t1 = std::get<wrapped::Type>(inp.v.t);
+  auto convert = [](const parser::OmpOrderModifier &s) -> Order::Kind {
+    return std::get<parser::OmpOrderModifier::Kind>(s.u);
+  };
+  return Order{{maybeApply(convert, t0), t1}};
+}
+
+Partial make(const parser::OmpClause::Partial &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::optional<parser::ScalarIntConstantExpr>
+  return Partial{maybeApply(makeExprFn(semaCtx), inp.v)};
+}
+
+Priority make(const parser::OmpClause::Priority &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return Priority{makeExpr(inp.v, semaCtx)};
+}
+
+Private make(const parser::OmpClause::Private &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Private{makeList(inp.v, semaCtx)};
+}
+
+ProcBind make(const parser::OmpClause::ProcBind &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpProcBindClause
+  return ProcBind{inp.v.v};
+}
+
+Reduction make(const parser::OmpClause::Reduction &inp,
+               semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpReductionClause
+  auto &t0 = std::get<parser::OmpReductionOperator>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+  return Reduction{{makeReductionOperator(t0, semaCtx), makeList(t1, semaCtx)}};
+}
+
+Safelen make(const parser::OmpClause::Safelen &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntConstantExpr
+  return Safelen{makeExpr(inp.v, semaCtx)};
+}
+
+Schedule make(const parser::OmpClause::Schedule &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpScheduleClause
+  using wrapped = parser::OmpScheduleClause;
+
+  auto &t0 = std::get<std::optional<parser::OmpScheduleModifier>>(inp.v.t);
+  auto &t1 = std::get<wrapped::ScheduleType>(inp.v.t);
+  auto &t2 = std::get<std::optional<parser::ScalarIntExpr>>(inp.v.t);
+
+  auto convert = [](auto &&s) -> Schedule::ScheduleModifier {
+    auto &s0 = std::get<parser::OmpScheduleModifier::Modifier1>(s.t);
+    auto &s1 =
+        std::get<std::optional<parser::OmpScheduleModifier::Modifier2>>(s.t);
+
+    auto convert1 = [](auto &&v) { // Modifier1 or Modifier2
+      return v.v.v;
+    };
+    return Schedule::ScheduleModifier{{s0.v.v, maybeApply(convert1, s1)}};
+  };
+
+  return Schedule{
+      {maybeApply(convert, t0), t1, maybeApply(makeExprFn(semaCtx), t2)}};
+}
+
+Shared make(const parser::OmpClause::Shared &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Shared{makeList(inp.v, semaCtx)};
+}
+
+Simdlen make(const parser::OmpClause::Simdlen &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntConstantExpr
+  return Simdlen{makeExpr(inp.v, semaCtx)};
+}
+
+Sizes make(const parser::OmpClause::Sizes &inp,
+           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::list<parser::ScalarIntExpr>
+  return Sizes{makeList(inp.v, makeExprFn(semaCtx))};
+}
+
+TaskReduction make(const parser::OmpClause::TaskReduction &inp,
+                   semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpReductionClause
+  auto &t0 = std::get<parser::OmpReductionOperator>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+  return TaskReduction{
+      {makeReductionOperator(t0, semaCtx), makeList(t1, semaCtx)}};
+}
+
+ThreadLimit make(const parser::OmpClause::ThreadLimit &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return ThreadLimit{makeExpr(inp.v, semaCtx)};
+}
+
+To make(const parser::OmpClause::To &inp,
+        semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return To{makeList(inp.v, semaCtx)};
+}
+
+Uniform make(const parser::OmpClause::Uniform &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::list<parser::Name>
+  return Uniform{makeList(inp.v, makeObjectFn(semaCtx))};
+}
+
+UseDeviceAddr make(const parser::OmpClause::UseDeviceAddr &inp,
+                   semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return UseDeviceAddr{makeList(inp.v, semaCtx)};
+}
+
+UseDevicePtr make(const parser::OmpClause::UseDevicePtr &inp,
+                  semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return UseDevicePtr{makeList(inp.v, semaCtx)};
+}
+} // namespace clause
+
+Clause makeClause(const Fortran::parser::OmpClause &cls,
+                  semantics::SemanticsContext &semaCtx) {
+  return std::visit(
+      [&](auto &&s) {
+        return makeClause(getClauseId(cls), clause::make(s, semaCtx),
+                          cls.source);
+      },
+      cls.u);
+}
+
+List<Clause> makeList(const parser::OmpClauseList &clauses,
+                      semantics::SemanticsContext &semaCtx) {
+  return makeList(clauses.v, [&](const parser::OmpClause &s) {
+    return makeClause(s, semaCtx);
+  });
+}
+} // namespace Fortran::lower::omp
diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h
new file mode 100644
index 0000000000000..3fba593b5349c
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/Clauses.h
@@ -0,0 +1,212 @@
+//===-- Clauses.h -- OpenMP clause handling -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_LOWER_OPENMP_CLAUSES_H
+#define FORTRAN_LOWER_OPENMP_CLAUSES_H
+
+#include "ClauseT.h"
+
+#include "flang/Evaluate/expression.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/expression.h"
+#include "flang/Semantics/semantics.h"
+#include "flang/Semantics/symbol.h"
+
+#include "llvm/ADT/STLExtras.h"
+
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+namespace Fortran::lower::omp {
+using namespace Fortran;
+using SomeType = evaluate::SomeType;
+using SomeExpr = semantics::SomeExpr;
+using MaybeExpr = semantics::MaybeExpr;
+
+using SymIdent = semantics::Symbol *;
+using SymReference = SomeExpr;
+
+template <typename T>
+using List = tomp::ListT<T>;
+} // namespace Fortran::lower::omp
+
+namespace tomp {
+template <>
+struct ObjectT<Fortran::lower::omp::SymIdent,
+               Fortran::lower::omp::SymReference> {
+  using IdType = Fortran::lower::omp::SymIdent;
+  using ExprType = Fortran::lower::omp::SymReference;
+
+  const IdType &id() const { return symbol; }
+  const std::optional<ExprType> &ref() const { return designator; }
+
+  IdType symbol;
+  std::optional<ExprType> designator;
+};
+} // namespace tomp
+
+namespace Fortran::lower::omp {
+
+using Object = tomp::ObjectT<SymIdent, SymReference>;
+using ObjectList = tomp::ObjectListT<SymIdent, SymReference>;
+
+Object makeObject(const parser::OmpObject &object,
+                  semantics::SemanticsContext &semaCtx);
+Object makeObject(const parser::Name &name,
+                  semantics::SemanticsContext &semaCtx);
+Object makeObject(const parser::Designator &dsg,
+                  semantics::SemanticsContext &semaCtx);
+Object makeObject(const parser::StructureComponent &comp,
+                  semantics::SemanticsContext &semaCtx);
+
+inline auto makeObjectFn(semantics::SemanticsContext &semaCtx) {
+  return [&](auto &&s) { return makeObject(s, semaCtx); };
+}
+
+template <typename T>
+SomeExpr makeExpr(T &&pftExpr, semantics::SemanticsContext &semaCtx) {
+  auto maybeExpr = evaluate::ExpressionAnalyzer(semaCtx).Analyze(pftExpr);
+  assert(maybeExpr);
+  return std::move(*maybeExpr);
+}
+
+inline auto makeExprFn(semantics::SemanticsContext &semaCtx) {
+  return [&](auto &&s) { return makeExpr(s, semaCtx); };
+}
+
+template <
+    typename ContainerTy, typename FunctionTy,
+    typename ElemTy = typename llvm::remove_cvref_t<ContainerTy>::value_type,
+    typename ResultTy = std::invoke_result_t<FunctionTy, ElemTy>>
+List<ResultTy> makeList(ContainerTy &&container, FunctionTy &&func) {
+  List<ResultTy> v;
+  llvm::transform(container, std::back_inserter(v), func);
+  return v;
+}
+
+inline ObjectList makeList(const parser::OmpObjectList &objects,
+                           semantics::SemanticsContext &semaCtx) {
+  return makeList(objects.v, makeObjectFn(semaCtx));
+}
+
+template <typename FuncTy, typename ElemTy,
+          typename ResultTy = std::invoke_result_t<FuncTy, ElemTy>>
+std::optional<ResultTy> maybeApply(FuncTy &&func,
+                                   const std::optional<ElemTy> &inp) {
+  if (!inp)
+    return std::nullopt;
+  return std::move(func(*inp));
+}
+
+std::optional<Object>
+getBaseObject(const Object &object,
+              Fortran::semantics::SemanticsContext &semaCtx);
+
+namespace clause {
+using DefinedOperator = tomp::clause::DefinedOperatorT<SymIdent, SymReference>;
+using ProcedureDesignator =
+    tomp::clause::ProcedureDesignatorT<SymIdent, SymReference>;
+using ReductionOperator =
+    tomp::clause::ReductionOperatorT<SymIdent, SymReference>;
+
+#ifdef EMPTY_CLASS
+#undef EMPTY_CLASS
+#endif
+#define EMPTY_CLASS(cls)                                                       \
+  using cls = tomp::clause::cls##T<SymIdent, SymReference>
+
+#ifdef WRAPPER_CLASS
+#undef WRAPPER_CLASS
+#endif
+#define WRAPPER_CLASS(cls, content)                                            \
+  [[maybe_unused]] extern int xyzzy_semicolon_absorber
+#define GEN_FLANG_CLAUSE_PARSER_CLASSES
+#include "llvm/Frontend/OpenMP/OMP.inc"
+#undef EMPTY_CLASS
+#undef WRAPPER_CLASS
+
+// "Requires" clauses are handled early on, and the aggregated information
+// is stored in the Symbol details of modules, programs, and subprograms.
+// These clauses are still handled here to cover all alternatives in the
+// main clause variant.
+
+using Aligned = tomp::clause::AlignedT<SymIdent, SymReference>;
+using Allocate = tomp::clause::AllocateT<SymIdent, SymReference>;
+using Allocator = tomp::clause::AllocatorT<SymIdent, SymReference>;
+using AtomicDefaultMemOrder =
+    tomp::clause::AtomicDefaultMemOrderT<SymIdent, SymReference>;
+using Collapse = tomp::clause::CollapseT<SymIdent, SymReference>;
+using Copyin = tomp::clause::CopyinT<SymIdent, SymReference>;
+using Copyprivate = tomp::clause::CopyprivateT<SymIdent, SymReference>;
+using Defaultmap = tomp::clause::DefaultmapT<SymIdent, SymReference>;
+using Default = tomp::clause::DefaultT<SymIdent, SymReference>;
+using Depend = tomp::clause::DependT<SymIdent, SymReference>;
+using Device = tomp::clause::DeviceT<SymIdent, SymReference>;
+using DeviceType = tomp::clause::DeviceTypeT<SymIdent, SymReference>;
+using DistSchedule = tomp::clause::DistScheduleT<SymIdent, SymReference>;
+using Enter = tomp::clause::EnterT<SymIdent, SymReference>;
+using Filter = tomp::clause::FilterT<SymIdent, SymReference>;
+using Final = tomp::clause::FinalT<SymIdent, SymReference>;
+using Firstprivate = tomp::clause::FirstprivateT<SymIdent, SymReference>;
+using From = tomp::clause::FromT<SymIdent, SymReference>;
+using Grainsize = tomp::clause::GrainsizeT<SymIdent, SymReference>;
+using HasDeviceAddr = tomp::clause::HasDeviceAddrT<SymIdent, SymReference>;
+using Hint = tomp::clause::HintT<SymIdent, SymReference>;
+using If = tomp::clause::IfT<SymIdent, SymReference>;
+using InReduction = tomp::clause::InReductionT<SymIdent, SymReference>;
+using IsDevicePtr = tomp::clause::IsDevicePtrT<SymIdent, SymReference>;
+using Lastprivate = tomp::clause::LastprivateT<SymIdent, SymReference>;
+using Linear = tomp::clause::LinearT<SymIdent, SymReference>;
+using Link = tomp::clause::LinkT<SymIdent, SymReference>;
+using Map = tomp::clause::MapT<SymIdent, SymReference>;
+using Nocontext = tomp::clause::NocontextT<SymIdent, SymReference>;
+using Nontemporal = tomp::clause::NontemporalT<SymIdent, SymReference>;
+using Novariants = tomp::clause::NovariantsT<SymIdent, SymReference>;
+using NumTasks = tomp::clause::NumTasksT<SymIdent, SymReference>;
+using NumTeams = tomp::clause::NumTeamsT<SymIdent, SymReference>;
+using NumThreads = tomp::clause::NumThreadsT<SymIdent, SymReference>;
+using OmpxDynCgroupMem =
+    tomp::clause::OmpxDynCgroupMemT<SymIdent, SymReference>;
+using Ordered = tomp::clause::OrderedT<SymIdent, SymReference>;
+using Order = tomp::clause::OrderT<SymIdent, SymReference>;
+using Partial = tomp::clause::PartialT<SymIdent, SymReference>;
+using Priority = tomp::clause::PriorityT<SymIdent, SymReference>;
+using Private = tomp::clause::PrivateT<SymIdent, SymReference>;
+using ProcBind = tomp::clause::ProcBindT<SymIdent, SymReference>;
+using Reduction = tomp::clause::ReductionT<SymIdent, SymReference>;
+using Safelen = tomp::clause::SafelenT<SymIdent, SymReference>;
+using Schedule = tomp::clause::ScheduleT<SymIdent, SymReference>;
+using Shared = tomp::clause::SharedT<SymIdent, SymReference>;
+using Simdlen = tomp::clause::SimdlenT<SymIdent, SymReference>;
+using Sizes = tomp::clause::SizesT<SymIdent, SymReference>;
+using TaskReduction = tomp::clause::TaskReductionT<SymIdent, SymReference>;
+using ThreadLimit = tomp::clause::ThreadLimitT<SymIdent, SymReference>;
+using To = tomp::clause::ToT<SymIdent, SymReference>;
+using Uniform = tomp::clause::UniformT<SymIdent, SymReference>;
+using UseDeviceAddr = tomp::clause::UseDeviceAddrT<SymIdent, SymReference>;
+using UseDevicePtr = tomp::clause::UseDevicePtrT<SymIdent, SymReference>;
+} // namespace clause
+
+struct Clause : public tomp::ClauseT<SymIdent, SymReference> {
+  parser::CharBlock source;
+};
+
+template <typename Specific>
+Clause makeClause(llvm::omp::Clause id, Specific &&specific,
+                  parser::CharBlock source = {}) {
+  return Clause{{id, specific}, source};
+}
+
+Clause makeClause(const Fortran::parser::OmpClause &cls,
+                  semantics::SemanticsContext &semaCtx);
+
+List<Clause> makeList(const parser::OmpClauseList &clauses,
+                      semantics::SemanticsContext &semaCtx);
+} // namespace Fortran::lower::omp
+
+#endif // FORTRAN_LOWER_OPENMP_CLAUSES_H
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 717b8cc0276a3..a5c087e452414 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -50,6 +50,7 @@ void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) {
 }
 
 void DataSharingProcessor::insertDeallocs() {
+  // TODO Extend delayed privatization to include a `dealloc` region.
   for (const Fortran::semantics::Symbol *sym : privatizedSymbols)
     if (Fortran::semantics::IsAllocatable(sym->GetUltimate())) {
       converter.createHostAssociateVarCloneDealloc(*sym);
@@ -81,30 +82,26 @@ void DataSharingProcessor::copyLastPrivateSymbol(
 }
 
 void DataSharingProcessor::collectOmpObjectListSymbol(
-    const Fortran::parser::OmpObjectList &ompObjectList,
+    const omp::ObjectList &objects,
     llvm::SetVector<const Fortran::semantics::Symbol *> &symbolSet) {
-  for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) {
-    Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject);
-    symbolSet.insert(sym);
-  }
+  for (const omp::Object &object : objects)
+    symbolSet.insert(object.id());
 }
 
 void DataSharingProcessor::collectSymbolsForPrivatization() {
   bool hasCollapse = false;
-  for (const Fortran::parser::OmpClause &clause : opClauseList.v) {
+  for (const omp::Clause &clause : clauses) {
     if (const auto &privateClause =
-            std::get_if<Fortran::parser::OmpClause::Private>(&clause.u)) {
+            std::get_if<omp::clause::Private>(&clause.u)) {
       collectOmpObjectListSymbol(privateClause->v, privatizedSymbols);
     } else if (const auto &firstPrivateClause =
-                   std::get_if<Fortran::parser::OmpClause::Firstprivate>(
-                       &clause.u)) {
+                   std::get_if<omp::clause::Firstprivate>(&clause.u)) {
       collectOmpObjectListSymbol(firstPrivateClause->v, privatizedSymbols);
     } else if (const auto &lastPrivateClause =
-                   std::get_if<Fortran::parser::OmpClause::Lastprivate>(
-                       &clause.u)) {
+                   std::get_if<omp::clause::Lastprivate>(&clause.u)) {
       collectOmpObjectListSymbol(lastPrivateClause->v, privatizedSymbols);
       hasLastPrivateOp = true;
-    } else if (std::get_if<Fortran::parser::OmpClause::Collapse>(&clause.u)) {
+    } else if (std::get_if<omp::clause::Collapse>(&clause.u)) {
       hasCollapse = true;
     }
   }
@@ -137,138 +134,135 @@ void DataSharingProcessor::insertBarrier() {
 void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
   bool cmpCreated = false;
   mlir::OpBuilder::InsertPoint localInsPt = firOpBuilder.saveInsertionPoint();
-  for (const Fortran::parser::OmpClause &clause : opClauseList.v) {
-    if (std::get_if<Fortran::parser::OmpClause::Lastprivate>(&clause.u)) {
-      // TODO: Add lastprivate support for simd construct
-      if (mlir::isa<mlir::omp::SectionOp>(op)) {
-        if (&eval == &eval.parentConstruct->getLastNestedEvaluation()) {
-          // For `omp.sections`, lastprivatized variables occur in
-          // lexically final `omp.section` operation. The following FIR
-          // shall be generated for the same:
-          //
-          // omp.sections lastprivate(...) {
-          //  omp.section {...}
-          //  omp.section {...}
-          //  omp.section {
-          //      fir.allocate for `private`/`firstprivate`
-          //      <More operations here>
-          //      fir.if %true {
-          //          ^%lpv_update_blk
-          //      }
-          //  }
-          // }
-          //
-          // To keep code consistency while handling privatization
-          // through this control flow, add a `fir.if` operation
-          // that always evaluates to true, in order to create
-          // a dedicated sub-region in `omp.section` where
-          // lastprivate FIR can reside. Later canonicalizations
-          // will optimize away this operation.
-          if (!eval.lowerAsUnstructured()) {
-            auto ifOp = firOpBuilder.create<fir::IfOp>(
-                op->getLoc(),
-                firOpBuilder.createIntegerConstant(
-                    op->getLoc(), firOpBuilder.getIntegerType(1), 0x1),
-                /*else*/ false);
-            firOpBuilder.setInsertionPointToStart(
-                &ifOp.getThenRegion().front());
-
-            const Fortran::parser::OpenMPConstruct *parentOmpConstruct =
-                eval.parentConstruct->getIf<Fortran::parser::OpenMPConstruct>();
-            assert(parentOmpConstruct &&
-                   "Expected a valid enclosing OpenMP construct");
-            const Fortran::parser::OpenMPSectionsConstruct *sectionsConstruct =
-                std::get_if<Fortran::parser::OpenMPSectionsConstruct>(
-                    &parentOmpConstruct->u);
-            assert(sectionsConstruct &&
-                   "Expected an enclosing omp.sections construct");
-            const Fortran::parser::OmpClauseList &sectionsEndClauseList =
-                std::get<Fortran::parser::OmpClauseList>(
-                    std::get<Fortran::parser::OmpEndSectionsDirective>(
-                        sectionsConstruct->t)
-                        .t);
-            for (const Fortran::parser::OmpClause &otherClause :
-                 sectionsEndClauseList.v)
-              if (std::get_if<Fortran::parser::OmpClause::Nowait>(
-                      &otherClause.u))
-                // Emit implicit barrier to synchronize threads and avoid data
-                // races on post-update of lastprivate variables when `nowait`
-                // clause is present.
-                firOpBuilder.create<mlir::omp::BarrierOp>(
-                    converter.getCurrentLocation());
-            firOpBuilder.setInsertionPointToStart(
-                &ifOp.getThenRegion().front());
-            lastPrivIP = firOpBuilder.saveInsertionPoint();
-            firOpBuilder.setInsertionPoint(ifOp);
-            insPt = firOpBuilder.saveInsertionPoint();
-          } else {
-            // Lastprivate operation is inserted at the end
-            // of the lexically last section in the sections
-            // construct
-            mlir::OpBuilder::InsertPoint unstructuredSectionsIP =
-                firOpBuilder.saveInsertionPoint();
-            mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
-            firOpBuilder.setInsertionPoint(lastOper);
-            lastPrivIP = firOpBuilder.saveInsertionPoint();
-            firOpBuilder.restoreInsertionPoint(unstructuredSectionsIP);
-          }
-        }
-      } else if (mlir::isa<mlir::omp::WsLoopOp>(op)) {
-        // Update the original variable just before exiting the worksharing
-        // loop. Conversion as follows:
+  for (const omp::Clause &clause : clauses) {
+    if (clause.id != llvm::omp::OMPC_lastprivate)
+      continue;
+    // TODO: Add lastprivate support for simd construct
+    if (mlir::isa<mlir::omp::SectionOp>(op)) {
+      if (&eval == &eval.parentConstruct->getLastNestedEvaluation()) {
+        // For `omp.sections`, lastprivatized variables occur in
+        // lexically final `omp.section` operation. The following FIR
+        // shall be generated for the same:
         //
-        //                       omp.wsloop {
-        // omp.wsloop {            ...
-        //    ...                  store
-        //    store       ===>     %v = arith.addi %iv, %step
-        //    omp.yield            %cmp = %step < 0 ? %v < %ub : %v > %ub
-        // }                       fir.if %cmp {
-        //                           fir.store %v to %loopIV
-        //                           ^%lpv_update_blk:
-        //                         }
-        //                         omp.yield
-        //                       }
+        // omp.sections lastprivate(...) {
+        //  omp.section {...}
+        //  omp.section {...}
+        //  omp.section {
+        //      fir.allocate for `private`/`firstprivate`
+        //      <More operations here>
+        //      fir.if %true {
+        //          ^%lpv_update_blk
+        //      }
+        //  }
+        // }
         //
-
-        // Only generate the compare once in presence of multiple LastPrivate
-        // clauses.
-        if (cmpCreated)
-          continue;
-        cmpCreated = true;
-
-        mlir::Location loc = op->getLoc();
-        mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
-        firOpBuilder.setInsertionPoint(lastOper);
-
-        mlir::Value iv = op->getRegion(0).front().getArguments()[0];
-        mlir::Value ub =
-            mlir::dyn_cast<mlir::omp::WsLoopOp>(op).getUpperBound()[0];
-        mlir::Value step = mlir::dyn_cast<mlir::omp::WsLoopOp>(op).getStep()[0];
-
-        // v = iv + step
-        // cmp = step < 0 ? v < ub : v > ub
-        mlir::Value v = firOpBuilder.create<mlir::arith::AddIOp>(loc, iv, step);
-        mlir::Value zero =
-            firOpBuilder.createIntegerConstant(loc, step.getType(), 0);
-        mlir::Value negativeStep = firOpBuilder.create<mlir::arith::CmpIOp>(
-            loc, mlir::arith::CmpIPredicate::slt, step, zero);
-        mlir::Value vLT = firOpBuilder.create<mlir::arith::CmpIOp>(
-            loc, mlir::arith::CmpIPredicate::slt, v, ub);
-        mlir::Value vGT = firOpBuilder.create<mlir::arith::CmpIOp>(
-            loc, mlir::arith::CmpIPredicate::sgt, v, ub);
-        mlir::Value cmpOp = firOpBuilder.create<mlir::arith::SelectOp>(
-            loc, negativeStep, vLT, vGT);
-
-        auto ifOp = firOpBuilder.create<fir::IfOp>(loc, cmpOp, /*else*/ false);
-        firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-        assert(loopIV && "loopIV was not set");
-        firOpBuilder.create<fir::StoreOp>(op->getLoc(), v, loopIV);
-        lastPrivIP = firOpBuilder.saveInsertionPoint();
-      } else {
-        TODO(converter.getCurrentLocation(),
-             "lastprivate clause in constructs other than "
-             "simd/worksharing-loop");
+        // To keep code consistency while handling privatization
+        // through this control flow, add a `fir.if` operation
+        // that always evaluates to true, in order to create
+        // a dedicated sub-region in `omp.section` where
+        // lastprivate FIR can reside. Later canonicalizations
+        // will optimize away this operation.
+        if (!eval.lowerAsUnstructured()) {
+          auto ifOp = firOpBuilder.create<fir::IfOp>(
+              op->getLoc(),
+              firOpBuilder.createIntegerConstant(
+                  op->getLoc(), firOpBuilder.getIntegerType(1), 0x1),
+              /*else*/ false);
+          firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+
+          const Fortran::parser::OpenMPConstruct *parentOmpConstruct =
+              eval.parentConstruct->getIf<Fortran::parser::OpenMPConstruct>();
+          assert(parentOmpConstruct &&
+                 "Expected a valid enclosing OpenMP construct");
+          const Fortran::parser::OpenMPSectionsConstruct *sectionsConstruct =
+              std::get_if<Fortran::parser::OpenMPSectionsConstruct>(
+                  &parentOmpConstruct->u);
+          assert(sectionsConstruct &&
+                 "Expected an enclosing omp.sections construct");
+          const Fortran::parser::OmpClauseList &sectionsEndClauseList =
+              std::get<Fortran::parser::OmpClauseList>(
+                  std::get<Fortran::parser::OmpEndSectionsDirective>(
+                      sectionsConstruct->t)
+                      .t);
+          for (const Fortran::parser::OmpClause &otherClause :
+               sectionsEndClauseList.v)
+            if (std::get_if<Fortran::parser::OmpClause::Nowait>(&otherClause.u))
+              // Emit implicit barrier to synchronize threads and avoid data
+              // races on post-update of lastprivate variables when `nowait`
+              // clause is present.
+              firOpBuilder.create<mlir::omp::BarrierOp>(
+                  converter.getCurrentLocation());
+          firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+          lastPrivIP = firOpBuilder.saveInsertionPoint();
+          firOpBuilder.setInsertionPoint(ifOp);
+          insPt = firOpBuilder.saveInsertionPoint();
+        } else {
+          // Lastprivate operation is inserted at the end
+          // of the lexically last section in the sections
+          // construct
+          mlir::OpBuilder::InsertPoint unstructuredSectionsIP =
+              firOpBuilder.saveInsertionPoint();
+          mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
+          firOpBuilder.setInsertionPoint(lastOper);
+          lastPrivIP = firOpBuilder.saveInsertionPoint();
+          firOpBuilder.restoreInsertionPoint(unstructuredSectionsIP);
+        }
       }
+    } else if (mlir::isa<mlir::omp::WsLoopOp>(op)) {
+      // Update the original variable just before exiting the worksharing
+      // loop. Conversion as follows:
+      //
+      //                       omp.wsloop {
+      // omp.wsloop {            ...
+      //    ...                  store
+      //    store       ===>     %v = arith.addi %iv, %step
+      //    omp.yield            %cmp = %step < 0 ? %v < %ub : %v > %ub
+      // }                       fir.if %cmp {
+      //                           fir.store %v to %loopIV
+      //                           ^%lpv_update_blk:
+      //                         }
+      //                         omp.yield
+      //                       }
+      //
+
+      // Only generate the compare once in presence of multiple LastPrivate
+      // clauses.
+      if (cmpCreated)
+        continue;
+      cmpCreated = true;
+
+      mlir::Location loc = op->getLoc();
+      mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
+      firOpBuilder.setInsertionPoint(lastOper);
+
+      mlir::Value iv = op->getRegion(0).front().getArguments()[0];
+      mlir::Value ub =
+          mlir::dyn_cast<mlir::omp::WsLoopOp>(op).getUpperBound()[0];
+      mlir::Value step = mlir::dyn_cast<mlir::omp::WsLoopOp>(op).getStep()[0];
+
+      // v = iv + step
+      // cmp = step < 0 ? v < ub : v > ub
+      mlir::Value v = firOpBuilder.create<mlir::arith::AddIOp>(loc, iv, step);
+      mlir::Value zero =
+          firOpBuilder.createIntegerConstant(loc, step.getType(), 0);
+      mlir::Value negativeStep = firOpBuilder.create<mlir::arith::CmpIOp>(
+          loc, mlir::arith::CmpIPredicate::slt, step, zero);
+      mlir::Value vLT = firOpBuilder.create<mlir::arith::CmpIOp>(
+          loc, mlir::arith::CmpIPredicate::slt, v, ub);
+      mlir::Value vGT = firOpBuilder.create<mlir::arith::CmpIOp>(
+          loc, mlir::arith::CmpIPredicate::sgt, v, ub);
+      mlir::Value cmpOp = firOpBuilder.create<mlir::arith::SelectOp>(
+          loc, negativeStep, vLT, vGT);
+
+      auto ifOp = firOpBuilder.create<fir::IfOp>(loc, cmpOp, /*else*/ false);
+      firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      assert(loopIV && "loopIV was not set");
+      firOpBuilder.create<fir::StoreOp>(op->getLoc(), v, loopIV);
+      lastPrivIP = firOpBuilder.saveInsertionPoint();
+    } else {
+      TODO(converter.getCurrentLocation(),
+           "lastprivate clause in constructs other than "
+           "simd/worksharing-loop");
     }
   }
   firOpBuilder.restoreInsertionPoint(localInsPt);
@@ -292,14 +286,12 @@ void DataSharingProcessor::collectSymbols(
 }
 
 void DataSharingProcessor::collectDefaultSymbols() {
-  for (const Fortran::parser::OmpClause &clause : opClauseList.v) {
-    if (const auto &defaultClause =
-            std::get_if<Fortran::parser::OmpClause::Default>(&clause.u)) {
-      if (defaultClause->v.v ==
-          Fortran::parser::OmpDefaultClause::Type::Private)
+  for (const omp::Clause &clause : clauses) {
+    if (const auto *defaultClause =
+            std::get_if<omp::clause::Default>(&clause.u)) {
+      if (defaultClause->v == omp::clause::Default::Type::Private)
         collectSymbols(Fortran::semantics::Symbol::Flag::OmpPrivate);
-      else if (defaultClause->v.v ==
-               Fortran::parser::OmpDefaultClause::Type::Firstprivate)
+      else if (defaultClause->v == omp::clause::Default::Type::Firstprivate)
         collectSymbols(Fortran::semantics::Symbol::Flag::OmpFirstPrivate);
     }
   }
@@ -376,6 +368,7 @@ void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) {
         symLoc, uniquePrivatizerName, symType,
         isFirstPrivate ? mlir::omp::DataSharingClauseType::FirstPrivate
                        : mlir::omp::DataSharingClauseType::Private);
+    fir::ExtendedValue symExV = converter.getSymbolExtendedValue(*sym);
 
     symTable->pushScope();
 
@@ -386,7 +379,8 @@ void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) {
           &allocRegion, /*insertPt=*/{}, symType, symLoc);
 
       firOpBuilder.setInsertionPointToEnd(allocEntryBlock);
-      symTable->addSymbol(*sym, allocRegion.getArgument(0));
+      symTable->addSymbol(*sym,
+                          fir::substBase(symExV, allocRegion.getArgument(0)));
       symTable->pushScope();
       cloneSymbol(sym);
       firOpBuilder.create<mlir::omp::YieldOp>(
@@ -403,10 +397,12 @@ void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) {
       mlir::Block *copyEntryBlock = firOpBuilder.createBlock(
           &copyRegion, /*insertPt=*/{}, {symType, symType}, {symLoc, symLoc});
       firOpBuilder.setInsertionPointToEnd(copyEntryBlock);
-      symTable->addSymbol(*sym, copyRegion.getArgument(0),
+      symTable->addSymbol(*sym,
+                          fir::substBase(symExV, copyRegion.getArgument(0)),
                           /*force=*/true);
       symTable->pushScope();
-      symTable->addSymbol(*sym, copyRegion.getArgument(1));
+      symTable->addSymbol(*sym,
+                          fir::substBase(symExV, copyRegion.getArgument(1)));
       auto ip = firOpBuilder.saveInsertionPoint();
       copyFirstPrivateSymbol(sym, &ip);
 
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index 9f7301df07598..226abe96705e3 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -12,6 +12,7 @@
 #ifndef FORTRAN_LOWER_DATASHARINGPROCESSOR_H
 #define FORTRAN_LOWER_DATASHARINGPROCESSOR_H
 
+#include "Clauses.h"
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Lower/OpenMP.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
@@ -52,7 +53,7 @@ class DataSharingProcessor {
   llvm::SetVector<const Fortran::semantics::Symbol *> symbolsInParentRegions;
   Fortran::lower::AbstractConverter &converter;
   fir::FirOpBuilder &firOpBuilder;
-  const Fortran::parser::OmpClauseList &opClauseList;
+  omp::List<omp::Clause> clauses;
   Fortran::lower::pft::Evaluation &eval;
   bool useDelayedPrivatization;
   Fortran::lower::SymMap *symTable;
@@ -61,7 +62,7 @@ class DataSharingProcessor {
   bool needBarrier();
   void collectSymbols(Fortran::semantics::Symbol::Flag flag);
   void collectOmpObjectListSymbol(
-      const Fortran::parser::OmpObjectList &ompObjectList,
+      const omp::ObjectList &objects,
       llvm::SetVector<const Fortran::semantics::Symbol *> &symbolSet);
   void collectSymbolsForPrivatization();
   void insertBarrier();
@@ -81,14 +82,15 @@ class DataSharingProcessor {
 
 public:
   DataSharingProcessor(Fortran::lower::AbstractConverter &converter,
+                       Fortran::semantics::SemanticsContext &semaCtx,
                        const Fortran::parser::OmpClauseList &opClauseList,
                        Fortran::lower::pft::Evaluation &eval,
                        bool useDelayedPrivatization = false,
                        Fortran::lower::SymMap *symTable = nullptr)
       : hasLastPrivateOp(false), converter(converter),
-        firOpBuilder(converter.getFirOpBuilder()), opClauseList(opClauseList),
-        eval(eval), useDelayedPrivatization(useDelayedPrivatization),
-        symTable(symTable) {}
+        firOpBuilder(converter.getFirOpBuilder()),
+        clauses(omp::makeList(opClauseList, semaCtx)), eval(eval),
+        useDelayedPrivatization(useDelayedPrivatization), symTable(symTable) {}
 
   // Privatisation is split into two steps.
   // Step1 performs cloning of all privatisation clauses and copying for
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 25bb4d9cff5d1..eaaa8fcd165a9 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -287,9 +287,9 @@ struct OpWithBodyGenInfo {
     return *this;
   }
 
-  OpWithBodyGenInfo &
-  setReductions(llvm::SmallVector<const Fortran::semantics::Symbol *> *value1,
-                llvm::SmallVector<mlir::Type> *value2) {
+  OpWithBodyGenInfo &setReductions(
+      llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *value1,
+      llvm::SmallVectorImpl<mlir::Type> *value2) {
     reductionSymbols = value1;
     reductionTypes = value2;
     return *this;
@@ -317,10 +317,10 @@ struct OpWithBodyGenInfo {
   /// [in] if provided, processes the construct's data-sharing attributes.
   DataSharingProcessor *dsp = nullptr;
   /// [in] if provided, list of reduction symbols
-  llvm::SmallVector<const Fortran::semantics::Symbol *> *reductionSymbols =
+  llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *reductionSymbols =
       nullptr;
   /// [in] if provided, list of reduction types
-  llvm::SmallVector<mlir::Type> *reductionTypes = nullptr;
+  llvm::SmallVectorImpl<mlir::Type> *reductionTypes = nullptr;
   /// [in] if provided, emits the op's region entry. Otherwise, an emtpy block
   /// is created in the region.
   GenOMPRegionEntryCBFn genRegionEntryCB = nullptr;
@@ -373,7 +373,7 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
   std::optional<DataSharingProcessor> tempDsp;
   if (privatize) {
     if (!info.dsp) {
-      tempDsp.emplace(info.converter, *info.clauses, info.eval);
+      tempDsp.emplace(info.converter, info.semaCtx, *info.clauses, info.eval);
       tempDsp->processStep1();
     }
   }
@@ -465,11 +465,9 @@ static void genBodyOfTargetDataOp(
     Fortran::lower::AbstractConverter &converter,
     Fortran::semantics::SemanticsContext &semaCtx,
     Fortran::lower::pft::Evaluation &eval, bool genNested,
-    mlir::omp::DataOp &dataOp,
-    const llvm::SmallVector<mlir::Type> &useDeviceTypes,
-    const llvm::SmallVector<mlir::Location> &useDeviceLocs,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *>
-        &useDeviceSymbols,
+    mlir::omp::DataOp &dataOp, llvm::ArrayRef<mlir::Type> useDeviceTypes,
+    llvm::ArrayRef<mlir::Location> useDeviceLocs,
+    llvm::ArrayRef<const Fortran::semantics::Symbol *> useDeviceSymbols,
     const mlir::Location &currentLocation) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   mlir::Region &region = dataOp.getRegion();
@@ -574,8 +572,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Parallel,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Parallel, ifClauseOperand);
   cp.processNumThreads(stmtCtx, numThreadsClauseOperand);
   cp.processProcBind(procBindKindAttr);
   cp.processDefault();
@@ -628,7 +625,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
   }
 
   bool privatize = !outerCombined;
-  DataSharingProcessor dsp(converter, clauseList, eval,
+  DataSharingProcessor dsp(converter, semaCtx, clauseList, eval,
                            /*useDelayedPrivatization=*/true, &symTable);
 
   if (privatize)
@@ -751,8 +748,7 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
       dependOperands;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Task,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Task, ifClauseOperand);
   cp.processAllocate(allocatorOperands, allocateOperands);
   cp.processDefault();
   cp.processFinal(stmtCtx, finalClauseOperand);
@@ -816,11 +812,12 @@ genTaskGroupOp(Fortran::lower::AbstractConverter &converter,
 //  clause. Support for such list items in a use_device_ptr clause
 //  is deprecated."
 static void promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(
-    llvm::SmallVector<mlir::Value> &devicePtrOperands,
-    llvm::SmallVector<mlir::Value> &deviceAddrOperands,
-    llvm::SmallVector<mlir::Type> &useDeviceTypes,
-    llvm::SmallVector<mlir::Location> &useDeviceLocs,
-    llvm::SmallVector<const Fortran::semantics::Symbol *> &useDeviceSymbols) {
+    llvm::SmallVectorImpl<mlir::Value> &devicePtrOperands,
+    llvm::SmallVectorImpl<mlir::Value> &deviceAddrOperands,
+    llvm::SmallVectorImpl<mlir::Type> &useDeviceTypes,
+    llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
+    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
+        &useDeviceSymbols) {
   auto moveElementToBack = [](size_t idx, auto &vector) {
     auto *iter = std::next(vector.begin(), idx);
     vector.push_back(*iter);
@@ -865,8 +862,7 @@ genDataOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<const Fortran::semantics::Symbol *> useDeviceSymbols;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetData,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::TargetData, ifClauseOperand);
   cp.processDevice(stmtCtx, deviceOperand);
   cp.processUseDevicePtr(devicePtrOperands, useDeviceTypes, useDeviceLocs,
                          useDeviceSymbols);
@@ -911,20 +907,17 @@ genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<mlir::Value> mapOperands, dependOperands;
   llvm::SmallVector<mlir::Attribute> dependTypeOperands;
 
-  Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName;
+  clause::If::DirectiveNameModifier directiveName;
   // GCC 9.3.0 emits a (probably) bogus warning about an unused variable.
   [[maybe_unused]] llvm::omp::Directive directive;
   if constexpr (std::is_same_v<OpTy, mlir::omp::EnterDataOp>) {
-    directiveName =
-        Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetEnterData;
+    directiveName = clause::If::DirectiveNameModifier::TargetEnterData;
     directive = llvm::omp::Directive::OMPD_target_enter_data;
   } else if constexpr (std::is_same_v<OpTy, mlir::omp::ExitDataOp>) {
-    directiveName =
-        Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetExitData;
+    directiveName = clause::If::DirectiveNameModifier::TargetExitData;
     directive = llvm::omp::Directive::OMPD_target_exit_data;
   } else if constexpr (std::is_same_v<OpTy, mlir::omp::UpdateDataOp>) {
-    directiveName =
-        Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetUpdate;
+    directiveName = clause::If::DirectiveNameModifier::TargetUpdate;
     directive = llvm::omp::Directive::OMPD_target_update;
   } else {
     return nullptr;
@@ -957,15 +950,15 @@ genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
 
 // This functions creates a block for the body of the targetOp's region. It adds
 // all the symbols present in mapSymbols as block arguments to this block.
-static void genBodyOfTargetOp(
-    Fortran::lower::AbstractConverter &converter,
-    Fortran::semantics::SemanticsContext &semaCtx,
-    Fortran::lower::pft::Evaluation &eval, bool genNested,
-    mlir::omp::TargetOp &targetOp,
-    const llvm::SmallVector<mlir::Type> &mapSymTypes,
-    const llvm::SmallVector<mlir::Location> &mapSymLocs,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *> &mapSymbols,
-    const mlir::Location &currentLocation) {
+static void
+genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter,
+                  Fortran::semantics::SemanticsContext &semaCtx,
+                  Fortran::lower::pft::Evaluation &eval, bool genNested,
+                  mlir::omp::TargetOp &targetOp,
+                  llvm::ArrayRef<mlir::Type> mapSymTypes,
+                  llvm::ArrayRef<mlir::Location> mapSymLocs,
+                  llvm::ArrayRef<const Fortran::semantics::Symbol *> mapSymbols,
+                  const mlir::Location &currentLocation) {
   assert(mapSymTypes.size() == mapSymLocs.size());
 
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -1126,8 +1119,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<const Fortran::semantics::Symbol *> mapSymbols;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Target,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Target, ifClauseOperand);
   cp.processDevice(stmtCtx, deviceOperand);
   cp.processThreadLimit(stmtCtx, threadLimitOperand);
   cp.processDepend(dependTypeOperands, dependOperands);
@@ -1258,8 +1250,7 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Teams,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Teams, ifClauseOperand);
   cp.processAllocate(allocatorOperands, allocateOperands);
   cp.processDefault();
   cp.processNumTeams(stmtCtx, numTeamsClauseOperand);
@@ -1298,8 +1289,9 @@ static mlir::omp::DeclareTargetDeviceType getDeclareTargetInfo(
 
   if (const auto *objectList{
           Fortran::parser::Unwrap<Fortran::parser::OmpObjectList>(spec.u)}) {
+    ObjectList objects{makeList(*objectList, semaCtx)};
     // Case: declare target(func, var1, var2)
-    gatherFuncAndVarSyms(*objectList, mlir::omp::DeclareTargetCaptureClause::to,
+    gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to,
                          symbolAndClause);
   } else if (const auto *clauseList{
                  Fortran::parser::Unwrap<Fortran::parser::OmpClauseList>(
@@ -1438,7 +1430,7 @@ genOmpFlush(Fortran::lower::AbstractConverter &converter,
   if (const auto &ompObjectList =
           std::get<std::optional<Fortran::parser::OmpObjectList>>(
               flushConstruct.t))
-    genObjectList(*ompObjectList, converter, operandRange);
+    genObjectList2(*ompObjectList, converter, operandRange);
   const auto &memOrderClause =
       std::get<std::optional<std::list<Fortran::parser::OmpMemoryOrderClause>>>(
           flushConstruct.t);
@@ -1498,7 +1490,7 @@ static void convertLoopBounds(Fortran::lower::AbstractConverter &converter,
 static llvm::SmallVector<const Fortran::semantics::Symbol *>
 genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
             mlir::Location &loc,
-            const llvm::SmallVector<const Fortran::semantics::Symbol *> &args) {
+            llvm::ArrayRef<const Fortran::semantics::Symbol *> args) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   auto &region = op->getRegion(0);
 
@@ -1519,16 +1511,16 @@ genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
   }
   firOpBuilder.setInsertionPointAfter(storeOp);
 
-  return args;
+  return llvm::SmallVector<const Fortran::semantics::Symbol *>(args);
 }
 
 static llvm::SmallVector<const Fortran::semantics::Symbol *>
 genLoopAndReductionVars(
     mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
     mlir::Location &loc,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *> &loopArgs,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *> &reductionArgs,
-    llvm::SmallVector<mlir::Type> &reductionTypes) {
+    llvm::ArrayRef<const Fortran::semantics::Symbol *> loopArgs,
+    llvm::ArrayRef<const Fortran::semantics::Symbol *> reductionArgs,
+    llvm::SmallVectorImpl<mlir::Type> &reductionTypes) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
   llvm::SmallVector<mlir::Type> blockArgTypes;
@@ -1571,7 +1563,7 @@ genLoopAndReductionVars(
     converter.bindSymbol(*arg, prv);
   }
 
-  return loopArgs;
+  return llvm::SmallVector<const Fortran::semantics::Symbol *>(loopArgs);
 }
 
 static void
@@ -1582,7 +1574,7 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
                const Fortran::parser::OmpClauseList &loopOpClauseList,
                mlir::Location loc) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  DataSharingProcessor dsp(converter, loopOpClauseList, eval);
+  DataSharingProcessor dsp(converter, semaCtx, loopOpClauseList, eval);
   dsp.processStep1();
 
   Fortran::lower::StatementContext stmtCtx;
@@ -1600,8 +1592,7 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
                      loopVarTypeSize);
   cp.processScheduleChunk(stmtCtx, scheduleChunkClauseOperand);
   cp.processReduction(loc, reductionVars, reductionDeclSymbols);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Simd,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Simd, ifClauseOperand);
   cp.processSimdlen(simdlenClauseOperand);
   cp.processSafelen(safelenClauseOperand);
   cp.processTODO<Fortran::parser::OmpClause::Aligned,
@@ -1642,7 +1633,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
                          const Fortran::parser::OmpClauseList *endClauseList,
                          mlir::Location loc) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  DataSharingProcessor dsp(converter, beginClauseList, eval);
+  DataSharingProcessor dsp(converter, semaCtx, beginClauseList, eval);
   dsp.processStep1();
 
   Fortran::lower::StatementContext stmtCtx;
@@ -2419,106 +2410,100 @@ void Fortran::lower::genOpenMPReduction(
     const Fortran::parser::OmpClauseList &clauseList) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  for (const Fortran::parser::OmpClause &clause : clauseList.v) {
+  List<Clause> clauses{makeList(clauseList, semaCtx)};
+
+  for (const Clause &clause : clauses) {
     if (const auto &reductionClause =
-            std::get_if<Fortran::parser::OmpClause::Reduction>(&clause.u)) {
-      const auto &redOperator{std::get<Fortran::parser::OmpReductionOperator>(
-          reductionClause->v.t)};
-      const auto &objectList{
-          std::get<Fortran::parser::OmpObjectList>(reductionClause->v.t)};
+            std::get_if<clause::Reduction>(&clause.u)) {
+      const auto &redOperator{
+          std::get<clause::ReductionOperator>(reductionClause->t)};
+      const auto &objects{std::get<ObjectList>(reductionClause->t)};
       if (const auto *reductionOp =
-              std::get_if<Fortran::parser::DefinedOperator>(&redOperator.u)) {
+              std::get_if<clause::DefinedOperator>(&redOperator.u)) {
         const auto &intrinsicOp{
-            std::get<Fortran::parser::DefinedOperator::IntrinsicOperator>(
+            std::get<clause::DefinedOperator::IntrinsicOperator>(
                 reductionOp->u)};
 
         switch (intrinsicOp) {
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::AND:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::OR:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV:
+        case clause::DefinedOperator::IntrinsicOperator::Add:
+        case clause::DefinedOperator::IntrinsicOperator::Multiply:
+        case clause::DefinedOperator::IntrinsicOperator::AND:
+        case clause::DefinedOperator::IntrinsicOperator::EQV:
+        case clause::DefinedOperator::IntrinsicOperator::OR:
+        case clause::DefinedOperator::IntrinsicOperator::NEQV:
           break;
         default:
           continue;
         }
-        for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-          if (const auto *name{
-                  Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-            if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-              mlir::Value reductionVal = converter.getSymbolAddress(*symbol);
-              if (auto declOp = reductionVal.getDefiningOp<hlfir::DeclareOp>())
-                reductionVal = declOp.getBase();
-              mlir::Type reductionType =
-                  reductionVal.getType().cast<fir::ReferenceType>().getEleTy();
-              if (!reductionType.isa<fir::LogicalType>()) {
-                if (!reductionType.isIntOrIndexOrFloat())
-                  continue;
-              }
-              for (mlir::OpOperand &reductionValUse : reductionVal.getUses()) {
-                if (auto loadOp = mlir::dyn_cast<fir::LoadOp>(
-                        reductionValUse.getOwner())) {
-                  mlir::Value loadVal = loadOp.getRes();
-                  if (reductionType.isa<fir::LogicalType>()) {
-                    mlir::Operation *reductionOp = findReductionChain(loadVal);
-                    fir::ConvertOp convertOp =
-                        getConvertFromReductionOp(reductionOp, loadVal);
-                    updateReduction(reductionOp, firOpBuilder, loadVal,
-                                    reductionVal, &convertOp);
-                    removeStoreOp(reductionOp, reductionVal);
-                  } else if (mlir::Operation *reductionOp =
-                                 findReductionChain(loadVal, &reductionVal)) {
-                    updateReduction(reductionOp, firOpBuilder, loadVal,
-                                    reductionVal);
-                  }
+        for (const Object &object : objects) {
+          if (const Fortran::semantics::Symbol *symbol = object.id()) {
+            mlir::Value reductionVal = converter.getSymbolAddress(*symbol);
+            if (auto declOp = reductionVal.getDefiningOp<hlfir::DeclareOp>())
+              reductionVal = declOp.getBase();
+            mlir::Type reductionType =
+                reductionVal.getType().cast<fir::ReferenceType>().getEleTy();
+            if (!reductionType.isa<fir::LogicalType>()) {
+              if (!reductionType.isIntOrIndexOrFloat())
+                continue;
+            }
+            for (mlir::OpOperand &reductionValUse : reductionVal.getUses()) {
+              if (auto loadOp =
+                      mlir::dyn_cast<fir::LoadOp>(reductionValUse.getOwner())) {
+                mlir::Value loadVal = loadOp.getRes();
+                if (reductionType.isa<fir::LogicalType>()) {
+                  mlir::Operation *reductionOp = findReductionChain(loadVal);
+                  fir::ConvertOp convertOp =
+                      getConvertFromReductionOp(reductionOp, loadVal);
+                  updateReduction(reductionOp, firOpBuilder, loadVal,
+                                  reductionVal, &convertOp);
+                  removeStoreOp(reductionOp, reductionVal);
+                } else if (mlir::Operation *reductionOp =
+                               findReductionChain(loadVal, &reductionVal)) {
+                  updateReduction(reductionOp, firOpBuilder, loadVal,
+                                  reductionVal);
                 }
               }
             }
           }
         }
       } else if (const auto *reductionIntrinsic =
-                     std::get_if<Fortran::parser::ProcedureDesignator>(
-                         &redOperator.u)) {
+                     std::get_if<clause::ProcedureDesignator>(&redOperator.u)) {
         if (!ReductionProcessor::supportedIntrinsicProcReduction(
                 *reductionIntrinsic))
           continue;
         ReductionProcessor::ReductionIdentifier redId =
             ReductionProcessor::getReductionType(*reductionIntrinsic);
-        for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-          if (const auto *name{
-                  Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-            if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-              mlir::Value reductionVal = converter.getSymbolAddress(*symbol);
-              if (auto declOp = reductionVal.getDefiningOp<hlfir::DeclareOp>())
-                reductionVal = declOp.getBase();
-              for (const mlir::OpOperand &reductionValUse :
-                   reductionVal.getUses()) {
-                if (auto loadOp = mlir::dyn_cast<fir::LoadOp>(
-                        reductionValUse.getOwner())) {
-                  mlir::Value loadVal = loadOp.getRes();
-                  // Max is lowered as a compare -> select.
-                  // Match the pattern here.
-                  mlir::Operation *reductionOp =
-                      findReductionChain(loadVal, &reductionVal);
-                  if (reductionOp == nullptr)
-                    continue;
-
-                  if (redId == ReductionProcessor::ReductionIdentifier::MAX ||
-                      redId == ReductionProcessor::ReductionIdentifier::MIN) {
-                    assert(mlir::isa<mlir::arith::SelectOp>(reductionOp) &&
-                           "Selection Op not found in reduction intrinsic");
-                    mlir::Operation *compareOp =
-                        getCompareFromReductionOp(reductionOp, loadVal);
-                    updateReduction(compareOp, firOpBuilder, loadVal,
-                                    reductionVal);
-                  }
-                  if (redId == ReductionProcessor::ReductionIdentifier::IOR ||
-                      redId == ReductionProcessor::ReductionIdentifier::IEOR ||
-                      redId == ReductionProcessor::ReductionIdentifier::IAND) {
-                    updateReduction(reductionOp, firOpBuilder, loadVal,
-                                    reductionVal);
-                  }
+        for (const Object &object : objects) {
+          if (const Fortran::semantics::Symbol *symbol = object.id()) {
+            mlir::Value reductionVal = converter.getSymbolAddress(*symbol);
+            if (auto declOp = reductionVal.getDefiningOp<hlfir::DeclareOp>())
+              reductionVal = declOp.getBase();
+            for (const mlir::OpOperand &reductionValUse :
+                 reductionVal.getUses()) {
+              if (auto loadOp =
+                      mlir::dyn_cast<fir::LoadOp>(reductionValUse.getOwner())) {
+                mlir::Value loadVal = loadOp.getRes();
+                // Max is lowered as a compare -> select.
+                // Match the pattern here.
+                mlir::Operation *reductionOp =
+                    findReductionChain(loadVal, &reductionVal);
+                if (reductionOp == nullptr)
+                  continue;
+
+                if (redId == ReductionProcessor::ReductionIdentifier::MAX ||
+                    redId == ReductionProcessor::ReductionIdentifier::MIN) {
+                  assert(mlir::isa<mlir::arith::SelectOp>(reductionOp) &&
+                         "Selection Op not found in reduction intrinsic");
+                  mlir::Operation *compareOp =
+                      getCompareFromReductionOp(reductionOp, loadVal);
+                  updateReduction(compareOp, firOpBuilder, loadVal,
+                                  reductionVal);
+                }
+                if (redId == ReductionProcessor::ReductionIdentifier::IOR ||
+                    redId == ReductionProcessor::ReductionIdentifier::IEOR ||
+                    redId == ReductionProcessor::ReductionIdentifier::IAND) {
+                  updateReduction(reductionOp, firOpBuilder, loadVal,
+                                  reductionVal);
                 }
               }
             }
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index e6a63dd4b939c..6dc467c4f69bc 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -30,9 +30,9 @@ namespace lower {
 namespace omp {
 
 ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType(
-    const Fortran::parser::ProcedureDesignator &pd) {
+    const omp::clause::ProcedureDesignator &pd) {
   auto redType = llvm::StringSwitch<std::optional<ReductionIdentifier>>(
-                     ReductionProcessor::getRealName(pd).ToString())
+                     getRealName(pd.v.id()).ToString())
                      .Case("max", ReductionIdentifier::MAX)
                      .Case("min", ReductionIdentifier::MIN)
                      .Case("iand", ReductionIdentifier::IAND)
@@ -44,21 +44,21 @@ ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType(
 }
 
 ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType(
-    Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp) {
+    omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp) {
   switch (intrinsicOp) {
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Add:
     return ReductionIdentifier::ADD;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Subtract:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Subtract:
     return ReductionIdentifier::SUBTRACT;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Multiply:
     return ReductionIdentifier::MULTIPLY;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::AND:
+  case omp::clause::DefinedOperator::IntrinsicOperator::AND:
     return ReductionIdentifier::AND;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV:
+  case omp::clause::DefinedOperator::IntrinsicOperator::EQV:
     return ReductionIdentifier::EQV;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::OR:
+  case omp::clause::DefinedOperator::IntrinsicOperator::OR:
     return ReductionIdentifier::OR;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV:
+  case omp::clause::DefinedOperator::IntrinsicOperator::NEQV:
     return ReductionIdentifier::NEQV;
   default:
     llvm_unreachable("unexpected intrinsic operator in reduction");
@@ -66,13 +66,11 @@ ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType(
 }
 
 bool ReductionProcessor::supportedIntrinsicProcReduction(
-    const Fortran::parser::ProcedureDesignator &pd) {
-  const auto *name{Fortran::parser::Unwrap<Fortran::parser::Name>(pd)};
-  assert(name && "Invalid Reduction Intrinsic.");
-  if (!name->symbol->GetUltimate().attrs().test(
-          Fortran::semantics::Attr::INTRINSIC))
+    const omp::clause::ProcedureDesignator &pd) {
+  Fortran::semantics::Symbol *sym = pd.v.id();
+  if (!sym->GetUltimate().attrs().test(Fortran::semantics::Attr::INTRINSIC))
     return false;
-  auto redType = llvm::StringSwitch<bool>(getRealName(name).ToString())
+  auto redType = llvm::StringSwitch<bool>(getRealName(sym).ToString())
                      .Case("max", true)
                      .Case("min", true)
                      .Case("iand", true)
@@ -99,24 +97,24 @@ std::string ReductionProcessor::getReductionName(llvm::StringRef name,
 }
 
 std::string ReductionProcessor::getReductionName(
-    Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
-    mlir::Type ty, bool isByRef) {
+    omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp, mlir::Type ty,
+    bool isByRef) {
   std::string reductionName;
 
   switch (intrinsicOp) {
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Add:
     reductionName = "add_reduction";
     break;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Multiply:
     reductionName = "multiply_reduction";
     break;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::AND:
+  case omp::clause::DefinedOperator::IntrinsicOperator::AND:
     return "and_reduction";
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV:
+  case omp::clause::DefinedOperator::IntrinsicOperator::EQV:
     return "eqv_reduction";
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::OR:
+  case omp::clause::DefinedOperator::IntrinsicOperator::OR:
     return "or_reduction";
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV:
+  case omp::clause::DefinedOperator::IntrinsicOperator::NEQV:
     return "neqv_reduction";
   default:
     reductionName = "other_reduction";
@@ -364,7 +362,7 @@ bool ReductionProcessor::doReductionByRef(
 void ReductionProcessor::addReductionDecl(
     mlir::Location currentLocation,
     Fortran::lower::AbstractConverter &converter,
-    const Fortran::parser::OmpReductionClause &reduction,
+    const omp::clause::Reduction &reduction,
     llvm::SmallVectorImpl<mlir::Value> &reductionVars,
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
@@ -372,13 +370,12 @@ void ReductionProcessor::addReductionDecl(
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   mlir::omp::ReductionDeclareOp decl;
   const auto &redOperator{
-      std::get<Fortran::parser::OmpReductionOperator>(reduction.t)};
-  const auto &objectList{std::get<Fortran::parser::OmpObjectList>(reduction.t)};
+      std::get<omp::clause::ReductionOperator>(reduction.t)};
+  const auto &objectList{std::get<omp::ObjectList>(reduction.t)};
 
-  if (!std::holds_alternative<Fortran::parser::DefinedOperator>(
-          redOperator.u)) {
+  if (!std::holds_alternative<omp::clause::DefinedOperator>(redOperator.u)) {
     if (const auto *reductionIntrinsic =
-            std::get_if<Fortran::parser::ProcedureDesignator>(&redOperator.u)) {
+            std::get_if<omp::clause::ProcedureDesignator>(&redOperator.u)) {
       if (!ReductionProcessor::supportedIntrinsicProcReduction(
               *reductionIntrinsic)) {
         return;
@@ -388,27 +385,23 @@ void ReductionProcessor::addReductionDecl(
     }
   }
 
-  // initial pass to collect all recuction vars so we can figure out if this
+  // initial pass to collect all reduction vars so we can figure out if this
   // should happen byref
-  for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-    if (const auto *name{
-            Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-      if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-        if (reductionSymbols)
-          reductionSymbols->push_back(symbol);
-        mlir::Value symVal = converter.getSymbolAddress(*symbol);
-        if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
-          symVal = declOp.getBase();
-        reductionVars.push_back(symVal);
-      }
-    }
+  for (const Object &object : objectList) {
+    const Fortran::semantics::Symbol *symbol = object.id();
+    if (reductionSymbols)
+      reductionSymbols->push_back(symbol);
+    mlir::Value symVal = converter.getSymbolAddress(*symbol);
+    if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
+      symVal = declOp.getBase();
+    reductionVars.push_back(symVal);
   }
   const bool isByRef = doReductionByRef(reductionVars);
 
   if (const auto &redDefinedOp =
-          std::get_if<Fortran::parser::DefinedOperator>(&redOperator.u)) {
+          std::get_if<omp::clause::DefinedOperator>(&redOperator.u)) {
     const auto &intrinsicOp{
-        std::get<Fortran::parser::DefinedOperator::IntrinsicOperator>(
+        std::get<omp::clause::DefinedOperator::IntrinsicOperator>(
             redDefinedOp->u)};
     ReductionIdentifier redId = getReductionType(intrinsicOp);
     switch (redId) {
@@ -424,73 +417,63 @@ void ReductionProcessor::addReductionDecl(
            "Reduction of some intrinsic operators is not supported");
       break;
     }
-    for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-      if (const auto *name{
-              Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-        if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-          mlir::Value symVal = converter.getSymbolAddress(*symbol);
-          if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
-            symVal = declOp.getBase();
-          auto redType = symVal.getType().cast<fir::ReferenceType>();
-          if (redType.getEleTy().isa<fir::LogicalType>())
-            decl = createReductionDecl(
-                firOpBuilder,
-                getReductionName(intrinsicOp, firOpBuilder.getI1Type(),
-                                 isByRef),
-                redId, redType, currentLocation, isByRef);
-          else if (redType.getEleTy().isIntOrIndexOrFloat()) {
-            decl = createReductionDecl(
-                firOpBuilder, getReductionName(intrinsicOp, redType, isByRef),
-                redId, redType, currentLocation, isByRef);
-          } else {
-            TODO(currentLocation, "Reduction of some types is not supported");
-          }
-          reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
-              firOpBuilder.getContext(), decl.getSymName()));
-        }
+
+    for (const Object &object : objectList) {
+      const Fortran::semantics::Symbol *symbol = object.id();
+      mlir::Value symVal = converter.getSymbolAddress(*symbol);
+      if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
+        symVal = declOp.getBase();
+      auto redType = symVal.getType().cast<fir::ReferenceType>();
+      if (redType.getEleTy().isa<fir::LogicalType>())
+        decl = createReductionDecl(
+            firOpBuilder,
+            getReductionName(intrinsicOp, firOpBuilder.getI1Type(), isByRef),
+            redId, redType, currentLocation, isByRef);
+      else if (redType.getEleTy().isIntOrIndexOrFloat()) {
+        decl = createReductionDecl(
+            firOpBuilder, getReductionName(intrinsicOp, redType, isByRef),
+            redId, redType, currentLocation, isByRef);
+      } else {
+        TODO(currentLocation, "Reduction of some types is not supported");
       }
+      reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
+          firOpBuilder.getContext(), decl.getSymName()));
     }
   } else if (const auto *reductionIntrinsic =
-                 std::get_if<Fortran::parser::ProcedureDesignator>(
+                 std::get_if<omp::clause::ProcedureDesignator>(
                      &redOperator.u)) {
     if (ReductionProcessor::supportedIntrinsicProcReduction(
             *reductionIntrinsic)) {
       ReductionProcessor::ReductionIdentifier redId =
           ReductionProcessor::getReductionType(*reductionIntrinsic);
-      for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-        if (const auto *name{
-                Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-          if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-            mlir::Value symVal = converter.getSymbolAddress(*symbol);
-            if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
-              symVal = declOp.getBase();
-            auto redType = symVal.getType().cast<fir::ReferenceType>();
-            assert(redType.getEleTy().isIntOrIndexOrFloat() &&
-                   "Unsupported reduction type");
-            decl = createReductionDecl(
-                firOpBuilder,
-                getReductionName(getRealName(*reductionIntrinsic).ToString(),
-                                 redType, isByRef),
-                redId, redType, currentLocation, isByRef);
-            reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
-                firOpBuilder.getContext(), decl.getSymName()));
-          }
-        }
+      for (const Object &object : objectList) {
+        const Fortran::semantics::Symbol *symbol = object.id();
+        mlir::Value symVal = converter.getSymbolAddress(*symbol);
+        if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
+          symVal = declOp.getBase();
+        auto redType = symVal.getType().cast<fir::ReferenceType>();
+        assert(redType.getEleTy().isIntOrIndexOrFloat() &&
+               "Unsupported reduction type");
+        decl = createReductionDecl(
+            firOpBuilder,
+            getReductionName(getRealName(*reductionIntrinsic).ToString(),
+                             redType, isByRef),
+            redId, redType, currentLocation, isByRef);
+        reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
+            firOpBuilder.getContext(), decl.getSymName()));
       }
     }
   }
 }
 
 const Fortran::semantics::SourceName
-ReductionProcessor::getRealName(const Fortran::parser::Name *name) {
-  return name->symbol->GetUltimate().name();
+ReductionProcessor::getRealName(const Fortran::semantics::Symbol *symbol) {
+  return symbol->GetUltimate().name();
 }
 
-const Fortran::semantics::SourceName ReductionProcessor::getRealName(
-    const Fortran::parser::ProcedureDesignator &pd) {
-  const auto *name{Fortran::parser::Unwrap<Fortran::parser::Name>(pd)};
-  assert(name && "Invalid Reduction Intrinsic.");
-  return getRealName(name);
+const Fortran::semantics::SourceName
+ReductionProcessor::getRealName(const omp::clause::ProcedureDesignator &pd) {
+  return getRealName(pd.v.id());
 }
 
 int ReductionProcessor::getOperationIdentity(ReductionIdentifier redId,
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/lib/Lower/OpenMP/ReductionProcessor.h
index 679580f2a3cac..ef6339407c135 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.h
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.h
@@ -13,6 +13,7 @@
 #ifndef FORTRAN_LOWER_REDUCTIONPROCESSOR_H
 #define FORTRAN_LOWER_REDUCTIONPROCESSOR_H
 
+#include "Clauses.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Parser/parse-tree.h"
@@ -58,19 +59,19 @@ class ReductionProcessor {
   };
 
   static ReductionIdentifier
-  getReductionType(const Fortran::parser::ProcedureDesignator &pd);
+  getReductionType(const omp::clause::ProcedureDesignator &pd);
 
-  static ReductionIdentifier getReductionType(
-      Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp);
+  static ReductionIdentifier
+  getReductionType(omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp);
 
-  static bool supportedIntrinsicProcReduction(
-      const Fortran::parser::ProcedureDesignator &pd);
+  static bool
+  supportedIntrinsicProcReduction(const omp::clause::ProcedureDesignator &pd);
 
   static const Fortran::semantics::SourceName
-  getRealName(const Fortran::parser::Name *name);
+  getRealName(const Fortran::semantics::Symbol *symbol);
 
   static const Fortran::semantics::SourceName
-  getRealName(const Fortran::parser::ProcedureDesignator &pd);
+  getRealName(const omp::clause::ProcedureDesignator &pd);
 
   static bool
   doReductionByRef(const llvm::SmallVectorImpl<mlir::Value> &reductionVars);
@@ -78,9 +79,9 @@ class ReductionProcessor {
   static std::string getReductionName(llvm::StringRef name, mlir::Type ty,
                                       bool isByRef);
 
-  static std::string getReductionName(
-      Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
-      mlir::Type ty, bool isByRef);
+  static std::string
+  getReductionName(omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp,
+                   mlir::Type ty, bool isByRef);
 
   /// This function returns the identity value of the operator \p
   /// reductionOpName. For example:
@@ -119,7 +120,7 @@ class ReductionProcessor {
   static void
   addReductionDecl(mlir::Location currentLocation,
                    Fortran::lower::AbstractConverter &converter,
-                   const Fortran::parser::OmpReductionClause &reduction,
+                   const omp::clause::Reduction &reduction,
                    llvm::SmallVectorImpl<mlir::Value> &reductionVars,
                    llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
                    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 49517f62895df..fa4a51e338483 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Utils.h"
+#include "Clauses.h"
 
 #include <flang/Lower/AbstractConverter.h>
 #include <flang/Lower/ConvertType.h>
@@ -34,19 +35,33 @@ namespace Fortran {
 namespace lower {
 namespace omp {
 
-void genObjectList(const Fortran::parser::OmpObjectList &objectList,
+void genObjectList(const ObjectList &objects,
                    Fortran::lower::AbstractConverter &converter,
                    llvm::SmallVectorImpl<mlir::Value> &operands) {
+  for (const Object &object : objects) {
+    const Fortran::semantics::Symbol *sym = object.id();
+    assert(sym && "Expected Symbol");
+    if (mlir::Value variable = converter.getSymbolAddress(*sym)) {
+      operands.push_back(variable);
+    } else if (const auto *details =
+                   sym->detailsIf<Fortran::semantics::HostAssocDetails>()) {
+      operands.push_back(converter.getSymbolAddress(details->symbol()));
+      converter.copySymbolBinding(details->symbol(), *sym);
+    }
+  }
+}
+
+void genObjectList2(const Fortran::parser::OmpObjectList &objectList,
+                    Fortran::lower::AbstractConverter &converter,
+                    llvm::SmallVectorImpl<mlir::Value> &operands) {
   auto addOperands = [&](Fortran::lower::SymbolRef sym) {
     const mlir::Value variable = converter.getSymbolAddress(sym);
     if (variable) {
       operands.push_back(variable);
-    } else {
-      if (const auto *details =
-              sym->detailsIf<Fortran::semantics::HostAssocDetails>()) {
-        operands.push_back(converter.getSymbolAddress(details->symbol()));
-        converter.copySymbolBinding(details->symbol(), sym);
-      }
+    } else if (const auto *details =
+                   sym->detailsIf<Fortran::semantics::HostAssocDetails>()) {
+      operands.push_back(converter.getSymbolAddress(details->symbol()));
+      converter.copySymbolBinding(details->symbol(), sym);
     }
   };
   for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
@@ -56,24 +71,10 @@ void genObjectList(const Fortran::parser::OmpObjectList &objectList,
 }
 
 void gatherFuncAndVarSyms(
-    const Fortran::parser::OmpObjectList &objList,
-    mlir::omp::DeclareTargetCaptureClause clause,
+    const ObjectList &objects, mlir::omp::DeclareTargetCaptureClause clause,
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause) {
-  for (const Fortran::parser::OmpObject &ompObject : objList.v) {
-    Fortran::common::visit(
-        Fortran::common::visitors{
-            [&](const Fortran::parser::Designator &designator) {
-              if (const Fortran::parser::Name *name =
-                      Fortran::semantics::getDesignatorNameIfDataRef(
-                          designator)) {
-                symbolAndClause.emplace_back(clause, *name->symbol);
-              }
-            },
-            [&](const Fortran::parser::Name &name) {
-              symbolAndClause.emplace_back(clause, *name.symbol);
-            }},
-        ompObject.u);
-  }
+  for (const Object &object : objects)
+    symbolAndClause.emplace_back(clause, *object.id());
 }
 
 Fortran::semantics::Symbol *
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index 76a15e8bcaab9..3ab0823a46214 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_LOWER_OPENMPUTILS_H
 #define FORTRAN_LOWER_OPENMPUTILS_H
 
+#include "Clauses.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/Value.h"
@@ -45,23 +46,26 @@ using DeclareTargetCapturePair =
 mlir::omp::MapInfoOp
 createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
                 mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name,
-                mlir::SmallVector<mlir::Value> bounds,
-                mlir::SmallVector<mlir::Value> members, uint64_t mapType,
+                mlir::ArrayRef<mlir::Value> bounds,
+                mlir::ArrayRef<mlir::Value> members, uint64_t mapType,
                 mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
                 bool isVal = false);
 
 void gatherFuncAndVarSyms(
-    const Fortran::parser::OmpObjectList &objList,
-    mlir::omp::DeclareTargetCaptureClause clause,
+    const ObjectList &objects, mlir::omp::DeclareTargetCaptureClause clause,
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause);
 
 Fortran::semantics::Symbol *
 getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject);
 
-void genObjectList(const Fortran::parser::OmpObjectList &objectList,
+void genObjectList(const ObjectList &objects,
                    Fortran::lower::AbstractConverter &converter,
                    llvm::SmallVectorImpl<mlir::Value> &operands);
 
+void genObjectList2(const Fortran::parser::OmpObjectList &objectList,
+                    Fortran::lower::AbstractConverter &converter,
+                    llvm::SmallVectorImpl<mlir::Value> &operands);
+
 } // namespace omp
 } // namespace lower
 } // namespace Fortran
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 12da7412888a3..f7327a299d9a5 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -208,6 +208,8 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
               .getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
     return ompOutlineableIface.getAllocaBlock();
   }
+  if (mlir::isa<mlir::omp::ReductionDeclareOp>(getRegion().getParentOp()))
+    return &getRegion().front();
   if (auto accRecipeIface =
           getRegion().getParentOfType<mlir::acc::RecipeInterface>()) {
     return accRecipeIface.getAllocaBlock(getRegion());
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 94fcfa3503114..eb8f5135ff12e 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -177,6 +177,8 @@ static constexpr IntrinsicHandler handlers[]{
      /*isElemental=*/false},
     {"c_funloc", &I::genCFunLoc, {{{"x", asBox}}}, /*isElemental=*/false},
     {"c_loc", &I::genCLoc, {{{"x", asBox}}}, /*isElemental=*/false},
+    {"c_ptr_eq", &I::genCPtrCompare<mlir::arith::CmpIPredicate::eq>},
+    {"c_ptr_ne", &I::genCPtrCompare<mlir::arith::CmpIPredicate::ne>},
     {"ceiling", &I::genCeiling},
     {"char", &I::genChar},
     {"cmplx",
@@ -2797,6 +2799,23 @@ IntrinsicLibrary::genCLoc(mlir::Type resultType,
   return genCLocOrCFunLoc(builder, loc, resultType, args);
 }
 
+// C_PTR_EQ and C_PTR_NE
+template <mlir::arith::CmpIPredicate pred>
+fir::ExtendedValue
+IntrinsicLibrary::genCPtrCompare(mlir::Type resultType,
+                                 llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 2);
+  mlir::Value cPtr1 = fir::getBase(args[0]);
+  mlir::Value cPtrVal1 =
+      fir::factory::genCPtrOrCFunptrValue(builder, loc, cPtr1);
+  mlir::Value cPtr2 = fir::getBase(args[1]);
+  mlir::Value cPtrVal2 =
+      fir::factory::genCPtrOrCFunptrValue(builder, loc, cPtr2);
+  mlir::Value cmp =
+      builder.create<mlir::arith::CmpIOp>(loc, pred, cPtrVal1, cPtrVal2);
+  return builder.createConvert(loc, resultType, cmp);
+}
+
 // CEILING
 mlir::Value IntrinsicLibrary::genCeiling(mlir::Type resultType,
                                          llvm::ArrayRef<mlir::Value> args) {
@@ -5240,6 +5259,13 @@ mlir::Value IntrinsicLibrary::genModulo(mlir::Type resultType,
                                                  remainder);
   }
 
+  // F128 arith::RemFOp may be lowered to a runtime call that may be unsupported
+  // on the target, so generate a call to Fortran Runtime's ModuloReal16.
+  if (resultType == mlir::FloatType::getF128(builder.getContext()))
+    return builder.createConvert(
+        loc, resultType,
+        fir::runtime::genModulo(builder, loc, args[0], args[1]));
+
   auto remainder = builder.create<mlir::arith::RemFOp>(loc, args[0], args[1]);
   mlir::Value zero = builder.createRealZeroConstant(loc, remainder.getType());
   auto remainderIsNotZero = builder.create<mlir::arith::CmpFOp>(
diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
index b958a30eb6e5b..4dcbd13cb319f 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
@@ -118,6 +118,20 @@ struct ForcedMod16 {
   }
 };
 
+/// Placeholder for real*16 version of Modulo Intrinsic
+struct ForcedModulo16 {
+  static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ModuloReal16));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto fltTy = mlir::FloatType::getF128(ctx);
+      auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8));
+      auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int));
+      return mlir::FunctionType::get(ctx, {fltTy, fltTy, strTy, intTy},
+                                     {fltTy});
+    };
+  }
+};
+
 /// Placeholder for real*10 version of Nearest Intrinsic
 struct ForcedNearest10 {
   static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Nearest10));
@@ -323,6 +337,33 @@ mlir::Value fir::runtime::genMod(fir::FirOpBuilder &builder, mlir::Location loc,
   return builder.create<fir::CallOp>(loc, func, args).getResult(0);
 }
 
+/// Generate call to Modulo intrinsic runtime routine.
+mlir::Value fir::runtime::genModulo(fir::FirOpBuilder &builder,
+                                    mlir::Location loc, mlir::Value a,
+                                    mlir::Value p) {
+  mlir::func::FuncOp func;
+  mlir::Type fltTy = a.getType();
+
+  if (fltTy != p.getType())
+    fir::emitFatalError(loc, "arguments type mismatch in MOD");
+
+  // MODULO is lowered into math operations in intrinsics lowering,
+  // so genModulo() should only be used for F128 data type now.
+  if (fltTy.isF128())
+    func = fir::runtime::getRuntimeFunc<ForcedModulo16>(loc, builder);
+  else
+    fir::intrinsicTypeTODO(builder, fltTy, loc, "MODULO");
+
+  auto funcTy = func.getFunctionType();
+  auto sourceFile = fir::factory::locationToFilename(builder, loc);
+  auto sourceLine =
+      fir::factory::locationToLineNo(builder, loc, funcTy.getInput(3));
+  auto args = fir::runtime::createArguments(builder, loc, funcTy, a, p,
+                                            sourceFile, sourceLine);
+
+  return builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
+
 /// Generate call to Nearest intrinsic runtime routine.
 mlir::Value fir::runtime::genNearest(fir::FirOpBuilder &builder,
                                      mlir::Location loc, mlir::Value x,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index f81a08388da72..e941a0226ca50 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -37,6 +37,7 @@
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/Transforms/AddComdats.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
@@ -49,7 +50,6 @@
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/TypeSwitch.h"
-#include <mlir/Dialect/LLVMIR/LLVMAttrs.h>
 
 namespace fir {
 #define GEN_PASS_DEF_FIRTOLLVMLOWERING
@@ -410,8 +410,15 @@ class FIROpConversion : public mlir::ConvertOpToLLVMPattern<FromOp> {
       mlir::ConversionPatternRewriter &rewriter) const {
     auto thisPt = rewriter.saveInsertionPoint();
     mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
-    mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp);
-    rewriter.setInsertionPointToStart(insertBlock);
+    if (mlir::isa<mlir::omp::ReductionDeclareOp>(parentOp)) {
+      // ReductionDeclareOp has multiple child regions. We want to get the first
+      // block of whichever of those regions we are currently in
+      mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
+      rewriter.setInsertionPointToStart(&parentRegion->front());
+    } else {
+      mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp);
+      rewriter.setInsertionPointToStart(insertBlock);
+    }
     auto size = genI32Constant(loc, rewriter, 1);
     unsigned allocaAs = getAllocaAddressSpace(rewriter);
     unsigned programAs = getProgramAddressSpace(rewriter);
diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
index 0170b56367cf3..9d48a2f08aba0 100644
--- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
@@ -298,8 +298,7 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
                                        .isa<fir::SequenceType>());
     });
     mlir::RewritePatternSet patterns(&context);
-    patterns.insert<EmboxConversion, ArrayCoorConversion, ReboxConversion,
-                    DeclareOpConversion>(&context);
+    fir::populatePreCGRewritePatterns(patterns);
     if (mlir::failed(
             mlir::applyPartialConversion(op, target, std::move(patterns)))) {
       mlir::emitError(mlir::UnknownLoc::get(&context),
@@ -327,3 +326,8 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
 std::unique_ptr<mlir::Pass> fir::createFirCodeGenRewritePass() {
   return std::make_unique<CodeGenRewrite>();
 }
+
+void fir::populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns) {
+  patterns.insert<EmboxConversion, ArrayCoorConversion, ReboxConversion,
+                  DeclareOpConversion>(patterns.getContext());
+}
diff --git a/flang/lib/Parser/executable-parsers.cpp b/flang/lib/Parser/executable-parsers.cpp
index de2be017508c3..07a570bd61e99 100644
--- a/flang/lib/Parser/executable-parsers.cpp
+++ b/flang/lib/Parser/executable-parsers.cpp
@@ -542,19 +542,19 @@ TYPE_CONTEXT_PARSER("UNLOCK statement"_en_US,
 // CUF-kernel-do-directive ->
 //     !$CUF KERNEL DO [ (scalar-int-constant-expr) ] <<< grid, block [, stream]
 //     >>> do-construct
-// grid -> * | scalar-int-expr | ( scalar-int-expr-list )
-// block -> * | scalar-int-expr | ( scalar-int-expr-list )
+// star-or-expr -> * | scalar-int-expr
+// grid -> * | scalar-int-expr | ( star-or-expr-list )
+// block -> * | scalar-int-expr | ( star-or-expr-list )
 // stream -> ( 0, | STREAM = ) scalar-int-expr
+constexpr auto starOrExpr{construct<CUFKernelDoConstruct::StarOrExpr>(
+    "*" >> pure<std::optional<ScalarIntExpr>>() ||
+    applyFunction(presentOptional<ScalarIntExpr>, scalarIntExpr))};
+constexpr auto gridOrBlock{parenthesized(nonemptyList(starOrExpr)) ||
+    applyFunction(singletonList<CUFKernelDoConstruct::StarOrExpr>, starOrExpr)};
 TYPE_PARSER(sourced(beginDirective >> "$CUF KERNEL DO"_tok >>
     construct<CUFKernelDoConstruct::Directive>(
-        maybe(parenthesized(scalarIntConstantExpr)),
-        "<<<" >>
-            ("*" >> pure<std::list<ScalarIntExpr>>() ||
-                parenthesized(nonemptyList(scalarIntExpr)) ||
-                applyFunction(singletonList<ScalarIntExpr>, scalarIntExpr)),
-        "," >> ("*" >> pure<std::list<ScalarIntExpr>>() ||
-                   parenthesized(nonemptyList(scalarIntExpr)) ||
-                   applyFunction(singletonList<ScalarIntExpr>, scalarIntExpr)),
+        maybe(parenthesized(scalarIntConstantExpr)), "<<<" >> gridOrBlock,
+        "," >> gridOrBlock,
         maybe((", 0 ,"_tok || ", STREAM ="_tok) >> scalarIntExpr) / ">>>" /
             endDirective)))
 TYPE_CONTEXT_PARSER("!$CUF KERNEL DO construct"_en_US,
diff --git a/flang/lib/Parser/misc-parsers.h b/flang/lib/Parser/misc-parsers.h
index e9b52b7d0fcd0..4a318e05bb4b8 100644
--- a/flang/lib/Parser/misc-parsers.h
+++ b/flang/lib/Parser/misc-parsers.h
@@ -57,5 +57,10 @@ template <typename A> common::IfNoLvalue<std::list<A>, A> singletonList(A &&x) {
   result.emplace_back(std::move(x));
   return result;
 }
+
+template <typename A>
+common::IfNoLvalue<std::optional<A>, A> presentOptional(A &&x) {
+  return std::make_optional(std::move(x));
+}
 } // namespace Fortran::parser
 #endif
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 600aa01999dab..baba4863f5775 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2729,6 +2729,13 @@ class UnparseVisitor {
   WALK_NESTED_ENUM(OmpOrderModifier, Kind) // OMP order-modifier
 #undef WALK_NESTED_ENUM
 
+  void Unparse(const CUFKernelDoConstruct::StarOrExpr &x) {
+    if (x.v) {
+      Walk(*x.v);
+    } else {
+      Word("*");
+    }
+  }
   void Unparse(const CUFKernelDoConstruct::Directive &x) {
     Word("!$CUF KERNEL DO");
     Walk(" (", std::get<std::optional<ScalarIntConstantExpr>>(x.t), ")");
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index 3adbd7cc41774..d625f8c2f7fc1 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -332,7 +332,15 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
   bool typesCompatible{typesCompatibleWithIgnoreTKR ||
       dummy.type.type().IsTkCompatibleWith(actualType.type())};
   int dummyRank{dummy.type.Rank()};
-  if (!typesCompatible && dummyRank == 0 && allowActualArgumentConversions) {
+  if (typesCompatible) {
+    if (const auto *constantChar{
+            evaluate::UnwrapConstantValue<evaluate::Ascii>(actual)};
+        constantChar && constantChar->wasHollerith() &&
+        dummy.type.type().IsUnlimitedPolymorphic()) {
+      messages.Say(
+          "passing Hollerith to unlimited polymorphic as if it were CHARACTER"_port_en_US);
+    }
+  } else if (dummyRank == 0 && allowActualArgumentConversions) {
     // Extension: pass Hollerith literal to scalar as if it had been BOZ
     if (auto converted{evaluate::HollerithToBOZ(
             foldingContext, actual, dummy.type.type())}) {
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index 431cef2079350..581371ff7a003 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -632,6 +632,11 @@ void CheckHelper::CheckObjectEntity(
     const Symbol &symbol, const ObjectEntityDetails &details) {
   CheckSymbolType(symbol);
   CheckArraySpec(symbol, details.shape());
+  CheckConflicting(symbol, Attr::ALLOCATABLE, Attr::PARAMETER);
+  CheckConflicting(symbol, Attr::ASYNCHRONOUS, Attr::PARAMETER);
+  CheckConflicting(symbol, Attr::SAVE, Attr::PARAMETER);
+  CheckConflicting(symbol, Attr::TARGET, Attr::PARAMETER);
+  CheckConflicting(symbol, Attr::VOLATILE, Attr::PARAMETER);
   Check(details.shape());
   Check(details.coshape());
   if (details.shape().Rank() > common::maxRank) {
diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp
index 4e8578d0e1daf..51f536f3d7723 100644
--- a/flang/lib/Semantics/check-do-forall.cpp
+++ b/flang/lib/Semantics/check-do-forall.cpp
@@ -220,8 +220,11 @@ class DoConcurrentBodyEnforce {
       if (MightDeallocatePolymorphic(*entity, DeallocateNonCoarray)) {
         SayDeallocateOfPolymorph(variable.GetSource(), *entity, reason);
       }
-      if (const Symbol * impure{HasImpureFinal(*entity)}) {
-        SayDeallocateWithImpureFinal(*entity, reason, *impure);
+      if (const auto *assignment{GetAssignment(stmt)}) {
+        const auto &lhs{assignment->lhs};
+        if (const Symbol * impure{HasImpureFinal(*entity, lhs.Rank())}) {
+          SayDeallocateWithImpureFinal(*entity, reason, *impure);
+        }
       }
     }
     if (const auto *assignment{GetAssignment(stmt)}) {
@@ -435,6 +438,18 @@ class DoContext {
       CheckForallIndexesUsed(*assignment);
       CheckForImpureCall(assignment->lhs);
       CheckForImpureCall(assignment->rhs);
+
+      if (IsVariable(assignment->lhs)) {
+        if (const Symbol * symbol{GetLastSymbol(assignment->lhs)}) {
+          if (auto impureFinal{
+                  HasImpureFinal(*symbol, assignment->lhs.Rank())}) {
+            context_.SayWithDecl(*symbol, parser::FindSourceLocation(stmt),
+                "Impure procedure '%s' is referenced by finalization in a %s"_err_en_US,
+                impureFinal->name(), LoopKindName());
+          }
+        }
+      }
+
       if (const auto *proc{
               std::get_if<evaluate::ProcedureRef>(&assignment->u)}) {
         CheckForImpureCall(*proc);
diff --git a/flang/lib/Semantics/data-to-inits.cpp b/flang/lib/Semantics/data-to-inits.cpp
index fa22d49867905..2ebc4e561a339 100644
--- a/flang/lib/Semantics/data-to-inits.cpp
+++ b/flang/lib/Semantics/data-to-inits.cpp
@@ -118,9 +118,10 @@ class DataInitializationCompiler {
   bool Scan(const parser::DataIDoObject &);
 
   // Initializes all elements of a designator, which can be an array or section.
-  bool InitDesignator(const SomeExpr &);
+  bool InitDesignator(const SomeExpr &, const Scope &);
   // Initializes a single scalar object.
-  bool InitElement(const evaluate::OffsetSymbol &, const SomeExpr &designator);
+  bool InitElement(const evaluate::OffsetSymbol &, const SomeExpr &designator,
+      const Scope &);
   // If the returned flag is true, emit a warning about CHARACTER misusage.
   std::optional<std::pair<SomeExpr, bool>> ConvertElement(
       const SomeExpr &, const evaluate::DynamicType &);
@@ -128,7 +129,6 @@ class DataInitializationCompiler {
   DataInitializations &inits_;
   evaluate::ExpressionAnalyzer &exprAnalyzer_;
   ValueListIterator<DSV> values_;
-  const Scope *scope_{nullptr};
 };
 
 template <typename DSV>
@@ -149,8 +149,7 @@ bool DataInitializationCompiler<DSV>::Scan(const parser::Variable &var) {
   if (const auto *expr{GetExpr(exprAnalyzer_.context(), var)}) {
     parser::CharBlock at{var.GetSource()};
     exprAnalyzer_.GetFoldingContext().messages().SetLocation(at);
-    scope_ = &exprAnalyzer_.context().FindScope(at);
-    if (InitDesignator(*expr)) {
+    if (InitDesignator(*expr, exprAnalyzer_.context().FindScope(at))) {
       return true;
     }
   }
@@ -170,8 +169,7 @@ bool DataInitializationCompiler<DSV>::Scan(
   if (expr) {
     parser::CharBlock at{parser::FindSourceLocation(designator)};
     exprAnalyzer_.GetFoldingContext().messages().SetLocation(at);
-    scope_ = &exprAnalyzer_.context().FindScope(at);
-    if (InitDesignator(*expr)) {
+    if (InitDesignator(*expr, exprAnalyzer_.context().FindScope(at))) {
       return true;
     }
   }
@@ -254,12 +252,12 @@ template <typename DSV>
 bool DataInitializationCompiler<DSV>::Scan(const Symbol &symbol) {
   auto designator{exprAnalyzer_.Designate(evaluate::DataRef{symbol})};
   CHECK(designator.has_value());
-  return InitDesignator(*designator);
+  return InitDesignator(*designator, symbol.owner());
 }
 
 template <typename DSV>
 bool DataInitializationCompiler<DSV>::InitDesignator(
-    const SomeExpr &designator) {
+    const SomeExpr &designator, const Scope &scope) {
   evaluate::FoldingContext &context{exprAnalyzer_.GetFoldingContext()};
   evaluate::DesignatorFolder folder{context};
   while (auto offsetSymbol{folder.FoldDesignator(designator)}) {
@@ -274,7 +272,7 @@ bool DataInitializationCompiler<DSV>::InitDesignator(
             designator.AsFortran());
       }
       return false;
-    } else if (!InitElement(*offsetSymbol, designator)) {
+    } else if (!InitElement(*offsetSymbol, designator, scope)) {
       return false;
     } else {
       ++values_;
@@ -314,7 +312,8 @@ DataInitializationCompiler<DSV>::ConvertElement(
 
 template <typename DSV>
 bool DataInitializationCompiler<DSV>::InitElement(
-    const evaluate::OffsetSymbol &offsetSymbol, const SomeExpr &designator) {
+    const evaluate::OffsetSymbol &offsetSymbol, const SomeExpr &designator,
+    const Scope &scope) {
   const Symbol &symbol{offsetSymbol.symbol()};
   const Symbol *lastSymbol{GetLastSymbol(designator)};
   bool isPointer{lastSymbol && IsPointer(*lastSymbol)};
@@ -390,7 +389,7 @@ bool DataInitializationCompiler<DSV>::InitElement(
     } else if (isProcPointer) {
       if (evaluate::IsProcedure(*expr)) {
         if (CheckPointerAssignment(exprAnalyzer_.context(), designator, *expr,
-                DEREF(scope_),
+                scope,
                 /*isBoundsRemapping=*/false, /*isAssumedRank=*/false)) {
           if (lastSymbol->has<ProcEntityDetails>()) {
             GetImage().AddPointer(offsetSymbol.offset(), *expr);
@@ -413,7 +412,7 @@ bool DataInitializationCompiler<DSV>::InitElement(
           "Procedure '%s' may not be used to initialize '%s', which is not a procedure pointer"_err_en_US,
           expr->AsFortran(), DescribeElement());
     } else if (CheckInitialDataPointerTarget(
-                   exprAnalyzer_.context(), designator, *expr, DEREF(scope_))) {
+                   exprAnalyzer_.context(), designator, *expr, scope)) {
       GetImage().AddPointer(offsetSymbol.offset(), *expr);
       return true;
     }
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 54bfe0f2e1563..6af86de9dd81c 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -875,8 +875,11 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::CharLiteralConstant &x) {
 MaybeExpr ExpressionAnalyzer::Analyze(
     const parser::HollerithLiteralConstant &x) {
   int kind{GetDefaultKind(TypeCategory::Character)};
-  auto value{x.v};
-  return AnalyzeString(std::move(value), kind);
+  auto result{AnalyzeString(std::string{x.v}, kind)};
+  if (auto *constant{UnwrapConstantValue<Ascii>(result)}) {
+    constant->set_wasHollerith(true);
+  }
+  return result;
 }
 
 // .TRUE. and .FALSE. of various kinds
@@ -1299,7 +1302,8 @@ static NamedEntity IgnoreAnySubscripts(Designator<SomeDerived> &&designator) {
       std::move(designator.u));
 }
 
-// Components of parent derived types are explicitly represented as such.
+// Components, but not bindings, of parent derived types are explicitly
+// represented as such.
 std::optional<Component> ExpressionAnalyzer::CreateComponent(DataRef &&base,
     const Symbol &component, const semantics::Scope &scope,
     bool C919bAlreadyEnforced) {
@@ -1307,7 +1311,8 @@ std::optional<Component> ExpressionAnalyzer::CreateComponent(DataRef &&base,
       base.Rank() > 0) { // C919b
     Say("An allocatable or pointer component reference must be applied to a scalar base"_err_en_US);
   }
-  if (&component.owner() == &scope) {
+  if (&component.owner() == &scope ||
+      component.has<semantics::ProcBindingDetails>()) {
     return Component{std::move(base), component};
   }
   if (const Symbol *typeSymbol{scope.GetSymbol()}) {
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index b4df7216a33e2..5d0d210fa3487 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -1242,7 +1242,7 @@ static void GetModuleDependences(
   std::size_t limit{content.size()};
   std::string_view str{content.data(), limit};
   for (std::size_t j{ModHeader::len};
-       str.substr(j, ModHeader::needLen) == ModHeader::need;) {
+       str.substr(j, ModHeader::needLen) == ModHeader::need; ++j) {
     j += 7;
     auto checkSum{ExtractCheckSum(str.substr(j, ModHeader::sumLen))};
     if (!checkSum) {
@@ -1260,8 +1260,8 @@ static void GetModuleDependences(
     for (; j < limit && str.at(j) != '\n'; ++j) {
     }
     if (j > start && j < limit && str.at(j) == '\n') {
-      dependences.AddDependence(
-          std::string{str.substr(start, j - start)}, intrinsic, *checkSum);
+      std::string depModName{str.substr(start, j - start)};
+      dependences.AddDependence(std::move(depModName), intrinsic, *checkSum);
     } else {
       break;
     }
@@ -1271,7 +1271,7 @@ static void GetModuleDependences(
 Scope *ModFileReader::Read(SourceName name, std::optional<bool> isIntrinsic,
     Scope *ancestor, bool silent) {
   std::string ancestorName; // empty for module
-  Symbol *notAModule{nullptr};
+  const Symbol *notAModule{nullptr};
   bool fatalError{false};
   if (ancestor) {
     if (auto *scope{ancestor->FindSubmodule(name)}) {
@@ -1287,26 +1287,28 @@ Scope *ModFileReader::Read(SourceName name, std::optional<bool> isIntrinsic,
     if (it != context_.globalScope().end()) {
       Scope *scope{it->second->scope()};
       if (scope->kind() == Scope::Kind::Module) {
-        if (requiredHash) {
-          if (const Symbol * foundModule{scope->symbol()}) {
-            if (const auto *module{foundModule->detailsIf<ModuleDetails>()};
-                module && module->moduleFileHash() &&
-                *requiredHash != *module->moduleFileHash()) {
-              Say(name, ancestorName,
-                  "Multiple versions of the module '%s' cannot be required by the same compilation"_err_en_US,
-                  name.ToString());
-              return nullptr;
+        for (const Symbol *found{scope->symbol()}; found;) {
+          if (const auto *module{found->detailsIf<ModuleDetails>()}) {
+            if (!requiredHash ||
+                *requiredHash ==
+                    module->moduleFileHash().value_or(*requiredHash)) {
+              return const_cast<Scope *>(found->scope());
             }
+            found = module->previous(); // same name, distinct hash
+          } else {
+            notAModule = found;
+            break;
           }
         }
-        return scope;
       } else {
         notAModule = scope->symbol();
-        // USE, NON_INTRINSIC global name isn't a module?
-        fatalError = isIntrinsic.has_value();
       }
     }
   }
+  if (notAModule) {
+    // USE, NON_INTRINSIC global name isn't a module?
+    fatalError = isIntrinsic.has_value();
+  }
   auto path{ModFileName(name, ancestorName, context_.moduleFileSuffix())};
   parser::Parsing parsing{context_.allCookedSources()};
   parser::Options options;
@@ -1360,42 +1362,18 @@ Scope *ModFileReader::Read(SourceName name, std::optional<bool> isIntrinsic,
 
   // Look for the right module file if its hash is known
   if (requiredHash && !fatalError) {
-    std::vector<std::string> misses;
     for (const std::string &maybe :
         parser::LocateSourceFileAll(path, options.searchDirectories)) {
       if (const auto *srcFile{context_.allCookedSources().allSources().OpenPath(
               maybe, llvm::errs())}) {
-        if (auto checkSum{VerifyHeader(srcFile->content())}) {
-          if (*checkSum == *requiredHash) {
-            path = maybe;
-            if (!misses.empty()) {
-              auto &msg{context_.Say(name,
-                  "Module file for '%s' appears later in the module search path than conflicting modules with different checksums"_warn_en_US,
-                  name.ToString())};
-              for (const std::string &m : misses) {
-                msg.Attach(
-                    name, "Module file with a conflicting name: '%s'"_en_US, m);
-              }
-            }
-            misses.clear();
-            break;
-          } else {
-            misses.emplace_back(maybe);
-          }
+        if (auto checkSum{VerifyHeader(srcFile->content())};
+            checkSum && *checkSum == *requiredHash) {
+          path = maybe;
+          break;
         }
       }
     }
-    if (!misses.empty()) {
-      auto &msg{Say(name, ancestorName,
-          "Could not find a module file for '%s' in the module search path with the expected checksum"_err_en_US,
-          name.ToString())};
-      for (const std::string &m : misses) {
-        msg.Attach(name, "Module file with different checksum: '%s'"_en_US, m);
-      }
-      return nullptr;
-    }
   }
-
   const auto *sourceFile{fatalError ? nullptr : parsing.Prescan(path, options)};
   if (fatalError || parsing.messages().AnyFatalError()) {
     if (!silent) {
@@ -1451,11 +1429,24 @@ Scope *ModFileReader::Read(SourceName name, std::optional<bool> isIntrinsic,
   Scope &topScope{isIntrinsic.value_or(false) ? context_.intrinsicModulesScope()
                                               : context_.globalScope()};
   Symbol *moduleSymbol{nullptr};
+  const Symbol *previousModuleSymbol{nullptr};
   if (!ancestor) { // module, not submodule
     parentScope = &topScope;
     auto pair{parentScope->try_emplace(name, UnknownDetails{})};
     if (!pair.second) {
-      return nullptr;
+      // There is already a global symbol or intrinsic module of the same name.
+      previousModuleSymbol = &*pair.first->second;
+      if (const auto *details{
+              previousModuleSymbol->detailsIf<ModuleDetails>()}) {
+        if (!details->moduleFileHash().has_value()) {
+          return nullptr;
+        }
+      } else {
+        return nullptr;
+      }
+      CHECK(parentScope->erase(name) != 0);
+      pair = parentScope->try_emplace(name, UnknownDetails{});
+      CHECK(pair.second);
     }
     moduleSymbol = &*pair.first->second;
     moduleSymbol->set(Symbol::Flag::ModFile);
@@ -1486,7 +1477,9 @@ Scope *ModFileReader::Read(SourceName name, std::optional<bool> isIntrinsic,
   }
   if (moduleSymbol) {
     CHECK(moduleSymbol->test(Symbol::Flag::ModFile));
-    moduleSymbol->get<ModuleDetails>().set_moduleFileHash(checkSum.value());
+    auto &details{moduleSymbol->get<ModuleDetails>()};
+    details.set_moduleFileHash(checkSum.value());
+    details.set_previous(previousModuleSymbol);
     if (isIntrinsic.value_or(false)) {
       moduleSymbol->attrs().set(Attr::INTRINSIC);
     }
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 215a3c9060a24..6d58013b87d29 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -487,6 +487,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
       const auto namePair{
           currScope().try_emplace(name->source, Attrs{}, ProcEntityDetails{})};
       auto &newSymbol{*namePair.first->second};
+      if (context_.intrinsics().IsIntrinsic(name->ToString())) {
+        newSymbol.attrs().set(Attr::INTRINSIC);
+      }
       name->symbol = &newSymbol;
     };
     if (const auto *procD{parser::Unwrap<parser::ProcedureDesignator>(opr.u)}) {
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 67392a02cf186..f89323f3e54a6 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -3391,12 +3391,25 @@ void ModuleVisitor::ApplyDefaultAccess() {
   const auto *moduleDetails{
       DEREF(currScope().symbol()).detailsIf<ModuleDetails>()};
   CHECK(moduleDetails);
+  Attr defaultAttr{
+      DEREF(moduleDetails).isDefaultPrivate() ? Attr::PRIVATE : Attr::PUBLIC};
   for (auto &pair : currScope()) {
     Symbol &symbol{*pair.second};
     if (!symbol.attrs().HasAny({Attr::PUBLIC, Attr::PRIVATE})) {
-      SetImplicitAttr(symbol,
-          DEREF(moduleDetails).isDefaultPrivate() ? Attr::PRIVATE
-                                                  : Attr::PUBLIC);
+      Attr attr{defaultAttr};
+      if (auto *generic{symbol.detailsIf<GenericDetails>()}) {
+        if (generic->derivedType()) {
+          // If a generic interface has a derived type of the same
+          // name that has an explicit accessibility attribute, then
+          // the generic must have the same accessibility.
+          if (generic->derivedType()->attrs().test(Attr::PUBLIC)) {
+            attr = Attr::PUBLIC;
+          } else if (generic->derivedType()->attrs().test(Attr::PRIVATE)) {
+            attr = Attr::PRIVATE;
+          }
+        }
+      }
+      SetImplicitAttr(symbol, attr);
     }
   }
 }
@@ -4419,7 +4432,7 @@ Symbol &SubprogramVisitor::PushSubprogramScope(const parser::Name &name,
         CHECK(context().HasError(genericSymbol));
       }
     }
-    set_inheritFromParent(false);
+    set_inheritFromParent(hasModulePrefix);
   }
   if (Symbol * found{FindSymbol(name)};
       found && found->has<HostAssocDetails>()) {
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 9845a190bc756..15ea34c66dba5 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -1239,6 +1239,16 @@ void RuntimeTableBuilder::IncorporateDefinedIoGenericInterfaces(
 RuntimeDerivedTypeTables BuildRuntimeDerivedTypeTables(
     SemanticsContext &context) {
   RuntimeDerivedTypeTables result;
+  // Do not attempt to read __fortran_type_info.mod when compiling
+  // the module on which it depends.
+  const auto &allSources{context.allCookedSources().allSources()};
+  if (auto firstProv{allSources.GetFirstFileProvenance()}) {
+    if (const auto *srcFile{allSources.GetSourceFile(firstProv->start())}) {
+      if (srcFile->path().find("__fortran_builtins.f90") != std::string::npos) {
+        return result;
+      }
+    }
+  }
   result.schemata = context.GetBuiltinModule(typeInfoBuiltinModule);
   if (result.schemata) {
     RuntimeTableBuilder builder{context, result};
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index bf999b090419c..0484baae93cd5 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -827,15 +827,18 @@ static const Symbol *HasImpureFinal(
   return IsFinalizable(derived, nullptr, /*withImpureFinalizer=*/true, rank);
 }
 
-const Symbol *HasImpureFinal(const Symbol &original) {
+const Symbol *HasImpureFinal(const Symbol &original, std::optional<int> rank) {
   const Symbol &symbol{ResolveAssociations(original)};
   if (symbol.has<ObjectEntityDetails>()) {
     if (const DeclTypeSpec * symType{symbol.GetType()}) {
       if (const DerivedTypeSpec * derived{symType->AsDerived()}) {
-        // finalizable assumed-rank not allowed (C839)
-        return evaluate::IsAssumedRank(symbol)
-            ? nullptr
-            : HasImpureFinal(*derived, symbol.Rank());
+        if (evaluate::IsAssumedRank(symbol)) {
+          // finalizable assumed-rank not allowed (C839)
+          return nullptr;
+        } else {
+          int actualRank{rank.value_or(symbol.Rank())};
+          return HasImpureFinal(*derived, actualRank);
+        }
       }
     }
   }
diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90
index 62b20103221ad..3d3dbef6d018a 100644
--- a/flang/module/__fortran_builtins.f90
+++ b/flang/module/__fortran_builtins.f90
@@ -110,7 +110,7 @@
   public :: operator(==)
 
   interface operator(/=)
-    module procedure __builtin_c_ptr_eq
+    module procedure __builtin_c_ptr_ne
   end interface
   public :: operator(/=)
 
diff --git a/flang/runtime/connection.h b/flang/runtime/connection.h
index c9a7566f20988..c41970d47e7b0 100644
--- a/flang/runtime/connection.h
+++ b/flang/runtime/connection.h
@@ -12,8 +12,8 @@
 #define FORTRAN_RUNTIME_IO_CONNECTION_H_
 
 #include "format.h"
+#include "flang/Common/optional.h"
 #include <cinttypes>
-#include <optional>
 
 namespace Fortran::runtime::io {
 
@@ -26,10 +26,10 @@ enum class Access { Sequential, Direct, Stream };
 // established in an OPEN statement.
 struct ConnectionAttributes {
   Access access{Access::Sequential}; // ACCESS='SEQUENTIAL', 'DIRECT', 'STREAM'
-  std::optional<bool> isUnformatted; // FORM='UNFORMATTED' if true
+  Fortran::common::optional<bool> isUnformatted; // FORM='UNFORMATTED' if true
   bool isUTF8{false}; // ENCODING='UTF-8'
   unsigned char internalIoCharKind{0}; // 0->external, 1/2/4->internal
-  std::optional<std::int64_t> openRecl; // RECL= on OPEN
+  Fortran::common::optional<std::int64_t> openRecl; // RECL= on OPEN
 
   bool IsRecordFile() const {
     // Formatted stream files are viewed as having records, at least on input
@@ -63,14 +63,14 @@ struct ConnectionState : public ConnectionAttributes {
     unterminatedRecord = false;
   }
 
-  std::optional<std::int64_t> EffectiveRecordLength() const {
+  Fortran::common::optional<std::int64_t> EffectiveRecordLength() const {
     // When an input record is longer than an explicit RECL= from OPEN
     // it is effectively truncated on input.
     return openRecl && recordLength && *openRecl < *recordLength ? openRecl
                                                                  : recordLength;
   }
 
-  std::optional<std::int64_t> recordLength;
+  Fortran::common::optional<std::int64_t> recordLength;
 
   std::int64_t currentRecordNumber{1}; // 1 is first
 
@@ -86,11 +86,12 @@ struct ConnectionState : public ConnectionAttributes {
   std::int64_t furthestPositionInRecord{0}; // max(position+bytes)
 
   // Set at end of non-advancing I/O data transfer
-  std::optional<std::int64_t> leftTabLimit; // offset in current record
+  Fortran::common::optional<std::int64_t>
+      leftTabLimit; // offset in current record
 
   // currentRecordNumber value captured after ENDFILE/REWIND/BACKSPACE statement
   // or an end-of-file READ condition on a sequential access file
-  std::optional<std::int64_t> endfileRecordNumber;
+  Fortran::common::optional<std::int64_t> endfileRecordNumber;
 
   // Mutable modes set at OPEN() that can be overridden in READ/WRITE & FORMAT
   MutableModes modes; // BLANK=, DECIMAL=, SIGN=, ROUND=, PAD=, DELIM=, kP
diff --git a/flang/runtime/derived-api.cpp b/flang/runtime/derived-api.cpp
index 321f50a1edfcf..eca784be208d1 100644
--- a/flang/runtime/derived-api.cpp
+++ b/flang/runtime/derived-api.cpp
@@ -95,7 +95,7 @@ inline RT_API_ATTRS bool CompareDerivedType(
   return a == b || CompareDerivedTypeNames(a->name(), b->name());
 }
 
-static const RT_API_ATTRS typeInfo::DerivedType *GetDerivedType(
+static RT_API_ATTRS const typeInfo::DerivedType *GetDerivedType(
     const Descriptor &desc) {
   if (const DescriptorAddendum * addendum{desc.Addendum()}) {
     if (const auto *derived{addendum->derivedType()}) {
diff --git a/flang/runtime/descriptor-io.cpp b/flang/runtime/descriptor-io.cpp
index 6041104773cc4..7c7323b719adf 100644
--- a/flang/runtime/descriptor-io.cpp
+++ b/flang/runtime/descriptor-io.cpp
@@ -12,11 +12,12 @@
 namespace Fortran::runtime::io::descr {
 
 // Defined formatted I/O (maybe)
-std::optional<bool> DefinedFormattedIo(IoStatementState &io,
+Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
     const Descriptor &descriptor, const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special,
     const SubscriptValue subscripts[]) {
-  std::optional<DataEdit> peek{io.GetNextDataEdit(0 /*to peek at it*/)};
+  Fortran::common::optional<DataEdit> peek{
+      io.GetNextDataEdit(0 /*to peek at it*/)};
   if (peek &&
       (peek->descriptor == DataEdit::DefinedDerivedType ||
           peek->descriptor == DataEdit::ListDirected)) {
@@ -55,7 +56,7 @@ std::optional<bool> DefinedFormattedIo(IoStatementState &io,
     int unit{external->unitNumber()};
     int ioStat{IostatOk};
     char ioMsg[100];
-    std::optional<std::int64_t> startPos;
+    Fortran::common::optional<std::int64_t> startPos;
     if (edit.descriptor == DataEdit::DefinedDerivedType &&
         special.which() == typeInfo::SpecialBinding::Which::ReadFormatted) {
       // DT is an edit descriptor so everything that the child
@@ -96,7 +97,7 @@ std::optional<bool> DefinedFormattedIo(IoStatementState &io,
     // There's a defined I/O subroutine, but there's a FORMAT present and
     // it does not have a DT data edit descriptor, so apply default formatting
     // to the components of the derived type as usual.
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 }
 
diff --git a/flang/runtime/descriptor-io.h b/flang/runtime/descriptor-io.h
index 394578796faa7..b6b0fefcff870 100644
--- a/flang/runtime/descriptor-io.h
+++ b/flang/runtime/descriptor-io.h
@@ -21,6 +21,7 @@
 #include "terminator.h"
 #include "type-info.h"
 #include "unit.h"
+#include "flang/Common/optional.h"
 #include "flang/Common/uint128.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
@@ -321,9 +322,9 @@ static bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
   return true;
 }
 
-std::optional<bool> DefinedFormattedIo(IoStatementState &, const Descriptor &,
-    const typeInfo::DerivedType &, const typeInfo::SpecialBinding &,
-    const SubscriptValue[]);
+Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &,
+    const Descriptor &, const typeInfo::DerivedType &,
+    const typeInfo::SpecialBinding &, const SubscriptValue[]);
 
 template <Direction DIR>
 static bool FormattedDerivedTypeIO(IoStatementState &io,
@@ -334,7 +335,7 @@ static bool FormattedDerivedTypeIO(IoStatementState &io,
   RUNTIME_CHECK(handler, addendum != nullptr);
   const typeInfo::DerivedType *type{addendum->derivedType()};
   RUNTIME_CHECK(handler, type != nullptr);
-  std::optional<typeInfo::SpecialBinding> nonTbpSpecial;
+  Fortran::common::optional<typeInfo::SpecialBinding> nonTbpSpecial;
   const typeInfo::SpecialBinding *special{nullptr};
   if (table) {
     if (const auto *definedIo{table->Find(*type,
@@ -365,7 +366,7 @@ static bool FormattedDerivedTypeIO(IoStatementState &io,
   std::size_t numElements{descriptor.Elements()};
   for (std::size_t j{0}; j < numElements;
        ++j, descriptor.IncrementSubscripts(subscripts)) {
-    std::optional<bool> result;
+    Fortran::common::optional<bool> result;
     if (special) {
       result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts);
     }
@@ -406,7 +407,7 @@ static bool UnformattedDescriptorIO(IoStatementState &io,
                   : typeInfo::SpecialBinding::Which::WriteUnformatted,
               definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
               false};
-          if (std::optional<bool> wasDefined{
+          if (Fortran::common::optional<bool> wasDefined{
                   DefinedUnformattedIo(io, descriptor, *type, special)}) {
             return *wasDefined;
           }
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp
index 85cce2f1a1662..fbeb1a595b327 100644
--- a/flang/runtime/edit-input.cpp
+++ b/flang/runtime/edit-input.cpp
@@ -9,6 +9,7 @@
 #include "edit-input.h"
 #include "namelist.h"
 #include "utf.h"
+#include "flang/Common/optional.h"
 #include "flang/Common/real.h"
 #include "flang/Common/uint128.h"
 #include <algorithm>
@@ -54,9 +55,9 @@ template <int LOG2_BASE>
 static bool EditBOZInput(
     IoStatementState &io, const DataEdit &edit, void *n, std::size_t bytes) {
   // Skip leading white space & zeroes
-  std::optional<int> remaining{io.CueUpInput(edit)};
+  Fortran::common::optional<int> remaining{io.CueUpInput(edit)};
   auto start{io.GetConnectionState().positionInRecord};
-  std::optional<char32_t> next{io.NextInField(remaining, edit)};
+  Fortran::common::optional<char32_t> next{io.NextInField(remaining, edit)};
   if (next.value_or('?') == '0') {
     do {
       start = io.GetConnectionState().positionInRecord;
@@ -156,7 +157,8 @@ static inline char32_t GetRadixPointChar(const DataEdit &edit) {
 
 // Prepares input from a field, and returns the sign, if any, else '\0'.
 static char ScanNumericPrefix(IoStatementState &io, const DataEdit &edit,
-    std::optional<char32_t> &next, std::optional<int> &remaining) {
+    Fortran::common::optional<char32_t> &next,
+    Fortran::common::optional<int> &remaining) {
   remaining = io.CueUpInput(edit);
   next = io.NextInField(remaining, edit);
   char sign{'\0'};
@@ -198,8 +200,8 @@ bool EditIntegerInput(
         edit.descriptor);
     return false;
   }
-  std::optional<int> remaining;
-  std::optional<char32_t> next;
+  Fortran::common::optional<int> remaining;
+  Fortran::common::optional<char32_t> next;
   char sign{ScanNumericPrefix(io, edit, next, remaining)};
   common::UnsignedInt128 value{0};
   bool any{!!sign};
@@ -279,10 +281,10 @@ struct ScannedRealInput {
 };
 static ScannedRealInput ScanRealInput(
     char *buffer, int bufferSize, IoStatementState &io, const DataEdit &edit) {
-  std::optional<int> remaining;
-  std::optional<char32_t> next;
+  Fortran::common::optional<int> remaining;
+  Fortran::common::optional<char32_t> next;
   int got{0};
-  std::optional<int> radixPointOffset;
+  Fortran::common::optional<int> radixPointOffset;
   auto Put{[&](char ch) -> void {
     if (got < bufferSize) {
       buffer[got] = ch;
@@ -612,11 +614,23 @@ decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal(
     } else {
       break;
     }
-    while (fraction >> binaryPrecision) {
-      guardBit |= roundingBit;
-      roundingBit = (int)fraction & 1;
-      fraction >>= 1;
-      ++expo;
+    if (fraction >> binaryPrecision) {
+      while (fraction >> binaryPrecision) {
+        guardBit |= roundingBit;
+        roundingBit = (int)fraction & 1;
+        fraction >>= 1;
+        ++expo;
+      }
+      // Consume excess digits
+      while (*++p) {
+        if (*p == '0') {
+        } else if ((*p >= '1' && *p <= '9') || (*p >= 'A' && *p <= 'F')) {
+          guardBit = 1;
+        } else {
+          break;
+        }
+      }
+      break;
     }
   }
   if (fraction) {
@@ -631,31 +645,32 @@ decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal(
     while (expo > 1 && !(fraction >> (binaryPrecision - 1))) {
       fraction <<= 1;
       --expo;
+      guardBit = roundingBit = 0;
     }
-    // Rounding
-    bool increase{false};
-    switch (rounding) {
-    case decimal::RoundNearest: // RN & RP
-      increase = roundingBit && (guardBit | ((int)fraction & 1));
-      break;
-    case decimal::RoundUp: // RU
-      increase = !isNegative && (roundingBit | guardBit);
-      break;
-    case decimal::RoundDown: // RD
-      increase = isNegative && (roundingBit | guardBit);
-      break;
-    case decimal::RoundToZero: // RZ
-      break;
-    case decimal::RoundCompatible: // RC
-      increase = roundingBit != 0;
-      break;
-    }
-    if (increase) {
-      ++fraction;
-      if (fraction >> binaryPrecision) {
-        fraction >>= 1;
-        ++expo;
-      }
+  }
+  // Rounding
+  bool increase{false};
+  switch (rounding) {
+  case decimal::RoundNearest: // RN & RP
+    increase = roundingBit && (guardBit | ((int)fraction & 1));
+    break;
+  case decimal::RoundUp: // RU
+    increase = !isNegative && (roundingBit | guardBit);
+    break;
+  case decimal::RoundDown: // RD
+    increase = isNegative && (roundingBit | guardBit);
+    break;
+  case decimal::RoundToZero: // RZ
+    break;
+  case decimal::RoundCompatible: // RC
+    increase = roundingBit != 0;
+    break;
+  }
+  if (increase) {
+    ++fraction;
+    if (fraction >> binaryPrecision) {
+      fraction >>= 1;
+      ++expo;
     }
   }
   // Package & return result
@@ -833,8 +848,8 @@ bool EditLogicalInput(IoStatementState &io, const DataEdit &edit, bool &x) {
         edit.descriptor);
     return false;
   }
-  std::optional<int> remaining{io.CueUpInput(edit)};
-  std::optional<char32_t> next{io.NextInField(remaining, edit)};
+  Fortran::common::optional<int> remaining{io.CueUpInput(edit)};
+  Fortran::common::optional<char32_t> next{io.NextInField(remaining, edit)};
   if (next && *next == '.') { // skip optional period
     next = io.NextInField(remaining, edit);
   }
@@ -916,8 +931,9 @@ static bool EditListDirectedCharacterInput(
   // or the end of the current record.  Subtlety: the "remaining" count
   // here is a dummy that's used to avoid the interpretation of separators
   // in NextInField.
-  std::optional<int> remaining{length > 0 ? maxUTF8Bytes : 0};
-  while (std::optional<char32_t> next{io.NextInField(remaining, edit)}) {
+  Fortran::common::optional<int> remaining{length > 0 ? maxUTF8Bytes : 0};
+  while (Fortran::common::optional<char32_t> next{
+      io.NextInField(remaining, edit)}) {
     bool isSep{false};
     switch (*next) {
     case ' ':
diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp
index b474c8cd91bae..7267540370fc0 100644
--- a/flang/runtime/edit-output.cpp
+++ b/flang/runtime/edit-output.cpp
@@ -9,6 +9,7 @@
 #include "edit-output.h"
 #include "emit-encoded.h"
 #include "utf.h"
+#include "flang/Common/real.h"
 #include "flang/Common/uint128.h"
 #include <algorithm>
 
@@ -700,7 +701,9 @@ bool RealOutputEditing<KIND>::EditEXOutput(const DataEdit &edit) {
   if ((editWidth == 0 && !edit.digits) || editDigits == 0) {
     // EX0 or EXw.0
     flags |= decimal::Minimize;
-    significantDigits = 28; // enough for 128-bit F.P.
+    static constexpr int maxSigHexDigits{
+        (common::PrecisionOfRealKind(16) + 3) / 4};
+    significantDigits = maxSigHexDigits;
   }
   auto converted{
       ConvertToHexadecimal(significantDigits, edit.modes.round, flags)};
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index 62d9ee2afd1ce..b74067a377774 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -23,7 +23,9 @@ extern char **environ;
 
 namespace Fortran::runtime {
 
-ExecutionEnvironment executionEnvironment;
+RT_OFFLOAD_VAR_GROUP_BEGIN
+RT_VAR_ATTRS ExecutionEnvironment executionEnvironment;
+RT_OFFLOAD_VAR_GROUP_END
 
 static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) {
   if (!envDefaults) {
@@ -47,7 +49,8 @@ static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) {
   }
 }
 
-std::optional<Convert> GetConvertFromString(const char *x, std::size_t n) {
+Fortran::common::optional<Convert> GetConvertFromString(
+    const char *x, std::size_t n) {
   static const char *keywords[]{
       "UNKNOWN", "NATIVE", "LITTLE_ENDIAN", "BIG_ENDIAN", "SWAP", nullptr};
   switch (IdentifyValue(x, n, keywords)) {
@@ -62,7 +65,7 @@ std::optional<Convert> GetConvertFromString(const char *x, std::size_t n) {
   case 4:
     return Convert::Swap;
   default:
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 }
 
@@ -123,6 +126,19 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
+  if (auto *x{std::getenv("FORT_CHECK_POINTER_DEALLOCATION")}) {
+    char *end;
+    auto n{std::strtol(x, &end, 10)};
+    if (n >= 0 && n <= 1 && *end == '\0') {
+      checkPointerDeallocation = n != 0;
+    } else {
+      std::fprintf(stderr,
+          "Fortran runtime: FORT_CHECK_POINTER_DEALLOCATION=%s is invalid; "
+          "ignored\n",
+          x);
+    }
+  }
+
   // TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment
 }
 
diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h
index 82a5ec8f4ebfb..9bc1158509615 100644
--- a/flang/runtime/environment.h
+++ b/flang/runtime/environment.h
@@ -9,8 +9,8 @@
 #ifndef FORTRAN_RUNTIME_ENVIRONMENT_H_
 #define FORTRAN_RUNTIME_ENVIRONMENT_H_
 
+#include "flang/Common/optional.h"
 #include "flang/Decimal/decimal.h"
-#include <optional>
 
 struct EnvironmentDefaultList;
 
@@ -29,10 +29,15 @@ constexpr bool isHostLittleEndian{true};
 // External unformatted I/O data conversions
 enum class Convert { Unknown, Native, LittleEndian, BigEndian, Swap };
 
-std::optional<Convert> GetConvertFromString(const char *, std::size_t);
+Fortran::common::optional<Convert> GetConvertFromString(
+    const char *, std::size_t);
 
 struct ExecutionEnvironment {
-  constexpr ExecutionEnvironment(){};
+#if !defined(_OPENMP)
+  // FIXME: https://github.com/llvm/llvm-project/issues/84942
+  constexpr
+#endif
+      ExecutionEnvironment(){};
   void Configure(int argc, const char *argv[], const char *envp[],
       const EnvironmentDefaultList *envDefaults);
   const char *GetEnv(
@@ -48,9 +53,13 @@ struct ExecutionEnvironment {
   Convert conversion{Convert::Unknown}; // FORT_CONVERT
   bool noStopMessage{false}; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP"
   bool defaultUTF8{false}; // DEFAULT_UTF8
+  bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION
 };
 
-extern ExecutionEnvironment executionEnvironment;
+RT_OFFLOAD_VAR_GROUP_BEGIN
+extern RT_VAR_ATTRS ExecutionEnvironment executionEnvironment;
+RT_OFFLOAD_VAR_GROUP_END
+
 } // namespace Fortran::runtime
 
 #endif // FORTRAN_RUNTIME_ENVIRONMENT_H_
diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp
index 61afb0458430d..d6e9633372f52 100644
--- a/flang/runtime/extrema.cpp
+++ b/flang/runtime/extrema.cpp
@@ -18,7 +18,6 @@
 #include <cfloat>
 #include <cinttypes>
 #include <cmath>
-#include <optional>
 #include <type_traits>
 
 namespace Fortran::runtime {
diff --git a/flang/runtime/file.cpp b/flang/runtime/file.cpp
index 5cf91b8d64c86..6ca5776f812a0 100644
--- a/flang/runtime/file.cpp
+++ b/flang/runtime/file.cpp
@@ -60,7 +60,7 @@ static int openfile_mkstemp(IoErrorHandler &handler) {
   return fd;
 }
 
-void OpenFile::Open(OpenStatus status, std::optional<Action> action,
+void OpenFile::Open(OpenStatus status, Fortran::common::optional<Action> action,
     Position position, IoErrorHandler &handler) {
   if (fd_ >= 0 &&
       (status == OpenStatus::Old || status == OpenStatus::Unknown)) {
@@ -322,7 +322,7 @@ int OpenFile::WriteAsynchronously(FileOffset at, const char *buffer,
 }
 
 void OpenFile::Wait(int id, IoErrorHandler &handler) {
-  std::optional<int> ioStat;
+  Fortran::common::optional<int> ioStat;
   Pending *prev{nullptr};
   for (Pending *p{pending_.get()}; p; p = (prev = p)->next.get()) {
     if (p->id == id) {
diff --git a/flang/runtime/file.h b/flang/runtime/file.h
index a0712136a7f3d..17deeb0e2f827 100644
--- a/flang/runtime/file.h
+++ b/flang/runtime/file.h
@@ -12,9 +12,9 @@
 #define FORTRAN_RUNTIME_FILE_H_
 
 #include "io-error.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/memory.h"
 #include <cinttypes>
-#include <optional>
 
 namespace Fortran::runtime::io {
 
@@ -37,10 +37,11 @@ class OpenFile {
   void set_mayAsynchronous(bool yes) { mayAsynchronous_ = yes; }
   bool isTerminal() const { return isTerminal_; }
   bool isWindowsTextFile() const { return isWindowsTextFile_; }
-  std::optional<FileOffset> knownSize() const { return knownSize_; }
+  Fortran::common::optional<FileOffset> knownSize() const { return knownSize_; }
 
   bool IsConnected() const { return fd_ >= 0; }
-  void Open(OpenStatus, std::optional<Action>, Position, IoErrorHandler &);
+  void Open(OpenStatus, Fortran::common::optional<Action>, Position,
+      IoErrorHandler &);
   void Predefine(int fd);
   void Close(CloseStatus, IoErrorHandler &);
 
@@ -94,9 +95,10 @@ class OpenFile {
   bool mayWrite_{false};
   bool mayPosition_{false};
   bool mayAsynchronous_{false};
-  std::optional<Position> openPosition_; // from Open(); reset after positioning
+  Fortran::common::optional<Position>
+      openPosition_; // from Open(); reset after positioning
   FileOffset position_{0};
-  std::optional<FileOffset> knownSize_;
+  Fortran::common::optional<FileOffset> knownSize_;
   bool isTerminal_{false};
   bool isWindowsTextFile_{false}; // expands LF to CR+LF on write
 
diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index 9c342db2e19a2..b84e3208271b7 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -66,15 +66,6 @@ template <typename CONTEXT>
 int FormatControl<CONTEXT>::GetIntField(
     IoErrorHandler &handler, CharType firstCh, bool *hadError) {
   CharType ch{firstCh ? firstCh : PeekNext()};
-  if (ch != '-' && ch != '+' && (ch < '0' || ch > '9')) {
-    handler.SignalError(IostatErrorInFormat,
-        "Invalid FORMAT: integer expected at '%c'", static_cast<char>(ch));
-    if (hadError) {
-      *hadError = true;
-    }
-    return 0;
-  }
-  int result{0};
   bool negate{ch == '-'};
   if (negate || ch == '+') {
     if (firstCh) {
@@ -84,6 +75,15 @@ int FormatControl<CONTEXT>::GetIntField(
     }
     ch = PeekNext();
   }
+  if (ch < '0' || ch > '9') {
+    handler.SignalError(IostatErrorInFormat,
+        "Invalid FORMAT: integer expected at '%c'", static_cast<char>(ch));
+    if (hadError) {
+      *hadError = true;
+    }
+    return 0;
+  }
+  int result{0};
   while (ch >= '0' && ch <= '9') {
     constexpr int tenth{std::numeric_limits<int>::max() / 10};
     if (result > tenth ||
@@ -233,7 +233,7 @@ int FormatControl<CONTEXT>::CueUpNextDataEdit(Context &context, bool stop) {
     }
   }
   while (true) {
-    std::optional<int> repeat;
+    Fortran::common::optional<int> repeat;
     bool unlimited{false};
     auto maybeReversionPoint{offset_};
     CharType ch{GetNextChar(context)};
@@ -246,8 +246,15 @@ int FormatControl<CONTEXT>::CueUpNextDataEdit(Context &context, bool stop) {
       ch = GetNextChar(context);
     }
     if (ch == '-' || ch == '+' || (ch >= '0' && ch <= '9')) {
+      bool hadSign{ch == '-' || ch == '+'};
       repeat = GetIntField(context, ch);
       ch = GetNextChar(context);
+      if (hadSign && ch != 'p' && ch != 'P') {
+        ReportBadFormat(context,
+            "Invalid FORMAT: signed integer may appear only before 'P",
+            maybeReversionPoint);
+        return 0;
+      }
     } else if (ch == '*') {
       unlimited = true;
       ch = GetNextChar(context);
@@ -297,11 +304,11 @@ int FormatControl<CONTEXT>::CueUpNextDataEdit(Context &context, bool stop) {
       return 0;
     } else if (ch == ')') {
       if (height_ == 1) {
+        hitEnd_ = true;
         if (stop) {
           return 0; // end of FORMAT and no data items remain
         }
         context.AdvanceRecord(); // implied / before rightmost )
-        hitEnd_ = true;
       }
       auto restart{stack_[height_ - 1].start};
       if (format_[restart] == '(') {
@@ -412,7 +419,7 @@ int FormatControl<CONTEXT>::CueUpNextDataEdit(Context &context, bool stop) {
 
 // Returns the next data edit descriptor
 template <typename CONTEXT>
-std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
+Fortran::common::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
     Context &context, int maxRepeat) {
   int repeat{CueUpNextDataEdit(context)};
   auto start{offset_};
@@ -444,7 +451,7 @@ std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
         }
         if (edit.ioTypeChars >= edit.maxIoTypeChars) {
           ReportBadFormat(context, "Excessive DT'iotype' in FORMAT", start);
-          return std::nullopt;
+          return Fortran::common::nullopt;
         }
         edit.ioType[edit.ioTypeChars++] = ch;
         if (ch == quote) {
@@ -453,7 +460,7 @@ std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
       }
       if (!ok) {
         ReportBadFormat(context, "Unclosed DT'iotype' in FORMAT", start);
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
     }
     if (PeekNext() == '(') {
@@ -468,7 +475,7 @@ std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
         }
         if (edit.vListEntries >= edit.maxVListEntries) {
           ReportBadFormat(context, "Excessive DT(v_list) in FORMAT", start);
-          return std::nullopt;
+          return Fortran::common::nullopt;
         }
         edit.vList[edit.vListEntries++] = n;
         auto ch{static_cast<char>(GetNextChar(context))};
@@ -479,7 +486,7 @@ std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
       }
       if (!ok) {
         ReportBadFormat(context, "Unclosed DT(v_list) in FORMAT", start);
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
     }
   } else { // not DT'iotype'
diff --git a/flang/runtime/format.h b/flang/runtime/format.h
index 1fe0802ac43c6..e7d9455996404 100644
--- a/flang/runtime/format.h
+++ b/flang/runtime/format.h
@@ -14,9 +14,9 @@
 #include "environment.h"
 #include "io-error.h"
 #include "flang/Common/Fortran.h"
+#include "flang/Common/optional.h"
 #include "flang/Decimal/decimal.h"
 #include <cinttypes>
-#include <optional>
 
 namespace Fortran::runtime {
 class Descriptor;
@@ -64,9 +64,9 @@ struct DataEdit {
   static constexpr char DefinedDerivedType{'d'}; // DT defined I/O
 
   char variation{'\0'}; // N, S, or X for EN, ES, EX; G/l for original G/list
-  std::optional<int> width; // the 'w' field; optional for A
-  std::optional<int> digits; // the 'm' or 'd' field
-  std::optional<int> expoDigits; // 'Ee' field
+  Fortran::common::optional<int> width; // the 'w' field; optional for A
+  Fortran::common::optional<int> digits; // the 'm' or 'd' field
+  Fortran::common::optional<int> expoDigits; // 'Ee' field
   MutableModes modes;
   int repeat{1};
 
@@ -102,7 +102,8 @@ template <typename CONTEXT> class FormatControl {
   // Extracts the next data edit descriptor, handling control edit descriptors
   // along the way.  If maxRepeat==0, this is a peek at the next data edit
   // descriptor.
-  std::optional<DataEdit> GetNextDataEdit(Context &, int maxRepeat = 1);
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
+      Context &, int maxRepeat = 1);
 
   // Emit any remaining character literals after the last data item (on output)
   // and perform remaining record positioning actions.
diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h
index bdc11ae93ac90..682b4c9b89294 100644
--- a/flang/runtime/freestanding-tools.h
+++ b/flang/runtime/freestanding-tools.h
@@ -42,6 +42,11 @@
 #define STD_REALLOC_UNSUPPORTED 1
 #endif
 
+#if !defined(STD_MEMCHR_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_MEMCHR_UNSUPPORTED 1
+#endif
+
 namespace Fortran::runtime {
 
 #if STD_FILL_N_UNSUPPORTED
@@ -134,5 +139,23 @@ static inline RT_API_ATTRS void *realloc(void *ptr, std::size_t newByteSize) {
 using std::realloc;
 #endif // !STD_REALLOC_UNSUPPORTED
 
+#if STD_MEMCHR_UNSUPPORTED
+// Provides alternative implementation for std::memchr(), if
+// it is not supported.
+static inline RT_API_ATTRS const void *memchr(
+    const void *ptr, int ch, std::size_t count) {
+  auto buf{reinterpret_cast<const unsigned char *>(ptr)};
+  auto c{static_cast<unsigned char>(ch)};
+  for (; count--; ++buf) {
+    if (*buf == c) {
+      return buf;
+    }
+  }
+  return nullptr;
+}
+#else // !STD_MEMCMP_UNSUPPORTED
+using std::memchr;
+#endif // !STD_MEMCMP_UNSUPPORTED
+
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
diff --git a/flang/runtime/internal-unit.cpp b/flang/runtime/internal-unit.cpp
index e3fffaa6f378f..66140e0058872 100644
--- a/flang/runtime/internal-unit.cpp
+++ b/flang/runtime/internal-unit.cpp
@@ -41,16 +41,6 @@ InternalDescriptorUnit<DIR>::InternalDescriptorUnit(
   endfileRecordNumber = d.Elements() + 1;
 }
 
-template <Direction DIR> void InternalDescriptorUnit<DIR>::EndIoStatement() {
-  if constexpr (DIR == Direction::Output) {
-    // Clear the remainder of the current record.
-    auto end{endfileRecordNumber.value_or(0)};
-    if (currentRecordNumber < end) {
-      BlankFillOutputRecord();
-    }
-  }
-}
-
 template <Direction DIR>
 bool InternalDescriptorUnit<DIR>::Emit(
     const char *data, std::size_t bytes, IoErrorHandler &handler) {
@@ -109,7 +99,11 @@ std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
 template <Direction DIR>
 bool InternalDescriptorUnit<DIR>::AdvanceRecord(IoErrorHandler &handler) {
   if (currentRecordNumber >= endfileRecordNumber.value_or(0)) {
-    handler.SignalEnd();
+    if constexpr (DIR == Direction::Input) {
+      handler.SignalEnd();
+    } else {
+      handler.SignalError(IostatInternalWriteOverrun);
+    }
     return false;
   }
   if constexpr (DIR == Direction::Output) {
diff --git a/flang/runtime/internal-unit.h b/flang/runtime/internal-unit.h
index f0c50aac98878..b536ffb831d55 100644
--- a/flang/runtime/internal-unit.h
+++ b/flang/runtime/internal-unit.h
@@ -28,7 +28,6 @@ template <Direction DIR> class InternalDescriptorUnit : public ConnectionState {
       std::conditional_t<DIR == Direction::Input, const char *, char *>;
   InternalDescriptorUnit(Scalar, std::size_t chars, int kind);
   InternalDescriptorUnit(const Descriptor &, const Terminator &);
-  void EndIoStatement();
 
   bool Emit(const char *, std::size_t, IoErrorHandler &);
   std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp
index 0ac20d3346dd5..094db5572f15c 100644
--- a/flang/runtime/io-api.cpp
+++ b/flang/runtime/io-api.cpp
@@ -18,6 +18,7 @@
 #include "terminator.h"
 #include "tools.h"
 #include "unit.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/memory.h"
 #include <cstdlib>
@@ -168,7 +169,7 @@ static Cookie NoopUnit(const Terminator &terminator, int unitNumber,
 }
 
 static ExternalFileUnit *GetOrCreateUnit(int unitNumber, Direction direction,
-    std::optional<bool> isUnformatted, const Terminator &terminator,
+    Fortran::common::optional<bool> isUnformatted, const Terminator &terminator,
     Cookie &errorCookie) {
   if (ExternalFileUnit *
       unit{ExternalFileUnit::LookUpOrCreateAnonymous(
@@ -472,8 +473,8 @@ Cookie IONAME(BeginEndfile)(
   Terminator terminator{sourceFile, sourceLine};
   Cookie errorCookie{nullptr};
   if (ExternalFileUnit *
-      unit{GetOrCreateUnit(unitNumber, Direction::Output, std::nullopt,
-          terminator, errorCookie)}) {
+      unit{GetOrCreateUnit(unitNumber, Direction::Output,
+          Fortran::common::nullopt, terminator, errorCookie)}) {
     if (ChildIo * child{unit->GetChildIo()}) {
       return &child->BeginIoStatement<ErroneousIoStatementState>(
           IostatBadOpOnChildUnit, nullptr /* no unit */, sourceFile,
@@ -492,8 +493,8 @@ Cookie IONAME(BeginRewind)(
   Terminator terminator{sourceFile, sourceLine};
   Cookie errorCookie{nullptr};
   if (ExternalFileUnit *
-      unit{GetOrCreateUnit(unitNumber, Direction::Input, std::nullopt,
-          terminator, errorCookie)}) {
+      unit{GetOrCreateUnit(unitNumber, Direction::Input,
+          Fortran::common::nullopt, terminator, errorCookie)}) {
     if (ChildIo * child{unit->GetChildIo()}) {
       return &child->BeginIoStatement<ErroneousIoStatementState>(
           IostatBadOpOnChildUnit, nullptr /* no unit */, sourceFile,
@@ -801,7 +802,7 @@ bool IONAME(SetAction)(Cookie cookie, const char *keyword, std::size_t length) {
     io.GetIoErrorHandler().Crash(
         "SetAction() called after GetNewUnit() for an OPEN statement");
   }
-  std::optional<Action> action;
+  Fortran::common::optional<Action> action;
   static const char *keywords[]{"READ", "WRITE", "READWRITE", nullptr};
   switch (IdentifyValue(keyword, length, keywords)) {
   case 0:
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index 3ec01ffba9bf0..075d7b5ae518a 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -39,9 +39,9 @@ bool IoStatementBase::Receive(char *, std::size_t, std::size_t) {
   return false;
 }
 
-std::optional<DataEdit> IoStatementBase::GetNextDataEdit(
+Fortran::common::optional<DataEdit> IoStatementBase::GetNextDataEdit(
     IoStatementState &, int) {
-  return std::nullopt;
+  return Fortran::common::nullopt;
 }
 
 ExternalFileUnit *IoStatementBase::GetExternalFileUnit() const {
@@ -119,9 +119,6 @@ template <Direction DIR> void InternalIoStatementState<DIR>::BackspaceRecord() {
 }
 
 template <Direction DIR> int InternalIoStatementState<DIR>::EndIoStatement() {
-  if constexpr (DIR == Direction::Output) {
-    unit_.EndIoStatement(); // fill
-  }
   auto result{IoStatementBase::EndIoStatement()};
   if (free_) {
     FreeMemory(this);
@@ -165,7 +162,8 @@ template <Direction DIR, typename CHAR>
 void InternalFormattedIoStatementState<DIR, CHAR>::CompleteOperation() {
   if (!this->completedOperation()) {
     if constexpr (DIR == Direction::Output) {
-      format_.Finish(*this); // ignore any remaining input positioning actions
+      format_.Finish(*this);
+      unit_.AdvanceRecord(*this);
     }
     IoStatementBase::CompleteOperation();
   }
@@ -189,8 +187,21 @@ InternalListIoStatementState<DIR>::InternalListIoStatementState(
     : InternalIoStatementState<DIR>{d, sourceFile, sourceLine},
       ioStatementState_{*this} {}
 
+template <Direction DIR>
+void InternalListIoStatementState<DIR>::CompleteOperation() {
+  if (!this->completedOperation()) {
+    if constexpr (DIR == Direction::Output) {
+      if (unit_.furthestPositionInRecord > 0) {
+        unit_.AdvanceRecord(*this);
+      }
+    }
+    IoStatementBase::CompleteOperation();
+  }
+}
+
 template <Direction DIR>
 int InternalListIoStatementState<DIR>::EndIoStatement() {
+  CompleteOperation();
   if constexpr (DIR == Direction::Input) {
     if (int status{ListDirectedStatementState<DIR>::EndIoStatement()};
         status != IostatOk) {
@@ -455,7 +466,7 @@ int ExternalFormattedIoStatementState<DIR, CHAR>::EndIoStatement() {
   return ExternalIoStatementState<DIR>::EndIoStatement();
 }
 
-std::optional<DataEdit> IoStatementState::GetNextDataEdit(int n) {
+Fortran::common::optional<DataEdit> IoStatementState::GetNextDataEdit(int n) {
   return common::visit(
       [&](auto &x) { return x.get().GetNextDataEdit(*this, n); }, u_);
 }
@@ -530,13 +541,13 @@ ExternalFileUnit *IoStatementState::GetExternalFileUnit() const {
       [](auto &x) { return x.get().GetExternalFileUnit(); }, u_);
 }
 
-std::optional<char32_t> IoStatementState::GetCurrentChar(
+Fortran::common::optional<char32_t> IoStatementState::GetCurrentChar(
     std::size_t &byteCount) {
   const char *p{nullptr};
   std::size_t bytes{GetNextInputBytes(p)};
   if (bytes == 0) {
     byteCount = 0;
-    return std::nullopt;
+    return Fortran::common::nullopt;
   } else {
     const ConnectionState &connection{GetConnectionState()};
     if (connection.isUTF8) {
@@ -562,8 +573,8 @@ std::optional<char32_t> IoStatementState::GetCurrentChar(
   }
 }
 
-std::optional<char32_t> IoStatementState::NextInField(
-    std::optional<int> &remaining, const DataEdit &edit) {
+Fortran::common::optional<char32_t> IoStatementState::NextInField(
+    Fortran::common::optional<int> &remaining, const DataEdit &edit) {
   std::size_t byteCount{0};
   if (!remaining) { // Stream, list-directed, or NAMELIST
     if (auto next{GetCurrentChar(byteCount)}) {
@@ -579,21 +590,21 @@ std::optional<char32_t> IoStatementState::NextInField(
         case '"':
         case '*':
         case '\n': // for stream access
-          return std::nullopt;
+          return Fortran::common::nullopt;
         case '&':
         case '$':
           if (edit.IsNamelist()) {
-            return std::nullopt;
+            return Fortran::common::nullopt;
           }
           break;
         case ',':
           if (!(edit.modes.editingFlags & decimalComma)) {
-            return std::nullopt;
+            return Fortran::common::nullopt;
           }
           break;
         case ';':
           if (edit.modes.editingFlags & decimalComma) {
-            return std::nullopt;
+            return Fortran::common::nullopt;
           }
           break;
         default:
@@ -607,7 +618,7 @@ std::optional<char32_t> IoStatementState::NextInField(
   } else if (*remaining > 0) {
     if (auto next{GetCurrentChar(byteCount)}) {
       if (byteCount > static_cast<std::size_t>(*remaining)) {
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
       *remaining -= byteCount;
       HandleRelativePosition(byteCount);
@@ -616,10 +627,10 @@ std::optional<char32_t> IoStatementState::NextInField(
     }
     if (CheckForEndOfRecord(0)) { // do padding
       --*remaining;
-      return std::optional<char32_t>{' '};
+      return Fortran::common::optional<char32_t>{' '};
     }
   }
-  return std::nullopt;
+  return Fortran::common::nullopt;
 }
 
 bool IoStatementState::CheckForEndOfRecord(std::size_t afterReading) {
@@ -711,7 +722,7 @@ bool ListDirectedStatementState<Direction::Output>::EmitLeadingSpaceOrAdvance(
   return true;
 }
 
-std::optional<DataEdit>
+Fortran::common::optional<DataEdit>
 ListDirectedStatementState<Direction::Output>::GetNextDataEdit(
     IoStatementState &io, int maxRepeat) {
   DataEdit edit;
@@ -728,7 +739,7 @@ int ListDirectedStatementState<Direction::Input>::EndIoStatement() {
   return IostatOk;
 }
 
-std::optional<DataEdit>
+Fortran::common::optional<DataEdit>
 ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
     IoStatementState &io, int maxRepeat) {
   // N.B. list-directed transfers cannot be nonadvancing (C1221)
@@ -784,7 +795,7 @@ ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
   }
   eatComma_ = true;
   if (!ch) {
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
   if (*ch == '/') {
     hitSlash_ = true;
diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index 0b6bcbd9af025..e00d54980aae5 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -16,6 +16,8 @@
 #include "format.h"
 #include "internal-unit.h"
 #include "io-error.h"
+#include "flang/Common/optional.h"
+#include "flang/Common/reference-wrapper.h"
 #include "flang/Common/visit.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/io-api.h"
@@ -94,7 +96,7 @@ class IoStatementState {
   void BackspaceRecord();
   void HandleRelativePosition(std::int64_t byteOffset);
   void HandleAbsolutePosition(std::int64_t byteOffset); // for r* in list I/O
-  std::optional<DataEdit> GetNextDataEdit(int maxRepeat = 1);
+  Fortran::common::optional<DataEdit> GetNextDataEdit(int maxRepeat = 1);
   ExternalFileUnit *GetExternalFileUnit() const; // null if internal unit
   bool BeginReadingRecord();
   void FinishReadingRecord();
@@ -122,7 +124,7 @@ class IoStatementState {
   }
 
   // Vacant after the end of the current record
-  std::optional<char32_t> GetCurrentChar(std::size_t &byteCount);
+  Fortran::common::optional<char32_t> GetCurrentChar(std::size_t &byteCount);
 
   // The "remaining" arguments to CueUpInput(), SkipSpaces(), & NextInField()
   // are always in units of bytes, not characters; the distinction matters
@@ -130,8 +132,8 @@ class IoStatementState {
 
   // For fixed-width fields, return the number of remaining bytes.
   // Skip over leading blanks.
-  std::optional<int> CueUpInput(const DataEdit &edit) {
-    std::optional<int> remaining;
+  Fortran::common::optional<int> CueUpInput(const DataEdit &edit) {
+    Fortran::common::optional<int> remaining;
     if (edit.IsListDirected()) {
       std::size_t byteCount{0};
       GetNextNonBlank(byteCount);
@@ -148,7 +150,8 @@ class IoStatementState {
     return remaining;
   }
 
-  std::optional<char32_t> SkipSpaces(std::optional<int> &remaining) {
+  Fortran::common::optional<char32_t> SkipSpaces(
+      Fortran::common::optional<int> &remaining) {
     while (!remaining || *remaining > 0) {
       std::size_t byteCount{0};
       if (auto ch{GetCurrentChar(byteCount)}) {
@@ -167,27 +170,27 @@ class IoStatementState {
         break;
       }
     }
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 
   // Acquires the next input character, respecting any applicable field width
   // or separator character.
-  std::optional<char32_t> NextInField(
-      std::optional<int> &remaining, const DataEdit &);
+  Fortran::common::optional<char32_t> NextInField(
+      Fortran::common::optional<int> &remaining, const DataEdit &);
 
   // Detect and signal any end-of-record condition after input.
   // Returns true if at EOR and remaining input should be padded with blanks.
   bool CheckForEndOfRecord(std::size_t afterReading);
 
   // Skips spaces, advances records, and ignores NAMELIST comments
-  std::optional<char32_t> GetNextNonBlank(std::size_t &byteCount) {
+  Fortran::common::optional<char32_t> GetNextNonBlank(std::size_t &byteCount) {
     auto ch{GetCurrentChar(byteCount)};
     bool inNamelist{mutableModes().inNamelist};
     while (!ch || *ch == ' ' || *ch == '\t' || (inNamelist && *ch == '!')) {
       if (ch && (*ch == ' ' || *ch == '\t')) {
         HandleRelativePosition(byteCount);
       } else if (!AdvanceRecord()) {
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
       ch = GetCurrentChar(byteCount);
     }
@@ -208,39 +211,47 @@ class IoStatementState {
   }
 
 private:
-  std::variant<std::reference_wrapper<OpenStatementState>,
-      std::reference_wrapper<CloseStatementState>,
-      std::reference_wrapper<NoopStatementState>,
-      std::reference_wrapper<
+  std::variant<Fortran::common::reference_wrapper<OpenStatementState>,
+      Fortran::common::reference_wrapper<CloseStatementState>,
+      Fortran::common::reference_wrapper<NoopStatementState>,
+      Fortran::common::reference_wrapper<
           InternalFormattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
           InternalFormattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<InternalListIoStatementState<Direction::Output>>,
-      std::reference_wrapper<InternalListIoStatementState<Direction::Input>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
+          InternalListIoStatementState<Direction::Output>>,
+      Fortran::common::reference_wrapper<
+          InternalListIoStatementState<Direction::Input>>,
+      Fortran::common::reference_wrapper<
           ExternalFormattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
           ExternalFormattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<ExternalListIoStatementState<Direction::Output>>,
-      std::reference_wrapper<ExternalListIoStatementState<Direction::Input>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
+          ExternalListIoStatementState<Direction::Output>>,
+      Fortran::common::reference_wrapper<
+          ExternalListIoStatementState<Direction::Input>>,
+      Fortran::common::reference_wrapper<
           ExternalUnformattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
           ExternalUnformattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<ChildFormattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<ChildFormattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<ChildListIoStatementState<Direction::Output>>,
-      std::reference_wrapper<ChildListIoStatementState<Direction::Input>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
+          ChildFormattedIoStatementState<Direction::Output>>,
+      Fortran::common::reference_wrapper<
+          ChildFormattedIoStatementState<Direction::Input>>,
+      Fortran::common::reference_wrapper<
+          ChildListIoStatementState<Direction::Output>>,
+      Fortran::common::reference_wrapper<
+          ChildListIoStatementState<Direction::Input>>,
+      Fortran::common::reference_wrapper<
           ChildUnformattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
           ChildUnformattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<InquireUnitState>,
-      std::reference_wrapper<InquireNoUnitState>,
-      std::reference_wrapper<InquireUnconnectedFileState>,
-      std::reference_wrapper<InquireIOLengthState>,
-      std::reference_wrapper<ExternalMiscIoStatementState>,
-      std::reference_wrapper<ErroneousIoStatementState>>
+      Fortran::common::reference_wrapper<InquireUnitState>,
+      Fortran::common::reference_wrapper<InquireNoUnitState>,
+      Fortran::common::reference_wrapper<InquireUnconnectedFileState>,
+      Fortran::common::reference_wrapper<InquireIOLengthState>,
+      Fortran::common::reference_wrapper<ExternalMiscIoStatementState>,
+      Fortran::common::reference_wrapper<ErroneousIoStatementState>>
       u_;
 };
 
@@ -262,7 +273,7 @@ class IoStatementBase : public IoErrorHandler {
   void BackspaceRecord();
   void HandleRelativePosition(std::int64_t);
   void HandleAbsolutePosition(std::int64_t);
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1);
   ExternalFileUnit *GetExternalFileUnit() const;
   bool BeginReadingRecord();
@@ -287,7 +298,7 @@ class ListDirectedStatementState<Direction::Output>
 public:
   bool EmitLeadingSpaceOrAdvance(
       IoStatementState &, std::size_t = 1, bool isCharacter = false);
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1);
   bool lastWasUndelimitedCharacter() const {
     return lastWasUndelimitedCharacter_;
@@ -309,7 +320,7 @@ class ListDirectedStatementState<Direction::Input>
   // Skips value separators, handles repetition and null values.
   // Vacant when '/' appears; present with descriptor == ListDirectedNullValue
   // when a null value appears.
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1);
 
   // Each NAMELIST input item is treated like a distinct list-directed
@@ -328,7 +339,7 @@ class ListDirectedStatementState<Direction::Input>
 
 private:
   int remaining_{0}; // for "r*" repetition
-  std::optional<SavedPosition> repeatPosition_;
+  Fortran::common::optional<SavedPosition> repeatPosition_;
   bool eatComma_{false}; // consume comma after previously read item
   bool hitSlash_{false}; // once '/' is seen, nullify further items
   bool realPart_{false};
@@ -380,7 +391,7 @@ class InternalFormattedIoStatementState
   IoStatementState &ioStatementState() { return ioStatementState_; }
   void CompleteOperation();
   int EndIoStatement();
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1) {
     return format_.GetNextDataEdit(*this, maxRepeat);
   }
@@ -403,6 +414,7 @@ class InternalListIoStatementState : public InternalIoStatementState<DIR>,
       const Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
   IoStatementState &ioStatementState() { return ioStatementState_; }
   using ListDirectedStatementState<DIR>::GetNextDataEdit;
+  void CompleteOperation();
   int EndIoStatement();
 
 private:
@@ -464,7 +476,7 @@ class ExternalFormattedIoStatementState
       const char *sourceFile = nullptr, int sourceLine = 0);
   void CompleteOperation();
   int EndIoStatement();
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1) {
     return format_.GetNextDataEdit(*this, maxRepeat);
   }
@@ -522,7 +534,7 @@ class ChildFormattedIoStatementState : public ChildIoStatementState<DIR>,
   void CompleteOperation();
   int EndIoStatement();
   bool AdvanceRecord(int = 1);
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1) {
     return format_.GetNextDataEdit(*this, maxRepeat);
   }
@@ -570,14 +582,14 @@ class OpenStatementState : public ExternalIoStatementBase {
 private:
   bool wasExtant_;
   bool isNewUnit_;
-  std::optional<OpenStatus> status_;
-  std::optional<Position> position_;
-  std::optional<Action> action_;
+  Fortran::common::optional<OpenStatus> status_;
+  Fortran::common::optional<Position> position_;
+  Fortran::common::optional<Action> action_;
   Convert convert_{Convert::Unknown};
   OwningPtr<char> path_;
   std::size_t pathLength_;
-  std::optional<bool> isUnformatted_;
-  std::optional<Access> access_;
+  Fortran::common::optional<bool> isUnformatted_;
+  Fortran::common::optional<Access> access_;
 };
 
 class CloseStatementState : public ExternalIoStatementBase {
diff --git a/flang/runtime/matmul-transpose.cpp b/flang/runtime/matmul-transpose.cpp
index ee5fcd842b025..a12d188266f7c 100644
--- a/flang/runtime/matmul-transpose.cpp
+++ b/flang/runtime/matmul-transpose.cpp
@@ -23,6 +23,7 @@
 #include "flang/Runtime/matmul-transpose.h"
 #include "terminator.h"
 #include "tools.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/c-or-cpp.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
@@ -96,8 +97,8 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
-    SubscriptValue n, std::optional<std::size_t> xColumnByteStride,
-    std::optional<std::size_t> yColumnByteStride) {
+    SubscriptValue n, Fortran::common::optional<std::size_t> xColumnByteStride,
+    Fortran::common::optional<std::size_t> yColumnByteStride) {
   if (!xColumnByteStride) {
     if (!yColumnByteStride) {
       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
@@ -163,7 +164,7 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
     SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
-    std::optional<std::size_t> xColumnByteStride) {
+    Fortran::common::optional<std::size_t> xColumnByteStride) {
   if (!xColumnByteStride) {
     MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, false>(
         product, rows, n, x, y);
@@ -229,7 +230,7 @@ inline static RT_API_ATTRS void DoMatmulTranspose(
         (IS_ALLOCATING || result.IsContiguous())) {
       // Contiguous numeric matrices (maybe with columns
       // separated by a stride).
-      std::optional<std::size_t> xColumnByteStride;
+      Fortran::common::optional<std::size_t> xColumnByteStride;
       if (!x.IsContiguous()) {
         // X's columns are strided.
         SubscriptValue xAt[2]{};
@@ -237,7 +238,7 @@ inline static RT_API_ATTRS void DoMatmulTranspose(
         xAt[1]++;
         xColumnByteStride = x.SubscriptsToByteOffset(xAt);
       }
-      std::optional<std::size_t> yColumnByteStride;
+      Fortran::common::optional<std::size_t> yColumnByteStride;
       if (!y.IsContiguous()) {
         // Y's columns are strided.
         SubscriptValue yAt[2]{};
diff --git a/flang/runtime/matmul.cpp b/flang/runtime/matmul.cpp
index e4595db779260..543284cb5c363 100644
--- a/flang/runtime/matmul.cpp
+++ b/flang/runtime/matmul.cpp
@@ -22,6 +22,7 @@
 #include "flang/Runtime/matmul.h"
 #include "terminator.h"
 #include "tools.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/c-or-cpp.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
@@ -116,8 +117,8 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 inline RT_API_ATTRS void MatrixTimesMatrixHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
-    SubscriptValue n, std::optional<std::size_t> xColumnByteStride,
-    std::optional<std::size_t> yColumnByteStride) {
+    SubscriptValue n, Fortran::common::optional<std::size_t> xColumnByteStride,
+    Fortran::common::optional<std::size_t> yColumnByteStride) {
   if (!xColumnByteStride) {
     if (!yColumnByteStride) {
       MatrixTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
@@ -183,7 +184,7 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 inline RT_API_ATTRS void MatrixTimesVectorHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
     SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
-    std::optional<std::size_t> xColumnByteStride) {
+    Fortran::common::optional<std::size_t> xColumnByteStride) {
   if (!xColumnByteStride) {
     MatrixTimesVector<RCAT, RKIND, XT, YT, false>(product, rows, n, x, y);
   } else {
@@ -240,7 +241,7 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
 inline RT_API_ATTRS void VectorTimesMatrixHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue n,
     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
-    std::optional<std::size_t> yColumnByteStride) {
+    Fortran::common::optional<std::size_t> yColumnByteStride) {
   if (!yColumnByteStride) {
     VectorTimesMatrix<RCAT, RKIND, XT, YT, false>(product, n, cols, x, y);
   } else {
@@ -301,7 +302,7 @@ static inline RT_API_ATTRS void DoMatmul(
         (IS_ALLOCATING || result.IsContiguous())) {
       // Contiguous numeric matrices (maybe with columns
       // separated by a stride).
-      std::optional<std::size_t> xColumnByteStride;
+      Fortran::common::optional<std::size_t> xColumnByteStride;
       if (!x.IsContiguous()) {
         // X's columns are strided.
         SubscriptValue xAt[2]{};
@@ -309,7 +310,7 @@ static inline RT_API_ATTRS void DoMatmul(
         xAt[1]++;
         xColumnByteStride = x.SubscriptsToByteOffset(xAt);
       }
-      std::optional<std::size_t> yColumnByteStride;
+      Fortran::common::optional<std::size_t> yColumnByteStride;
       if (!y.IsContiguous()) {
         // Y's columns are strided.
         SubscriptValue yAt[2]{};
diff --git a/flang/runtime/misc-intrinsic.cpp b/flang/runtime/misc-intrinsic.cpp
index 56f2028c2ff02..f5b292a1f3d32 100644
--- a/flang/runtime/misc-intrinsic.cpp
+++ b/flang/runtime/misc-intrinsic.cpp
@@ -9,16 +9,16 @@
 #include "flang/Runtime/misc-intrinsic.h"
 #include "terminator.h"
 #include "tools.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <cstring>
-#include <optional>
 
 namespace Fortran::runtime {
 
 static RT_API_ATTRS void TransferImpl(Descriptor &result,
     const Descriptor &source, const Descriptor &mold, const char *sourceFile,
-    int line, std::optional<std::int64_t> resultExtent) {
+    int line, Fortran::common::optional<std::int64_t> resultExtent) {
   int rank{resultExtent.has_value() ? 1 : 0};
   std::size_t elementBytes{mold.ElementBytes()};
   result.Establish(mold.type(), elementBytes, nullptr, rank, nullptr,
@@ -57,7 +57,7 @@ RT_EXT_API_GROUP_BEGIN
 
 void RTDEF(Transfer)(Descriptor &result, const Descriptor &source,
     const Descriptor &mold, const char *sourceFile, int line) {
-  std::optional<std::int64_t> elements;
+  Fortran::common::optional<std::int64_t> elements;
   if (mold.rank() > 0) {
     if (std::size_t sourceElementBytes{
             source.Elements() * source.ElementBytes()}) {
diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp
index f5d7404fdcb86..ac9234f4af832 100644
--- a/flang/runtime/namelist.cpp
+++ b/flang/runtime/namelist.cpp
@@ -116,10 +116,11 @@ static bool GetLowerCaseName(
   return false;
 }
 
-static std::optional<SubscriptValue> GetSubscriptValue(IoStatementState &io) {
-  std::optional<SubscriptValue> value;
+static Fortran::common::optional<SubscriptValue> GetSubscriptValue(
+    IoStatementState &io) {
+  Fortran::common::optional<SubscriptValue> value;
   std::size_t byteCount{0};
-  std::optional<char32_t> ch{io.GetCurrentChar(byteCount)};
+  Fortran::common::optional<char32_t> ch{io.GetCurrentChar(byteCount)};
   bool negate{ch && *ch == '-'};
   if ((ch && *ch == '+') || negate) {
     io.HandleRelativePosition(byteCount);
@@ -136,7 +137,7 @@ static std::optional<SubscriptValue> GetSubscriptValue(IoStatementState &io) {
   if (overflow) {
     io.GetIoErrorHandler().SignalError(
         "NAMELIST input subscript value overflow");
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
   if (negate) {
     if (value) {
@@ -158,7 +159,7 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc,
   std::size_t contiguousStride{source.ElementBytes()};
   bool ok{true};
   std::size_t byteCount{0};
-  std::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
+  Fortran::common::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
   char32_t comma{GetComma(io)};
   for (; ch && *ch != ')'; ++j) {
     SubscriptValue dimLower{0}, dimUpper{0}, dimStride{0};
@@ -282,9 +283,9 @@ static bool HandleSubstring(
   SubscriptValue chars{static_cast<SubscriptValue>(desc.ElementBytes()) / kind};
   // Allow for blanks in substring bounds; they're nonstandard, but not
   // ambiguous within the parentheses.
-  std::optional<SubscriptValue> lower, upper;
+  Fortran::common::optional<SubscriptValue> lower, upper;
   std::size_t byteCount{0};
-  std::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
+  Fortran::common::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
   if (ch) {
     if (*ch == ':') {
       lower = 1;
@@ -346,7 +347,8 @@ static bool HandleComponent(IoStatementState &io, Descriptor &desc,
           // If base and component are both arrays, the component name
           // must be followed by subscripts; process them now.
           std::size_t byteCount{0};
-          if (std::optional<char32_t> next{io.GetNextNonBlank(byteCount)};
+          if (Fortran::common::optional<char32_t> next{
+                  io.GetNextNonBlank(byteCount)};
               next && *next == '(') {
             io.HandleRelativePosition(byteCount); // skip over '('
             StaticDescriptor<maxRank, true, 16> staticDesc;
@@ -435,7 +437,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
   RUNTIME_CHECK(handler, listInput != nullptr);
   // Find this namelist group's header in the input
   io.BeginReadingRecord();
-  std::optional<char32_t> next;
+  Fortran::common::optional<char32_t> next;
   char name[nameBufferSize];
   RUNTIME_CHECK(handler, group.groupName != nullptr);
   char32_t comma{GetComma(io)};
diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h
index ecc3b2654d965..8ea3daaa57bcf 100644
--- a/flang/runtime/numeric-templates.h
+++ b/flang/runtime/numeric-templates.h
@@ -122,12 +122,19 @@ template <typename T> struct ABSTy {
   static constexpr RT_API_ATTRS T compute(T x) { return std::abs(x); }
 };
 
+// Suppress the warnings about calling __host__-only
+// 'long double' std::frexp, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
 template <typename T> struct FREXPTy {
   static constexpr RT_API_ATTRS T compute(T x, int *e) {
     return std::frexp(x, e);
   }
 };
 
+RT_DIAG_POP
+
 template <typename T> struct ILOGBTy {
   static constexpr RT_API_ATTRS int compute(T x) { return std::ilogb(x); }
 };
diff --git a/flang/runtime/pointer.cpp b/flang/runtime/pointer.cpp
index fc9e0eeb7dac9..b01735dc30e69 100644
--- a/flang/runtime/pointer.cpp
+++ b/flang/runtime/pointer.cpp
@@ -9,6 +9,7 @@
 #include "flang/Runtime/pointer.h"
 #include "assign-impl.h"
 #include "derived.h"
+#include "environment.h"
 #include "stat.h"
 #include "terminator.h"
 #include "tools.h"
@@ -184,18 +185,23 @@ int RTDEF(PointerDeallocate)(Descriptor &pointer, bool hasStat,
   if (!pointer.IsAllocated()) {
     return ReturnError(terminator, StatBaseNull, errMsg, hasStat);
   }
-  // Validate the footer.  This should fail if the pointer doesn't
-  // span the entire object, or the object was not allocated as a
-  // pointer.
-  std::size_t byteSize{pointer.Elements() * pointer.ElementBytes()};
-  constexpr std::size_t align{sizeof(std::uintptr_t)};
-  byteSize = ((byteSize + align - 1) / align) * align;
-  void *p{pointer.raw().base_addr};
-  std::uintptr_t *footer{
-      reinterpret_cast<std::uintptr_t *>(static_cast<char *>(p) + byteSize)};
-  if (*footer != ~reinterpret_cast<std::uintptr_t>(p)) {
-    return ReturnError(terminator, StatBadPointerDeallocation, errMsg, hasStat);
+#if !defined(RT_DEVICE_COMPILATION)
+  if (executionEnvironment.checkPointerDeallocation) {
+    // Validate the footer.  This should fail if the pointer doesn't
+    // span the entire object, or the object was not allocated as a
+    // pointer.
+    std::size_t byteSize{pointer.Elements() * pointer.ElementBytes()};
+    constexpr std::size_t align{sizeof(std::uintptr_t)};
+    byteSize = ((byteSize + align - 1) / align) * align;
+    void *p{pointer.raw().base_addr};
+    std::uintptr_t *footer{
+        reinterpret_cast<std::uintptr_t *>(static_cast<char *>(p) + byteSize)};
+    if (*footer != ~reinterpret_cast<std::uintptr_t>(p)) {
+      return ReturnError(
+          terminator, StatBadPointerDeallocation, errMsg, hasStat);
+    }
   }
+#endif
   return ReturnError(terminator,
       pointer.Destroy(/*finalize=*/true, /*destroyPointers=*/true, &terminator),
       errMsg, hasStat);
diff --git a/flang/runtime/random-templates.h b/flang/runtime/random-templates.h
index ce64a94901a28..f34422f6f5d9a 100644
--- a/flang/runtime/random-templates.h
+++ b/flang/runtime/random-templates.h
@@ -11,6 +11,7 @@
 
 #include "lock.h"
 #include "numeric-templates.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <random>
@@ -32,7 +33,7 @@ static constexpr int rangeBits{
 
 extern Lock lock;
 extern Generator generator;
-extern std::optional<GeneratedWord> nextValue;
+extern Fortran::common::optional<GeneratedWord> nextValue;
 
 // Call only with lock held
 static GeneratedWord GetNextValue() {
diff --git a/flang/runtime/random.cpp b/flang/runtime/random.cpp
index 13bed1f0abe10..e0a421fd28396 100644
--- a/flang/runtime/random.cpp
+++ b/flang/runtime/random.cpp
@@ -28,7 +28,7 @@ namespace Fortran::runtime::random {
 
 Lock lock;
 Generator generator;
-std::optional<GeneratedWord> nextValue;
+Fortran::common::optional<GeneratedWord> nextValue;
 
 extern "C" {
 
diff --git a/flang/runtime/tools.cpp b/flang/runtime/tools.cpp
index e653323ed1de0..71022c7a8c179 100644
--- a/flang/runtime/tools.cpp
+++ b/flang/runtime/tools.cpp
@@ -175,7 +175,7 @@ RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from) {
 
 RT_API_ATTRS char *EnsureNullTerminated(
     char *str, std::size_t length, Terminator &terminator) {
-  if (std::memchr(str, '\0', length) == nullptr) {
+  if (runtime::memchr(str, '\0', length) == nullptr) {
     char *newCmd{(char *)AllocateMemoryOrCrash(terminator, length + 1)};
     std::memcpy(newCmd, str, length);
     newCmd[length] = '\0';
diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h
index c1f89cadca06e..df25eb8882335 100644
--- a/flang/runtime/tools.h
+++ b/flang/runtime/tools.h
@@ -12,6 +12,7 @@
 #include "freestanding-tools.h"
 #include "stat.h"
 #include "terminator.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/memory.h"
@@ -95,7 +96,7 @@ static inline RT_API_ATTRS std::int64_t GetInt64(
   }
 }
 
-static inline RT_API_ATTRS std::optional<std::int64_t> GetInt64Safe(
+static inline RT_API_ATTRS Fortran::common::optional<std::int64_t> GetInt64Safe(
     const char *p, std::size_t bytes, Terminator &terminator) {
   switch (bytes) {
   case 1:
@@ -113,7 +114,7 @@ static inline RT_API_ATTRS std::optional<std::int64_t> GetInt64Safe(
     if (static_cast<Int128>(result) == n) {
       return result;
     }
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
   default:
     terminator.Crash("GetInt64Safe: no case for %zd bytes", bytes);
@@ -334,7 +335,8 @@ inline RT_API_ATTRS RESULT ApplyLogicalKind(
 }
 
 // Calculate result type of (X op Y) for *, //, DOT_PRODUCT, &c.
-std::optional<std::pair<TypeCategory, int>> inline constexpr RT_API_ATTRS
+Fortran::common::optional<
+    std::pair<TypeCategory, int>> inline constexpr RT_API_ATTRS
 GetResultType(TypeCategory xCat, int xKind, TypeCategory yCat, int yKind) {
   int maxKind{std::max(xKind, yKind)};
   switch (xCat) {
@@ -390,18 +392,18 @@ GetResultType(TypeCategory xCat, int xKind, TypeCategory yCat, int yKind) {
     if (yCat == TypeCategory::Character) {
       return std::make_pair(TypeCategory::Character, maxKind);
     } else {
-      return std::nullopt;
+      return Fortran::common::nullopt;
     }
   case TypeCategory::Logical:
     if (yCat == TypeCategory::Logical) {
       return std::make_pair(TypeCategory::Logical, maxKind);
     } else {
-      return std::nullopt;
+      return Fortran::common::nullopt;
     }
   default:
     break;
   }
-  return std::nullopt;
+  return Fortran::common::nullopt;
 }
 
 // Accumulate floating-point results in (at least) double precision
@@ -428,7 +430,7 @@ template <>
 inline RT_API_ATTRS const char *FindCharacter(
     const char *data, char ch, std::size_t chars) {
   return reinterpret_cast<const char *>(
-      std::memchr(data, static_cast<int>(ch), chars));
+      runtime::memchr(data, static_cast<int>(ch), chars));
 }
 
 // Copy payload data from one allocated descriptor to another.
diff --git a/flang/runtime/type-code.cpp b/flang/runtime/type-code.cpp
index b9ef307835dfb..cb1b944433aae 100644
--- a/flang/runtime/type-code.cpp
+++ b/flang/runtime/type-code.cpp
@@ -112,7 +112,7 @@ RT_API_ATTRS TypeCode::TypeCode(TypeCategory f, int kind) {
   }
 }
 
-RT_API_ATTRS std::optional<std::pair<TypeCategory, int>>
+RT_API_ATTRS Fortran::common::optional<std::pair<TypeCategory, int>>
 TypeCode::GetCategoryAndKind() const {
   switch (raw_) {
   case CFI_type_signed_char:
@@ -204,7 +204,7 @@ TypeCode::GetCategoryAndKind() const {
   case CFI_type_char32_t:
     return std::make_pair(TypeCategory::Character, 4);
   default:
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 }
 
diff --git a/flang/runtime/type-info.cpp b/flang/runtime/type-info.cpp
index b30a2c832a138..cb18c5669b5ff 100644
--- a/flang/runtime/type-info.cpp
+++ b/flang/runtime/type-info.cpp
@@ -15,7 +15,7 @@ namespace Fortran::runtime::typeInfo {
 
 RT_OFFLOAD_API_GROUP_BEGIN
 
-RT_API_ATTRS std::optional<TypeParameterValue> Value::GetValue(
+RT_API_ATTRS Fortran::common::optional<TypeParameterValue> Value::GetValue(
     const Descriptor *descriptor) const {
   switch (genre_) {
   case Genre::Explicit:
@@ -26,9 +26,9 @@ RT_API_ATTRS std::optional<TypeParameterValue> Value::GetValue(
         return addendum->LenParameterValue(value_);
       }
     }
-    return std::nullopt;
+    return Fortran::common::nullopt;
   default:
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 }
 
diff --git a/flang/runtime/type-info.h b/flang/runtime/type-info.h
index bd8112d9d6d8c..c3f3595e32ef2 100644
--- a/flang/runtime/type-info.h
+++ b/flang/runtime/type-info.h
@@ -15,10 +15,10 @@
 #include "terminator.h"
 #include "flang/Common/Fortran.h"
 #include "flang/Common/bit-population-count.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/descriptor.h"
 #include <cinttypes>
 #include <memory>
-#include <optional>
 
 namespace Fortran::runtime::typeInfo {
 
@@ -39,7 +39,7 @@ class Value {
     LenParameter = 3
   };
   RT_API_ATTRS Genre genre() const { return genre_; }
-  RT_API_ATTRS std::optional<TypeParameterValue> GetValue(
+  RT_API_ATTRS Fortran::common::optional<TypeParameterValue> GetValue(
       const Descriptor *) const;
 
 private:
@@ -58,7 +58,7 @@ class Component {
     Automatic = 4
   };
 
-  const RT_API_ATTRS Descriptor &name() const { return name_.descriptor(); }
+  RT_API_ATTRS const Descriptor &name() const { return name_.descriptor(); }
   RT_API_ATTRS Genre genre() const { return genre_; }
   RT_API_ATTRS TypeCategory category() const {
     return static_cast<TypeCategory>(category_);
@@ -66,17 +66,17 @@ class Component {
   RT_API_ATTRS int kind() const { return kind_; }
   RT_API_ATTRS int rank() const { return rank_; }
   RT_API_ATTRS std::uint64_t offset() const { return offset_; }
-  const RT_API_ATTRS Value &characterLen() const { return characterLen_; }
-  const RT_API_ATTRS DerivedType *derivedType() const {
+  RT_API_ATTRS const Value &characterLen() const { return characterLen_; }
+  RT_API_ATTRS const DerivedType *derivedType() const {
     return derivedType_.descriptor().OffsetElement<const DerivedType>();
   }
-  const RT_API_ATTRS Value *lenValue() const {
+  RT_API_ATTRS const Value *lenValue() const {
     return lenValue_.descriptor().OffsetElement<const Value>();
   }
-  const RT_API_ATTRS Value *bounds() const {
+  RT_API_ATTRS const Value *bounds() const {
     return bounds_.descriptor().OffsetElement<const Value>();
   }
-  const RT_API_ATTRS char *initialization() const { return initialization_; }
+  RT_API_ATTRS const char *initialization() const { return initialization_; }
 
   RT_API_ATTRS std::size_t GetElementByteSize(const Descriptor &) const;
   RT_API_ATTRS std::size_t GetElements(const Descriptor &) const;
@@ -205,27 +205,27 @@ class DerivedType {
 public:
   ~DerivedType(); // never defined
 
-  const RT_API_ATTRS Descriptor &binding() const {
+  RT_API_ATTRS const Descriptor &binding() const {
     return binding_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &name() const { return name_.descriptor(); }
+  RT_API_ATTRS const Descriptor &name() const { return name_.descriptor(); }
   RT_API_ATTRS std::uint64_t sizeInBytes() const { return sizeInBytes_; }
-  const RT_API_ATTRS Descriptor &uninstatiated() const {
+  RT_API_ATTRS const Descriptor &uninstatiated() const {
     return uninstantiated_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &kindParameter() const {
+  RT_API_ATTRS const Descriptor &kindParameter() const {
     return kindParameter_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &lenParameterKind() const {
+  RT_API_ATTRS const Descriptor &lenParameterKind() const {
     return lenParameterKind_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &component() const {
+  RT_API_ATTRS const Descriptor &component() const {
     return component_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &procPtr() const {
+  RT_API_ATTRS const Descriptor &procPtr() const {
     return procPtr_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &special() const {
+  RT_API_ATTRS const Descriptor &special() const {
     return special_.descriptor();
   }
   RT_API_ATTRS bool hasParent() const { return hasParent_; }
@@ -241,14 +241,14 @@ class DerivedType {
     return lenParameterKind().Elements();
   }
 
-  const RT_API_ATTRS DerivedType *GetParentType() const;
+  RT_API_ATTRS const DerivedType *GetParentType() const;
 
   // Finds a data component by name in this derived type or its ancestors.
-  const RT_API_ATTRS Component *FindDataComponent(
+  RT_API_ATTRS const Component *FindDataComponent(
       const char *name, std::size_t nameLen) const;
 
   // O(1) look-up of special procedure bindings
-  const RT_API_ATTRS SpecialBinding *FindSpecialBinding(
+  RT_API_ATTRS const SpecialBinding *FindSpecialBinding(
       SpecialBinding::Which which) const {
     auto bitIndex{static_cast<std::uint32_t>(which)};
     auto bit{std::uint32_t{1} << bitIndex};
diff --git a/flang/runtime/unit-map.cpp b/flang/runtime/unit-map.cpp
index 6ebda5c45f087..684a9b9e20b97 100644
--- a/flang/runtime/unit-map.cpp
+++ b/flang/runtime/unit-map.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "unit-map.h"
+#include "flang/Common/optional.h"
 
 namespace Fortran::runtime::io {
 
@@ -29,7 +30,7 @@ void UnitMap::Initialize() {
 ExternalFileUnit &UnitMap::NewUnit(const Terminator &terminator) {
   CriticalSection critical{lock_};
   Initialize();
-  std::optional<int> n{freeNewUnits_.PopValue()};
+  Fortran::common::optional<int> n{freeNewUnits_.PopValue()};
   if (!n) {
     n = emergencyNewUnit_++;
   }
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 09782d2f84927..82f0e68cc20a2 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -52,7 +52,7 @@ ExternalFileUnit *ExternalFileUnit::LookUpOrCreate(
 }
 
 ExternalFileUnit *ExternalFileUnit::LookUpOrCreateAnonymous(int unit,
-    Direction dir, std::optional<bool> isUnformatted,
+    Direction dir, Fortran::common::optional<bool> isUnformatted,
     const Terminator &terminator) {
   // Make sure that the returned anonymous unit has been opened
   // not just created in the unitMap.
@@ -95,9 +95,10 @@ ExternalFileUnit &ExternalFileUnit::NewUnit(
   return unit;
 }
 
-bool ExternalFileUnit::OpenUnit(std::optional<OpenStatus> status,
-    std::optional<Action> action, Position position, OwningPtr<char> &&newPath,
-    std::size_t newPathLength, Convert convert, IoErrorHandler &handler) {
+bool ExternalFileUnit::OpenUnit(Fortran::common::optional<OpenStatus> status,
+    Fortran::common::optional<Action> action, Position position,
+    OwningPtr<char> &&newPath, std::size_t newPathLength, Convert convert,
+    IoErrorHandler &handler) {
   if (convert == Convert::Unknown) {
     convert = executionEnvironment.conversion;
   }
@@ -176,9 +177,10 @@ bool ExternalFileUnit::OpenUnit(std::optional<OpenStatus> status,
   return impliedClose;
 }
 
-void ExternalFileUnit::OpenAnonymousUnit(std::optional<OpenStatus> status,
-    std::optional<Action> action, Position position, Convert convert,
-    IoErrorHandler &handler) {
+void ExternalFileUnit::OpenAnonymousUnit(
+    Fortran::common::optional<OpenStatus> status,
+    Fortran::common::optional<Action> action, Position position,
+    Convert convert, IoErrorHandler &handler) {
   // I/O to an unconnected unit reads/creates a local file, e.g. fort.7
   std::size_t pathMaxLen{32};
   auto path{SizedNew<char>{handler}(pathMaxLen)};
diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h
index e3c8757645bb9..fc5bead7e1d93 100644
--- a/flang/runtime/unit.h
+++ b/flang/runtime/unit.h
@@ -21,10 +21,10 @@
 #include "lock.h"
 #include "terminator.h"
 #include "flang/Common/constexpr-bitset.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/memory.h"
 #include <cstdlib>
 #include <cstring>
-#include <optional>
 #include <variant>
 
 namespace Fortran::runtime::io {
@@ -55,7 +55,7 @@ class ExternalFileUnit : public ConnectionState,
   static ExternalFileUnit *LookUpOrCreate(
       int unit, const Terminator &, bool &wasExtant);
   static ExternalFileUnit *LookUpOrCreateAnonymous(int unit, Direction,
-      std::optional<bool> isUnformatted, const Terminator &);
+      Fortran::common::optional<bool> isUnformatted, const Terminator &);
   static ExternalFileUnit *LookUp(const char *path, std::size_t pathLen);
   static ExternalFileUnit &CreateNew(int unit, const Terminator &);
   static ExternalFileUnit *LookUpForClose(int unit);
@@ -64,11 +64,11 @@ class ExternalFileUnit : public ConnectionState,
   static void FlushAll(IoErrorHandler &);
 
   // Returns true if an existing unit was closed
-  bool OpenUnit(std::optional<OpenStatus>, std::optional<Action>, Position,
-      OwningPtr<char> &&path, std::size_t pathLength, Convert,
-      IoErrorHandler &);
-  void OpenAnonymousUnit(std::optional<OpenStatus>, std::optional<Action>,
-      Position, Convert, IoErrorHandler &);
+  bool OpenUnit(Fortran::common::optional<OpenStatus>,
+      Fortran::common::optional<Action>, Position, OwningPtr<char> &&path,
+      std::size_t pathLength, Convert, IoErrorHandler &);
+  void OpenAnonymousUnit(Fortran::common::optional<OpenStatus>,
+      Fortran::common::optional<Action>, Position, Convert, IoErrorHandler &);
   void CloseUnit(CloseStatus, IoErrorHandler &);
   void DestroyClosed();
 
@@ -169,7 +169,7 @@ class ExternalFileUnit : public ConnectionState,
       u_;
 
   // Points to the active alternative (if any) in u_ for use as a Cookie
-  std::optional<IoStatementState> io_;
+  Fortran::common::optional<IoStatementState> io_;
 
   // A stack of child I/O pseudo-units for defined I/O that have this
   // unit number.
@@ -211,7 +211,7 @@ class ChildIo {
       ChildUnformattedIoStatementState<Direction::Input>, InquireUnitState,
       ErroneousIoStatementState, ExternalMiscIoStatementState>
       u_;
-  std::optional<IoStatementState> io_;
+  Fortran::common::optional<IoStatementState> io_;
 };
 
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index 8f59ddbb19663..e9ccc2c04b6b0 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -40,7 +40,7 @@ const std::uint8_t UTF8FirstByteTable[256]{
 // clang-format on
 
 // Non-minimal encodings are accepted.
-std::optional<char32_t> DecodeUTF8(const char *p0) {
+Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
   const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
   std::size_t bytes{MeasureUTF8Bytes(*p0)};
   if (bytes == 1) {
@@ -50,7 +50,7 @@ std::optional<char32_t> DecodeUTF8(const char *p0) {
     for (std::size_t j{1}; j < bytes; ++j) {
       std::uint8_t next{p[j]};
       if (next < 0x80 || next > 0xbf) {
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
       result = (result << 6) | (next & 0x3f);
     }
@@ -58,7 +58,7 @@ std::optional<char32_t> DecodeUTF8(const char *p0) {
       return static_cast<char32_t>(result);
     }
   }
-  return std::nullopt;
+  return Fortran::common::nullopt;
 }
 
 std::size_t EncodeUTF8(char *p0, char32_t ucs) {
diff --git a/flang/runtime/utf.h b/flang/runtime/utf.h
index 6d9943bb6b8a2..2b4e4f9a18875 100644
--- a/flang/runtime/utf.h
+++ b/flang/runtime/utf.h
@@ -41,9 +41,9 @@
 #ifndef FORTRAN_RUNTIME_UTF_H_
 #define FORTRAN_RUNTIME_UTF_H_
 
+#include "flang/Common/optional.h"
 #include <cstddef>
 #include <cstdint>
-#include <optional>
 
 namespace Fortran::runtime {
 
@@ -58,7 +58,7 @@ static constexpr std::size_t maxUTF8Bytes{7};
 
 // Ensure that all bytes are present in sequence in the input buffer
 // before calling; use MeasureUTF8Bytes(first byte) to count them.
-std::optional<char32_t> DecodeUTF8(const char *);
+Fortran::common::optional<char32_t> DecodeUTF8(const char *);
 
 // Ensure that at least maxUTF8Bytes remain in the output
 // buffer before calling.
diff --git a/flang/test/Driver/cuda-option.f90 b/flang/test/Driver/cuda-option.f90
index 112e1cb6c77f8..562f8683b0ff7 100644
--- a/flang/test/Driver/cuda-option.f90
+++ b/flang/test/Driver/cuda-option.f90
@@ -1,6 +1,6 @@
 ! Test -fcuda option
-! RUN: %flang -fc1 -cpp -x cuda -fdebug-unparse %s -o - | FileCheck %s
-! RUN: not %flang -fc1 -cpp %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
+! RUN: %flang_fc1 -cpp -x cuda -fdebug-unparse %s -o - | FileCheck %s
+! RUN: not %flang_fc1 -cpp %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 program main
 #if _CUDA
   integer :: var = _CUDA
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
index 44dbac44772b2..bf3660d57cbb4 100644
--- a/flang/test/Driver/driver-help-hidden.f90
+++ b/flang/test/Driver/driver-help-hidden.f90
@@ -52,8 +52,6 @@
 ! CHECK-NEXT:                         Do not use HLFIR lowering (deprecated)
 ! CHECK-NEXT: -flang-experimental-hlfir
 ! CHECK-NEXT:                         Use HLFIR lowering (experimental)
-! CHECK-NEXT: -flang-experimental-polymorphism
-! CHECK-NEXT:                         Enable Fortran 2003 polymorphism (experimental)
 ! CHECK-NEXT: -flarge-sizes           Use INTEGER(KIND=8) for the result type in size-related intrinsics
 ! CHECK-NEXT: -flogical-abbreviations Enable logical abbreviations
 ! CHECK-NEXT: -flto=auto              Enable LTO in 'full' mode
diff --git a/flang/test/Driver/flang-experimental-polymorphism-flag.f90 b/flang/test/Driver/flang-experimental-polymorphism-flag.f90
deleted file mode 100644
index 095c1cc929e67..0000000000000
--- a/flang/test/Driver/flang-experimental-polymorphism-flag.f90
+++ /dev/null
@@ -1,10 +0,0 @@
-! Test -flang-experimental-hlfir flag
-! RUN: %flang_fc1 -flang-experimental-polymorphism -emit-fir -o - %s | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -o - %s 2>&1 | FileCheck %s --check-prefix NO-POLYMORPHISM
-
-! CHECK: func.func @_QPtest(%{{.*}}: !fir.class<none> {fir.bindc_name = "poly"})
-subroutine test(poly)
-  class(*) :: poly
-end subroutine test
-
-! NO-POLYMORPHISM: func.func @_QPtest
diff --git a/flang/test/Driver/frontend-forwarding.f90 b/flang/test/Driver/frontend-forwarding.f90
index 8e9c9b78c3c10..eac9773ce25c7 100644
--- a/flang/test/Driver/frontend-forwarding.f90
+++ b/flang/test/Driver/frontend-forwarding.f90
@@ -17,7 +17,6 @@
 ! RUN:     -fomit-frame-pointer \
 ! RUN:     -fpass-plugin=Bye%pluginext \
 ! RUN:     -fversion-loops-for-stride \
-! RUN:     -flang-experimental-polymorphism \
 ! RUN:     -flang-experimental-hlfir \
 ! RUN:     -flang-deprecated-no-hlfir \
 ! RUN:     -fno-ppc-native-vector-element-order \
@@ -49,7 +48,6 @@
 ! CHECK: "-fconvert=little-endian"
 ! CHECK: "-fpass-plugin=Bye
 ! CHECK: "-fversion-loops-for-stride"
-! CHECK: "-flang-experimental-polymorphism"
 ! CHECK: "-flang-experimental-hlfir"
 ! CHECK: "-flang-deprecated-no-hlfir"
 ! CHECK: "-fno-ppc-native-vector-element-order"
diff --git a/flang/test/Evaluate/fold-scale.f90 b/flang/test/Evaluate/fold-scale.f90
index a5c5f081668b1..6d767a9f4c9a3 100644
--- a/flang/test/Evaluate/fold-scale.f90
+++ b/flang/test/Evaluate/fold-scale.f90
@@ -7,5 +7,6 @@ module m
   logical, parameter :: test_4 = sign(1.0, scale(0.0, 0)) == 1.0
   logical, parameter :: test_5 = scale(1.0, -1) == 0.5
   logical, parameter :: test_6 = scale(2.0, -1) == 1.0
+  logical, parameter :: test_7 = scale(huge(0.d0), -1200) == 1.0440487148797638d-53
+  logical, parameter :: test_8 = scale(tiny(0.d0), 1200) == 3.8312388521647221d053
 end module
-
diff --git a/flang/test/Fir/dispatch.f90 b/flang/test/Fir/dispatch.f90
index 1dc71038813d8..1479d611b986a 100644
--- a/flang/test/Fir/dispatch.f90
+++ b/flang/test/Fir/dispatch.f90
@@ -1,5 +1,5 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | fir-opt --fir-polymorphic-op | FileCheck %s
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s --check-prefix=BT
+! RUN: bbc -emit-hlfir %s -o - | fir-opt --fir-polymorphic-op | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefix=BT
 
 ! Tests codegen of fir.dispatch operation. This test is intentionally run from
 ! Fortran through bbc and tco so we have all the binding tables lowered to FIR
diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir
new file mode 100644
index 0000000000000..24bde536667b5
--- /dev/null
+++ b/flang/test/Fir/omp-reduction-embox-codegen.fir
@@ -0,0 +1,36 @@
+// RUN: tco %s | FileCheck %s
+
+// the fir.embox in the init region is turned into an alloca for the box. Test
+// that CodeGen.cpp knows where to place an alloca when it is inside of an
+// omp.reduction.declare
+
+// regretably this has to be nonsense IR because we need the subsequent patches
+// to process anything useful
+
+omp.reduction.declare @test_reduction : !fir.ref<!fir.box<i32>> init {
+^bb0(%arg0: !fir.ref<!fir.box<i32>>):
+  %0 = fir.alloca !fir.box<i32>
+  %1 = fir.alloca i32
+  %2 = fir.embox %1 : (!fir.ref<i32>) -> !fir.box<i32>
+
+  // use the embox for something so it isn't removed
+  fir.store %2 to %0 : !fir.ref<!fir.box<i32>>
+
+  omp.yield(%0 : !fir.ref<!fir.box<i32>>)
+} combiner {
+^bb0(%arg0: !fir.ref<!fir.box<i32>>, %arg1: !fir.ref<!fir.box<i32>>):
+  %0 = fir.undefined !fir.ref<!fir.box<i32>>
+  omp.yield(%0 : !fir.ref<!fir.box<i32>>)
+}
+
+func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+  %4 = fir.alloca !fir.box<i32>
+  omp.parallel byref reduction(@test_reduction %4 -> %arg0 : !fir.ref<!fir.box<i32>>) {
+    omp.terminator
+  }
+  return
+}
+
+// basically we are testing that there isn't a crash
+// CHECK-LABEL: define void @_QQmain
+// CHECK-NEXT:    alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
diff --git a/flang/test/HLFIR/assumed-type-actual-args.f90 b/flang/test/HLFIR/assumed-type-actual-args.f90
index 58c282b6ab188..dbdfc1785ce9d 100644
--- a/flang/test/HLFIR/assumed-type-actual-args.f90
+++ b/flang/test/HLFIR/assumed-type-actual-args.f90
@@ -1,6 +1,6 @@
 ! Test lowering to FIR of actual arguments that are assumed type
 ! variables (Fortran 2018 7.3.2.2 point 3).
-! RUN: bbc --polymorphic-type -emit-hlfir -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine test1(x)
   interface
diff --git a/flang/test/HLFIR/boxchar_emboxing.f90 b/flang/test/HLFIR/boxchar_emboxing.f90
index 3e9ccde6babab..fbc41bbea72d8 100644
--- a/flang/test/HLFIR/boxchar_emboxing.f90
+++ b/flang/test/HLFIR/boxchar_emboxing.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
 
 ! CHECK-LABEL:   func.func @_QPtest1(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.class<none> {fir.bindc_name = "x"}) {
diff --git a/flang/test/HLFIR/call_with_poly_dummy.f90 b/flang/test/HLFIR/call_with_poly_dummy.f90
index af6876e26603e..00a795c5b1fbe 100644
--- a/flang/test/HLFIR/call_with_poly_dummy.f90
+++ b/flang/test/HLFIR/call_with_poly_dummy.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
 
 ! Test passing arguments to subprograms with polymorphic dummy arguments.
 
diff --git a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
index c1e89d1978e4c..f4327b3261751 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
@@ -18,15 +18,17 @@ contains
 ! CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QMtest_callFhostEa"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 
     call dev_kernel0<<<10, 20>>>()
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}>>>()
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}>>>()
 
-    call dev_kernel0<<< __builtin_dim3(1,1), __builtin_dim3(32,1,1) >>>
+    call dev_kernel0<<< __builtin_dim3(1,1,4), __builtin_dim3(32,1,1) >>>
 ! CHECK: %[[ADDR_DIM3_GRID:.*]] = fir.address_of(@_QQro._QM__fortran_builtinsT__builtin_dim3.{{.*}}) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
 ! CHECK: %[[DIM3_GRID:.*]]:2 = hlfir.declare %[[ADDR_DIM3_GRID]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QM__fortran_builtinsT__builtin_dim3.0"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
 ! CHECK: %[[GRID_X:.*]] = hlfir.designate %[[DIM3_GRID]]#1{"x"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
 ! CHECK: %[[GRID_X_LOAD:.*]] = fir.load %[[GRID_X]] : !fir.ref<i32>
 ! CHECK: %[[GRID_Y:.*]] = hlfir.designate %[[DIM3_GRID]]#1{"y"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
 ! CHECK: %[[GRID_Y_LOAD:.*]] = fir.load %[[GRID_Y]] : !fir.ref<i32>
+! CHECK: %[[GRID_Z:.*]] = hlfir.designate %[[DIM3_GRID]]#1{"z"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: %[[GRID_Z_LOAD:.*]] = fir.load %[[GRID_Z]] : !fir.ref<i32>
 ! CHECK: %[[ADDR_DIM3_BLOCK:.*]] = fir.address_of(@_QQro._QM__fortran_builtinsT__builtin_dim3.{{.*}}) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
 ! CHECK: %[[DIM3_BLOCK:.*]]:2 = hlfir.declare %[[ADDR_DIM3_BLOCK]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QM__fortran_builtinsT__builtin_dim3.1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
 ! CHECK: %[[BLOCK_X:.*]] = hlfir.designate %[[DIM3_BLOCK]]#1{"x"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
@@ -35,16 +37,16 @@ contains
 ! CHECK: %[[BLOCK_Y_LOAD:.*]] = fir.load %[[BLOCK_Y]] : !fir.ref<i32>
 ! CHECK: %[[BLOCK_Z:.*]] = hlfir.designate %[[DIM3_BLOCK]]#1{"z"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
 ! CHECK: %[[BLOCK_Z_LOAD:.*]] = fir.load %[[BLOCK_Z]] : !fir.ref<i32>
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%[[GRID_X_LOAD]], %[[GRID_Y_LOAD]], %[[BLOCK_X_LOAD]], %[[BLOCK_Y_LOAD]], %[[BLOCK_Z_LOAD]]>>>()
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%[[GRID_X_LOAD]], %[[GRID_Y_LOAD]], %[[GRID_Z_LOAD]], %[[BLOCK_X_LOAD]], %[[BLOCK_Y_LOAD]], %[[BLOCK_Z_LOAD]]>>>()
 
     call dev_kernel0<<<10, 20, 2>>>()
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}>>>() 
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}>>>() 
 
     call dev_kernel0<<<10, 20, 2, 0>>>()
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>()
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>()
 
     call dev_kernel1<<<1, 32>>>(a)
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%1#1 : !fir.ref<f32>)
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%1#1) : (!fir.ref<f32>)
   end
 
 end
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index db628fe756b95..6179e609db383 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -42,10 +42,20 @@ subroutine sub1()
 ! CHECK: fir.cuda_kernel<<<%c1{{.*}}, (%c256{{.*}}, %c1{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index) step (%{{.*}}, %{{.*}} : index, index)
 ! CHECK: {n = 2 : i64}
 
-! TODO: currently these trigger error in the parser
-! !$cuf kernel do(2) <<< (1,*), (256,1) >>>
-! !$cuf kernel do(2) <<< (*,*), (32,4) >>>
-end
-
+  !$cuf kernel do(2) <<< (1,*), (256,1) >>>
+  do i = 1, n
+    do j = 1, n
+      c(i,j) = c(i,j) * d(i,j)
+    end do
+  end do
+! CHECK: fir.cuda_kernel<<<(%c1{{.*}}, %c0{{.*}}), (%c256{{.*}}, %c1{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index)  step (%{{.*}}, %{{.*}} : index, index)
 
+!$cuf kernel do(2) <<< (*,*), (32,4) >>>
+  do i = 1, n
+    do j = 1, n
+      c(i,j) = c(i,j) * d(i,j)
+    end do
+  end do
 
+! CHECK: fir.cuda_kernel<<<*, (%c32{{.*}}, %c4{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index)  step (%{{.*}}, %{{.*}} : index, index)
+end
diff --git a/flang/test/Lower/CUDA/cuda-module-use.cuf b/flang/test/Lower/CUDA/cuda-module-use.cuf
index f54083b026ee8..47d3805065a5a 100644
--- a/flang/test/Lower/CUDA/cuda-module-use.cuf
+++ b/flang/test/Lower/CUDA/cuda-module-use.cuf
@@ -5,7 +5,7 @@
 
 subroutine sub1()
   use cuf_mod
-  md = 1.0
+!  md = 1.0 ! currently a TODO
 end
 
 ! CHECK-LABEL: func.func @_QPsub1()
diff --git a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90
index 76ed237fb6720..129aa49b811db 100644
--- a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90
+++ b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90
@@ -1,5 +1,5 @@
 ! Test actual TARGET argument association to dummy POINTER:
-! RUN: bbc -emit-hlfir --polymorphic-type -o - -I nowhere %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nowhere %s 2>&1 | FileCheck %s
 
 module target_to_pointer_types
   type t1
diff --git a/flang/test/Lower/HLFIR/allocatable-return.f90 b/flang/test/Lower/HLFIR/allocatable-return.f90
index d652b2f9f767d..e1bac34ef1a0a 100644
--- a/flang/test/Lower/HLFIR/allocatable-return.f90
+++ b/flang/test/Lower/HLFIR/allocatable-return.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir --polymorphic-type -I nowhere %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere %s -o - | FileCheck %s
 
 ! Test allocatable return.
 ! Allocatable arrays must have default runtime lbounds after the return.
diff --git a/flang/test/Lower/HLFIR/array-ctor-derived.f90 b/flang/test/Lower/HLFIR/array-ctor-derived.f90
index 21a3eb78a1995..111225462a4bb 100644
--- a/flang/test/Lower/HLFIR/array-ctor-derived.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-derived.f90
@@ -1,5 +1,5 @@
 ! Test lowering of derived type array constructors to HLFIR.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 module types
   type simple
diff --git a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90
new file mode 100644
index 0000000000000..7a2ea5cc14b64
--- /dev/null
+++ b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90
@@ -0,0 +1,309 @@
+! Test lowering of sequence associated arguments (F'2023 15.5.2.12) passed
+! by descriptor. The descriptor on the caller side is prepared according to
+! the dummy argument shape.
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+module bindc_seq_assoc
+  interface
+    subroutine takes_char(x, n) bind(c)
+      integer :: n
+      character(*) :: x(n)
+    end subroutine
+    subroutine takes_char_assumed_size(x) bind(c)
+      character(*) :: x(10, *)
+    end subroutine
+    subroutine takes_optional_char(x, n) bind(c)
+      integer :: n
+      character(*), optional :: x(n)
+    end subroutine
+  end interface
+contains
+  subroutine test_char_1(x)
+    character(*) :: x(10, 20)
+    call takes_char(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_char_1(
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 {uniq_name = "_QMbindc_seq_assocFtest_char_1Ex"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_9:.*]] = fir.shift %[[VAL_8]], %[[VAL_8]] : (index, index) -> !fir.shift<2>
+! CHECK:           %[[VAL_10:.*]] = fir.rebox %[[VAL_6]]#0(%[[VAL_9]]) : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.shift<2>) -> !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:           %[[VAL_11:.*]]:3 = hlfir.associate %[[VAL_7]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_12:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]]#1 {uniq_name = "_QMbindc_seq_assocFtakes_charEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_14:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i32) -> i64
+! CHECK:           %[[VAL_16:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_17:.*]] = arith.subi %[[VAL_15]], %[[VAL_16]] : i64
+! CHECK:           %[[VAL_18:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_19:.*]] = arith.addi %[[VAL_17]], %[[VAL_18]] : i64
+! CHECK:           %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i64) -> index
+! CHECK:           %[[VAL_21:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_22:.*]] = arith.cmpi sgt, %[[VAL_20]], %[[VAL_21]] : index
+! CHECK:           %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_20]], %[[VAL_21]] : index
+! CHECK:           %[[VAL_24:.*]] = fir.shape_shift %[[VAL_12]], %[[VAL_23]] : (index, index) -> !fir.shapeshift<1>
+! CHECK:           %[[VAL_25:.*]] = fir.box_elesize %[[VAL_10]] : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_26:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> !fir.ref<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:           %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_28:.*]] = fir.embox %[[VAL_27]](%[[VAL_24]]) typeparams %[[VAL_25]] : (!fir.ref<!fir.array<?x!fir.char<1,?>>>, !fir.shapeshift<1>, index) -> !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           fir.call @takes_char(%[[VAL_28]], %[[VAL_11]]#1) fastmath<contract> : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_11]]#1, %[[VAL_11]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_char_copy_in_copy_out(x)
+    character(*) :: x(:, :)
+    call takes_char(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_char_copy_in_copy_out(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMbindc_seq_assocFtest_char_copy_in_copy_outEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_2:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_5:.*]] = fir.shift %[[VAL_4]], %[[VAL_4]] : (index, index) -> !fir.shift<2>
+! CHECK:           %[[VAL_6:.*]] = fir.rebox %[[VAL_3]]#0(%[[VAL_5]]) : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.shift<2>) -> !fir.box<!fir.array<?x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_7:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_7]]#1 {uniq_name = "_QMbindc_seq_assocFtakes_charEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_10:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> i64
+! CHECK:           %[[VAL_12:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_13:.*]] = arith.subi %[[VAL_11]], %[[VAL_12]] : i64
+! CHECK:           %[[VAL_14:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i64
+! CHECK:           %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i64) -> index
+! CHECK:           %[[VAL_17:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_18:.*]] = arith.cmpi sgt, %[[VAL_16]], %[[VAL_17]] : index
+! CHECK:           %[[VAL_19:.*]] = arith.select %[[VAL_18]], %[[VAL_16]], %[[VAL_17]] : index
+! CHECK:           %[[VAL_20:.*]] = fir.shape_shift %[[VAL_8]], %[[VAL_19]] : (index, index) -> !fir.shapeshift<1>
+! CHECK:           %[[VAL_21:.*]] = fir.box_elesize %[[VAL_6]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_22:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (!fir.ref<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_24:.*]] = fir.embox %[[VAL_23]](%[[VAL_20]]) typeparams %[[VAL_21]] : (!fir.ref<!fir.array<?x!fir.char<1,?>>>, !fir.shapeshift<1>, index) -> !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           fir.call @takes_char(%[[VAL_24]], %[[VAL_7]]#1) fastmath<contract> : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.copy_out %[[VAL_3]]#0, %[[VAL_3]]#1 to %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_7]]#1, %[[VAL_7]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_char_assumed_size(x)
+    character(*) :: x(:, :)
+    call takes_char_assumed_size(x)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_char_assumed_size(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMbindc_seq_assocFtest_char_assumed_sizeEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shift %[[VAL_3]], %[[VAL_3]] : (index, index) -> !fir.shift<2>
+! CHECK:           %[[VAL_5:.*]] = fir.rebox %[[VAL_2]]#0(%[[VAL_4]]) : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.shift<2>) -> !fir.box<!fir.array<?x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_7:.*]] = arith.constant 10 : i64
+! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_8]] : i64
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i64
+! CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i64) -> index
+! CHECK:           %[[VAL_13:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_13]] : index
+! CHECK:           %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_12]], %[[VAL_13]] : index
+! CHECK:           %[[VAL_16:.*]] = arith.constant -1 : i64
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i64) -> index
+! CHECK:           %[[VAL_18:.*]] = fir.shape_shift %[[VAL_6]], %[[VAL_15]], %[[VAL_6]], %[[VAL_17]] : (index, index, index, index) -> !fir.shapeshift<2>
+! CHECK:           %[[VAL_19:.*]] = fir.box_elesize %[[VAL_5]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_20:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.array<10x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_22:.*]] = fir.embox %[[VAL_21]](%[[VAL_18]]) typeparams %[[VAL_19]] : (!fir.ref<!fir.array<10x?x!fir.char<1,?>>>, !fir.shapeshift<2>, index) -> !fir.box<!fir.array<10x?x!fir.char<1,?>>>
+! CHECK:           fir.call @takes_char_assumed_size(%[[VAL_22]]) fastmath<contract> : (!fir.box<!fir.array<10x?x!fir.char<1,?>>>) -> ()
+! CHECK:           hlfir.copy_out %[[VAL_2]]#0, %[[VAL_2]]#1 to %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_optional_char(x)
+    character(*), optional :: x(10, 20)
+    call takes_optional_char(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_optional_char(
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMbindc_seq_assocFtest_optional_charEx"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_7:.*]] = fir.is_present %[[VAL_6]]#0 : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> i1
+! CHECK:           %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_9:.*]] = fir.if %[[VAL_7]] -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) {
+! CHECK:             %[[VAL_10:.*]] = arith.constant 0 : index
+! CHECK:             %[[VAL_11:.*]] = fir.shift %[[VAL_10]], %[[VAL_10]] : (index, index) -> !fir.shift<2>
+! CHECK:             %[[VAL_12:.*]] = fir.rebox %[[VAL_6]]#0(%[[VAL_11]]) : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.shift<2>) -> !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:             fir.result %[[VAL_12]] : !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_13:.*]] = fir.absent !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:             fir.result %[[VAL_13]] : !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:           }
+! CHECK:           %[[VAL_14:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_15:.*]] = fir.if %[[VAL_7]] -> (!fir.box<!fir.array<?x!fir.char<1,?>>>) {
+! CHECK:             %[[VAL_16:.*]] = arith.constant 0 : index
+! CHECK:             %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_14]]#1 {uniq_name = "_QMbindc_seq_assocFtakes_optional_charEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<i32>
+! CHECK:             %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i64
+! CHECK:             %[[VAL_21:.*]] = arith.subi %[[VAL_19]], %[[VAL_20]] : i64
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i64
+! CHECK:             %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_22]] : i64
+! CHECK:             %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i64) -> index
+! CHECK:             %[[VAL_25:.*]] = arith.constant 0 : index
+! CHECK:             %[[VAL_26:.*]] = arith.cmpi sgt, %[[VAL_24]], %[[VAL_25]] : index
+! CHECK:             %[[VAL_27:.*]] = arith.select %[[VAL_26]], %[[VAL_24]], %[[VAL_25]] : index
+! CHECK:             %[[VAL_28:.*]] = fir.shape_shift %[[VAL_16]], %[[VAL_27]] : (index, index) -> !fir.shapeshift<1>
+! CHECK:             %[[VAL_29:.*]] = fir.box_elesize %[[VAL_9]] : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> index
+! CHECK:             %[[VAL_30:.*]] = fir.box_addr %[[VAL_9]] : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> !fir.ref<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
+! CHECK:             %[[VAL_32:.*]] = fir.embox %[[VAL_31]](%[[VAL_28]]) typeparams %[[VAL_29]] : (!fir.ref<!fir.array<?x!fir.char<1,?>>>, !fir.shapeshift<1>, index) -> !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:             fir.result %[[VAL_32]] : !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_33:.*]] = fir.absent !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:             fir.result %[[VAL_33]] : !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           }
+! CHECK:           fir.call @takes_optional_char(%[[VAL_15]], %[[VAL_14]]#1) fastmath<contract> : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_14]]#1, %[[VAL_14]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+end module
+
+module poly_seq_assoc
+  interface
+    subroutine takes_poly(x, n)
+      integer :: n
+      class(*) :: x(n)
+    end subroutine
+    subroutine takes_poly_assumed_size(x)
+      class(*) :: x(10, *)
+    end subroutine
+    subroutine takes_optional_poly(x, n)
+      integer :: n
+      class(*), optional :: x(n)
+    end subroutine
+  end interface
+contains
+  subroutine test_poly_1(x)
+    class(*) :: x(10, 20)
+    call takes_poly(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_poly_1(
+! CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.class<!fir.array<10x20xnone>> {fir.bindc_name = "x"}) {
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMpoly_seq_assocFtest_poly_1Ex"} : (!fir.class<!fir.array<10x20xnone>>) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>)
+! CHECK:           %[[VAL_2:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_3:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#1 {uniq_name = "_QMpoly_seq_assocFtakes_polyEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i32) -> i64
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_8:.*]] = arith.subi %[[VAL_6]], %[[VAL_7]] : i64
+! CHECK:           %[[VAL_9:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_10:.*]] = arith.addi %[[VAL_8]], %[[VAL_9]] : i64
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i64) -> index
+! CHECK:           %[[VAL_12:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_12]] : index
+! CHECK:           %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_11]], %[[VAL_12]] : index
+! CHECK:           %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_16:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.class<!fir.array<10x20xnone>>) -> !fir.ref<!fir.array<10x20xnone>>
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.array<10x20xnone>>) -> !fir.ref<!fir.array<?xnone>>
+! CHECK:           %[[VAL_18:.*]] = fir.embox %[[VAL_17]](%[[VAL_15]]) source_box %[[VAL_1]]#0 : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.class<!fir.array<10x20xnone>>) -> !fir.class<!fir.array<?xnone>>
+! CHECK:           fir.call @_QPtakes_poly(%[[VAL_18]], %[[VAL_3]]#1) fastmath<contract> : (!fir.class<!fir.array<?xnone>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_3]]#1, %[[VAL_3]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_poly_copy_in_copy_out(x)
+    class(*) :: x(:, :)
+    call takes_poly(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_poly_copy_in_copy_out(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMpoly_seq_assocFtest_poly_copy_in_copy_outEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_2:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, i1)
+! CHECK:           %[[VAL_4:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#1 {uniq_name = "_QMpoly_seq_assocFtakes_polyEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i32) -> i64
+! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_8]] : i64
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i64
+! CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i64) -> index
+! CHECK:           %[[VAL_13:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_13]] : index
+! CHECK:           %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_12]], %[[VAL_13]] : index
+! CHECK:           %[[VAL_16:.*]] = fir.shape %[[VAL_15]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_17:.*]] = fir.box_addr %[[VAL_3]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?x?xnone>>
+! CHECK:           %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (!fir.ref<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?xnone>>
+! CHECK:           %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) source_box %[[VAL_3]]#0 : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.class<!fir.array<?x?xnone>>) -> !fir.class<!fir.array<?xnone>>
+! CHECK:           fir.call @_QPtakes_poly(%[[VAL_19]], %[[VAL_4]]#1) fastmath<contract> : (!fir.class<!fir.array<?xnone>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.copy_out %[[VAL_3]]#0, %[[VAL_3]]#1 to %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>, i1, !fir.class<!fir.array<?x?xnone>>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_4]]#1, %[[VAL_4]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_poly_assumed_size(x)
+    class(*) :: x(:, :)
+    call takes_poly_assumed_size(x)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_poly_assumed_size(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMpoly_seq_assocFtest_poly_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, i1)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 10 : i64
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_5:.*]] = arith.subi %[[VAL_3]], %[[VAL_4]] : i64
+! CHECK:           %[[VAL_6:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_7:.*]] = arith.addi %[[VAL_5]], %[[VAL_6]] : i64
+! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i64) -> index
+! CHECK:           %[[VAL_9:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : index
+! CHECK:           %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : index
+! CHECK:           %[[VAL_12:.*]] = arith.constant -1 : i64
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i64) -> index
+! CHECK:           %[[VAL_14:.*]] = fir.shape %[[VAL_11]], %[[VAL_13]] : (index, index) -> !fir.shape<2>
+! CHECK:           %[[VAL_15:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?x?xnone>>
+! CHECK:           %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (!fir.ref<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<10x?xnone>>
+! CHECK:           %[[VAL_17:.*]] = fir.embox %[[VAL_16]](%[[VAL_14]]) source_box %[[VAL_2]]#0 : (!fir.ref<!fir.array<10x?xnone>>, !fir.shape<2>, !fir.class<!fir.array<?x?xnone>>) -> !fir.class<!fir.array<10x?xnone>>
+! CHECK:           fir.call @_QPtakes_poly_assumed_size(%[[VAL_17]]) fastmath<contract> : (!fir.class<!fir.array<10x?xnone>>) -> ()
+! CHECK:           hlfir.copy_out %[[VAL_2]]#0, %[[VAL_2]]#1 to %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>, i1, !fir.class<!fir.array<?x?xnone>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_optional_poly(x)
+    class(*), optional :: x(10, 20)
+    call takes_optional_poly(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_optional_poly(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMpoly_seq_assocFtest_optional_polyEx"} : (!fir.class<!fir.array<10x20xnone>>) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>)
+! CHECK:           %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.class<!fir.array<10x20xnone>>) -> i1
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_4:.*]] = fir.if %[[VAL_2]] -> (!fir.class<!fir.array<10x20xnone>>) {
+! CHECK:             fir.result %[[VAL_1]]#0 : !fir.class<!fir.array<10x20xnone>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_5:.*]] = fir.absent !fir.class<!fir.array<10x20xnone>>
+! CHECK:             fir.result %[[VAL_5]] : !fir.class<!fir.array<10x20xnone>>
+! CHECK:           }
+! CHECK:           %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_3]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_7:.*]] = fir.if %[[VAL_2]] -> (!fir.class<!fir.array<?xnone>>) {
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]]#1 {uniq_name = "_QMpoly_seq_assocFtakes_optional_polyEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:             %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i64
+! CHECK:             %[[VAL_12:.*]] = arith.subi %[[VAL_10]], %[[VAL_11]] : i64
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i64
+! CHECK:             %[[VAL_14:.*]] = arith.addi %[[VAL_12]], %[[VAL_13]] : i64
+! CHECK:             %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i64) -> index
+! CHECK:             %[[VAL_16:.*]] = arith.constant 0 : index
+! CHECK:             %[[VAL_17:.*]] = arith.cmpi sgt, %[[VAL_15]], %[[VAL_16]] : index
+! CHECK:             %[[VAL_18:.*]] = arith.select %[[VAL_17]], %[[VAL_15]], %[[VAL_16]] : index
+! CHECK:             %[[VAL_19:.*]] = fir.shape %[[VAL_18]] : (index) -> !fir.shape<1>
+! CHECK:             %[[VAL_20:.*]] = fir.box_addr %[[VAL_4]] : (!fir.class<!fir.array<10x20xnone>>) -> !fir.ref<!fir.array<10x20xnone>>
+! CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<!fir.array<10x20xnone>>) -> !fir.ref<!fir.array<?xnone>>
+! CHECK:             %[[VAL_22:.*]] = fir.embox %[[VAL_21]](%[[VAL_19]]) source_box %[[VAL_4]] : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.class<!fir.array<10x20xnone>>) -> !fir.class<!fir.array<?xnone>>
+! CHECK:             fir.result %[[VAL_22]] : !fir.class<!fir.array<?xnone>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_23:.*]] = fir.absent !fir.class<!fir.array<?xnone>>
+! CHECK:             fir.result %[[VAL_23]] : !fir.class<!fir.array<?xnone>>
+! CHECK:           }
+! CHECK:           fir.call @_QPtakes_optional_poly(%[[VAL_7]], %[[VAL_6]]#1) fastmath<contract> : (!fir.class<!fir.array<?xnone>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_6]]#1, %[[VAL_6]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+end module
diff --git a/flang/test/Lower/HLFIR/calls-assumed-shape.f90 b/flang/test/Lower/HLFIR/calls-assumed-shape.f90
index 92b87a6987a55..a2094f1f1f0eb 100644
--- a/flang/test/Lower/HLFIR/calls-assumed-shape.f90
+++ b/flang/test/Lower/HLFIR/calls-assumed-shape.f90
@@ -1,6 +1,6 @@
 ! Test lowering of calls involving assumed shape arrays or arrays with
 ! VALUE attribute.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine test_assumed_to_assumed(x)
   interface
diff --git a/flang/test/Lower/HLFIR/calls-constant-expr-arg-polymorphic.f90 b/flang/test/Lower/HLFIR/calls-constant-expr-arg-polymorphic.f90
index a862033c0fefa..4ee70e55079da 100644
--- a/flang/test/Lower/HLFIR/calls-constant-expr-arg-polymorphic.f90
+++ b/flang/test/Lower/HLFIR/calls-constant-expr-arg-polymorphic.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 ! Test when constant argument are copied in memory
 ! and passed to polymorphic arguments.
diff --git a/flang/test/Lower/HLFIR/calls-optional.f90 b/flang/test/Lower/HLFIR/calls-optional.f90
index cf684981f8003..df9519a24fb70 100644
--- a/flang/test/Lower/HLFIR/calls-optional.f90
+++ b/flang/test/Lower/HLFIR/calls-optional.f90
@@ -2,7 +2,7 @@
 ! that is syntactically present, but may be absent at runtime (is
 ! an optional or a pointer/allocatable).
 !
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine optional_copy_in_out(x)
   interface
diff --git a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90
index ffd21e01ef98d..b14f1bb1f443b 100644
--- a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90
+++ b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90
@@ -1,6 +1,6 @@
 ! Test passing rank 2 CLASS(*) deferred shape to assumed size assumed type
 ! This requires copy-in/copy-out logic.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine pass_poly_to_assumed_type_assumed_size(x)
   class(*), target :: x(:,:)
diff --git a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
index b943cd3225a56..b9d55d3fde4f0 100644
--- a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
+++ b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
@@ -1,5 +1,5 @@
 ! Test conversion of MutableBoxValue to value.
-! RUN: bbc -emit-hlfir -polymorphic-type -I nowhere %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere %s -o - | FileCheck %s
 
 subroutine test_int_allocatable(a)
   integer, allocatable :: a
diff --git a/flang/test/Lower/HLFIR/designators-component-ref.f90 b/flang/test/Lower/HLFIR/designators-component-ref.f90
index 3e9fa037040d5..392eda66fd03c 100644
--- a/flang/test/Lower/HLFIR/designators-component-ref.f90
+++ b/flang/test/Lower/HLFIR/designators-component-ref.f90
@@ -1,5 +1,5 @@
 ! Test lowering of component reference to HLFIR
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 module comp_ref
 type t1
   integer :: scalar_i
diff --git a/flang/test/Lower/HLFIR/designators-parameter-array-slice.f90 b/flang/test/Lower/HLFIR/designators-parameter-array-slice.f90
index 3bf1992345335..42e130ee3f49c 100644
--- a/flang/test/Lower/HLFIR/designators-parameter-array-slice.f90
+++ b/flang/test/Lower/HLFIR/designators-parameter-array-slice.f90
@@ -1,5 +1,5 @@
 ! Test non-contiguous slice of parameter array.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 subroutine test2(i)
   integer, parameter :: a(*,*) = reshape( [ 1,2,3,4 ], [ 2,2 ])
   integer :: x(2)
diff --git a/flang/test/Lower/HLFIR/elemental-array-ops.f90 b/flang/test/Lower/HLFIR/elemental-array-ops.f90
index 6984e5ef74c69..9778adeb6179a 100644
--- a/flang/test/Lower/HLFIR/elemental-array-ops.f90
+++ b/flang/test/Lower/HLFIR/elemental-array-ops.f90
@@ -1,5 +1,5 @@
 ! Test lowering of elemental intrinsic operations with array arguments to HLFIR
-! RUN: bbc -emit-hlfir --polymorphic-type -I nowhere -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere -o - %s 2>&1 | FileCheck %s
 
 subroutine binary(x, y)
   integer :: x(100), y(100)
diff --git a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90 b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
index c2f97b6fccfeb..eb93099b3890f 100644
--- a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
+++ b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
@@ -1,5 +1,5 @@
 ! Test that the produced hlfir.elemental had proper result type and the mold.
-! RUN: bbc --emit-hlfir --polymorphic-type -I nowhere -o - %s | FileCheck %s
+! RUN: bbc --emit-hlfir -I nowhere -o - %s | FileCheck %s
 
 subroutine test_polymorphic_merge(x, y, r, m)
   type t
diff --git a/flang/test/Lower/HLFIR/elemental-user-procedure-ref-polymorphic.f90 b/flang/test/Lower/HLFIR/elemental-user-procedure-ref-polymorphic.f90
index 1cd5c5133bd96..84bb7a55377b8 100644
--- a/flang/test/Lower/HLFIR/elemental-user-procedure-ref-polymorphic.f90
+++ b/flang/test/Lower/HLFIR/elemental-user-procedure-ref-polymorphic.f90
@@ -1,6 +1,6 @@
 ! Test lowering of user defined elemental procedure reference to HLFIR
 ! With polymorphic arguments.
-! RUN: bbc -emit-hlfir -I nw -polymorphic-type -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -I nw -o - %s 2>&1 | FileCheck %s
 module def_some_types
   type :: t
     integer :: i
diff --git a/flang/test/Lower/HLFIR/function-return-as-expr.f90 b/flang/test/Lower/HLFIR/function-return-as-expr.f90
index ae4f679a0fe60..95a0c090ef043 100644
--- a/flang/test/Lower/HLFIR/function-return-as-expr.f90
+++ b/flang/test/Lower/HLFIR/function-return-as-expr.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nowhere 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s -I nowhere 2>&1 | FileCheck %s
 
 module types
   type t1
diff --git a/flang/test/Lower/HLFIR/function-return-destroy.f90 b/flang/test/Lower/HLFIR/function-return-destroy.f90
index 4663dc57e0457..5bd014981c128 100644
--- a/flang/test/Lower/HLFIR/function-return-destroy.f90
+++ b/flang/test/Lower/HLFIR/function-return-destroy.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir -polymorphic-type %s -o - -I nowhere | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - -I nowhere | FileCheck %s
 
 module types
   type t1
diff --git a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90
index 952e8f565eb93..43986c8198b94 100644
--- a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90
+++ b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90
@@ -1,6 +1,6 @@
 ! Test passing mismatching rank arguments to unlimited polymorphic
 ! dummy with IGNORE_TKR(R).
-! RUN: bbc -emit-hlfir -polymorphic-type -o - -I nowhere %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nowhere %s 2>&1 | FileCheck %s
 
 module m
   interface
diff --git a/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90 b/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90
index 3ad74ced61a3b..27747ff2ad943 100644
--- a/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90
+++ b/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90
@@ -2,7 +2,7 @@
 ! dummy has IGNORE_TKR(t). The descriptor should be prepared
 ! according to the actual argument type, but its bounds and
 ! attributes should still be set as expected for the dummy.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 module tkr_ifaces
   interface
diff --git a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90
index 932fafd322a3e..797e4c89ae239 100644
--- a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90
+++ b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90
@@ -1,6 +1,6 @@
 ! Test that allocatable components of non pointer/non allocatable INTENT(OUT)
 ! dummy arguments are deallocated.
-! RUN: bbc -emit-hlfir -polymorphic-type %s -o - -I nowhere | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - -I nowhere | FileCheck %s
 
 subroutine test_intentout_component_deallocate(a)
   type :: t
diff --git a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
index 8645488290d71..5763d84cfd605 100644
--- a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
+++ b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
@@ -1,6 +1,6 @@
 ! Test lowering of internal procedure capturing OPTIONAL polymorphic
 ! objects.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nw | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s -I nw | FileCheck %s
 
 
 module captured_optional_polymorphic
diff --git a/flang/test/Lower/HLFIR/intrinsic-assumed-type.f90 b/flang/test/Lower/HLFIR/intrinsic-assumed-type.f90
index d0381344c8931..e82bc342ff88f 100644
--- a/flang/test/Lower/HLFIR/intrinsic-assumed-type.f90
+++ b/flang/test/Lower/HLFIR/intrinsic-assumed-type.f90
@@ -2,7 +2,7 @@
 ! arguments. These are a bit special because semantics do not represent
 ! assumed types actual arguments with an evaluate::Expr like for usual
 ! arguments.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine assumed_type_to_intrinsic(a)
   type(*) :: a(:)
diff --git a/flang/test/Lower/HLFIR/parent-component-ref.f90 b/flang/test/Lower/HLFIR/parent-component-ref.f90
index b08d8f450e6d7..7d04ef3b40a65 100644
--- a/flang/test/Lower/HLFIR/parent-component-ref.f90
+++ b/flang/test/Lower/HLFIR/parent-component-ref.f90
@@ -1,5 +1,5 @@
 ! Test lowering of parent component references to HLFIR.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s -I nw | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s -I nw | FileCheck %s
 
 module pc_types
   type t
diff --git a/flang/test/Lower/HLFIR/poly_expr_for_nonpoly_dummy.f90 b/flang/test/Lower/HLFIR/poly_expr_for_nonpoly_dummy.f90
index ec5be37f55152..f5ec7c35594bd 100644
--- a/flang/test/Lower/HLFIR/poly_expr_for_nonpoly_dummy.f90
+++ b/flang/test/Lower/HLFIR/poly_expr_for_nonpoly_dummy.f90
@@ -1,6 +1,6 @@
 ! Test passing polymorphic expression for non-polymorphic contiguous
 ! dummy argument:
-! RUN: bbc -emit-hlfir --polymorphic-type -o - -I nowhere %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nowhere %s | FileCheck %s
 
 module types
   type t
diff --git a/flang/test/Lower/HLFIR/polymorphic-expressions.f90 b/flang/test/Lower/HLFIR/polymorphic-expressions.f90
index 37e602895db39..d6934e6ae6bd4 100644
--- a/flang/test/Lower/HLFIR/polymorphic-expressions.f90
+++ b/flang/test/Lower/HLFIR/polymorphic-expressions.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nowhere | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s -I nowhere | FileCheck %s
 
 module polymorphic_expressions_types
   type t
diff --git a/flang/test/Lower/HLFIR/proc-pointer-comp-nopass.f90 b/flang/test/Lower/HLFIR/proc-pointer-comp-nopass.f90
index ebb310f581c10..6f41f678dd84a 100644
--- a/flang/test/Lower/HLFIR/proc-pointer-comp-nopass.f90
+++ b/flang/test/Lower/HLFIR/proc-pointer-comp-nopass.f90
@@ -1,5 +1,5 @@
 ! Test lowering of NOPASS procedure pointers components.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 module proc_comp_defs
   interface
diff --git a/flang/test/Lower/HLFIR/proc-pointer-comp-pass.f90 b/flang/test/Lower/HLFIR/proc-pointer-comp-pass.f90
index 25e4393f9dac7..247008e3a93df 100644
--- a/flang/test/Lower/HLFIR/proc-pointer-comp-pass.f90
+++ b/flang/test/Lower/HLFIR/proc-pointer-comp-pass.f90
@@ -1,5 +1,5 @@
 ! Test lowering of PASS procedure pointers components.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 module m
   type t
diff --git a/flang/test/Lower/HLFIR/select-type-selector.f90 b/flang/test/Lower/HLFIR/select-type-selector.f90
index a9536c9ccee69..256f003dcbf69 100644
--- a/flang/test/Lower/HLFIR/select-type-selector.f90
+++ b/flang/test/Lower/HLFIR/select-type-selector.f90
@@ -14,7 +14,7 @@
 ! (16.9.109) applied to the corresponding dimension of selector. The upper bound of each dimension is one less
 ! than the sum of the lower bound and the extent.
 
-! RUN: bbc -emit-hlfir -polymorphic-type -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
 
 subroutine test()
   type t
diff --git a/flang/test/Lower/HLFIR/transpose.f90 b/flang/test/Lower/HLFIR/transpose.f90
index e63e1ec1d6890..e37e83c7a5011 100644
--- a/flang/test/Lower/HLFIR/transpose.f90
+++ b/flang/test/Lower/HLFIR/transpose.f90
@@ -1,5 +1,5 @@
 ! Test lowering of TRANSPOSE intrinsic to HLFIR
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s 2>&1 | FileCheck %s
 
 subroutine transpose1(m, res)
   integer :: m(1,2), res(2, 1)
diff --git a/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90 b/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90
index 6794d11ece42d..2e0c72ccfe048 100644
--- a/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90
+++ b/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90
@@ -1,6 +1,6 @@
 ! Test interface that lowering handles small interface mismatch with
 ! type bound procedures.
-! RUN: bbc -emit-hlfir --polymorphic-type %s -o - -I nw | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - -I nw | FileCheck %s
 
 module dispatch_mismatch
 type t
diff --git a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90 b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
index d4026a37720f7..ee8ded197c959 100644
--- a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
+++ b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
@@ -1,6 +1,6 @@
 ! Test lowering of vector subscript designators outside of the
 ! assignment left-and side and input IO context.
-! RUN: bbc -emit-hlfir -o - -I nw %s --polymorphic-type 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nw %s 2>&1 | FileCheck %s
 
 subroutine foo(x, y)
   integer :: x(100)
diff --git a/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90 b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
new file mode 100644
index 0000000000000..38468739ead52
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
@@ -0,0 +1,56 @@
+! Test C_PTR_EQ and C_PTR_NE lowering.
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+function test_c_ptr_eq(ptr1, ptr2)
+  use, intrinsic :: iso_c_binding
+  type(c_ptr), intent(in) :: ptr1, ptr2
+  logical :: test_c_ptr_eq
+  test_c_ptr_eq = (ptr1 .eq. ptr2)
+end
+
+! CHECK-LABEL: func.func @_QPtest_c_ptr_eq(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_eq", uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"}
+! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS0:.*]] = fir.coordinate_of %[[DECL_ARG0]]#1, %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[ADDRESS0:.*]] = fir.load %[[COORD_ADDRESS0]] : !fir.ref<i64>
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS1:.*]] = fir.coordinate_of %[[DECL_ARG1]]#1, %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[ADDRESS1:.*]] = fir.load %[[COORD_ADDRESS1]] : !fir.ref<i64>
+! CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[ADDRESS0]], %[[ADDRESS1]] : i64
+! CHECK: %[[RES:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
+! CHECK: %[[NO_REASSOC_RES:.*]] = hlfir.no_reassoc %[[RES]] : !fir.logical<4>
+! CHECK: hlfir.assign %[[NO_REASSOC_RES]] to %[[DECL_RET]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK: %[[LOAD_RET:.*]] = fir.load %[[DECL_RET]]#1 : !fir.ref<!fir.logical<4>>
+! CHECK: return %[[LOAD_RET]] : !fir.logical<4>
+! CHECK: }
+
+function test_c_ptr_ne(ptr1, ptr2)
+  use, intrinsic :: iso_c_binding
+  type(c_ptr), intent(in) :: ptr1, ptr2
+  logical :: test_c_ptr_ne
+  test_c_ptr_ne = (ptr1 .ne. ptr2)
+end
+
+! CHECK-LABEL: func.func @_QPtest_c_ptr_ne(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_ne", uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"}
+! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS0:.*]] = fir.coordinate_of %[[DECL_ARG0]]#1, %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[ADDRESS0:.*]] = fir.load %[[COORD_ADDRESS0]] : !fir.ref<i64>
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS1:.*]] = fir.coordinate_of %[[DECL_ARG1]]#1, %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[ADDRESS1:.*]] = fir.load %[[COORD_ADDRESS1]] : !fir.ref<i64>
+! CHECK: %[[CMP:.*]] = arith.cmpi ne, %[[ADDRESS0]], %[[ADDRESS1]] : i64
+! CHECK: %[[RES:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
+! CHECK: %[[NO_REASSOC_RES:.*]] = hlfir.no_reassoc %[[RES]] : !fir.logical<4>
+! CHECK: hlfir.assign %[[NO_REASSOC_RES]] to %[[DECL_RET]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK: %[[LOAD_RET:.*]] = fir.load %[[DECL_RET]]#1 : !fir.ref<!fir.logical<4>>
+! CHECK: return %[[LOAD_RET]] : !fir.logical<4>
+! CHECK: }
diff --git a/flang/test/Lower/Intrinsics/extends_type_of.f90 b/flang/test/Lower/Intrinsics/extends_type_of.f90
index b36d4e8b7c0d4..f99a63e30a552 100644
--- a/flang/test/Lower/Intrinsics/extends_type_of.f90
+++ b/flang/test/Lower/Intrinsics/extends_type_of.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module extends_type_of_mod
 
diff --git a/flang/test/Lower/Intrinsics/modulo.f90 b/flang/test/Lower/Intrinsics/modulo.f90
index 001e307aa077a..383cb34f83c70 100644
--- a/flang/test/Lower/Intrinsics/modulo.f90
+++ b/flang/test/Lower/Intrinsics/modulo.f90
@@ -40,17 +40,6 @@ subroutine modulo_testi(r, a, p)
 ! CHECK-SAME: %[[arg0:.*]]: !fir.ref<f128>{{.*}}, %[[arg1:.*]]: !fir.ref<f128>{{.*}}, %[[arg2:.*]]: !fir.ref<f128>{{.*}}) {
 subroutine modulo_testr16(r, a, p)
   real(16) :: r, a, p
-  ! CHECK-DAG: %[[a:.*]] = fir.load %[[arg1]] : !fir.ref<f128>
-  ! CHECK-DAG: %[[p:.*]] = fir.load %[[arg2]] : !fir.ref<f128>
-  ! CHECK-DAG: %[[rem:.*]] = arith.remf %[[a]], %[[p]] {{.*}}: f128
-  ! CHECK-DAG: %[[zero:.*]] = arith.constant 0.000000e+00 : f128
-  ! CHECK-DAG: %[[remNotZero:.*]] = arith.cmpf une, %[[rem]], %[[zero]] {{.*}} : f128
-  ! CHECK-DAG: %[[aNeg:.*]] = arith.cmpf olt, %[[a]], %[[zero]] {{.*}} : f128
-  ! CHECK-DAG: %[[pNeg:.*]] = arith.cmpf olt, %[[p]], %[[zero]] {{.*}} : f128
-  ! CHECK-DAG: %[[signDifferent:.*]] = arith.xori %[[aNeg]], %[[pNeg]] : i1
-  ! CHECK-DAG: %[[mustAddP:.*]] = arith.andi %[[remNotZero]], %[[signDifferent]] : i1
-  ! CHECK-DAG: %[[remPlusP:.*]] = arith.addf %[[rem]], %[[p]] {{.*}}: f128
-  ! CHECK: %[[res:.*]] = arith.select %[[mustAddP]], %[[remPlusP]], %[[rem]] : f128
-  ! CHECK: fir.store %[[res]] to %[[arg0]] : !fir.ref<f128>
+  ! CHECK: fir.call @_FortranAModuloReal16({{.*}}){{.*}}: (f128, f128, !fir.ref<i8>, i32) -> f128
   r = modulo(a, p)
 end subroutine
diff --git a/flang/test/Lower/Intrinsics/same_type_as.f90 b/flang/test/Lower/Intrinsics/same_type_as.f90
index e1333024927d5..e0fda563f08bb 100644
--- a/flang/test/Lower/Intrinsics/same_type_as.f90
+++ b/flang/test/Lower/Intrinsics/same_type_as.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module same_type_as_mod
 
diff --git a/flang/test/Lower/Intrinsics/sizeof.f90 b/flang/test/Lower/Intrinsics/sizeof.f90
index 959ca1692b514..e10cb79981a69 100644
--- a/flang/test/Lower/Intrinsics/sizeof.f90
+++ b/flang/test/Lower/Intrinsics/sizeof.f90
@@ -1,5 +1,5 @@
 ! Test SIZEOF lowering for polymorphic entities.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 integer(8) function test1(x)
   class(*) :: x
diff --git a/flang/test/Lower/Intrinsics/spread.f90 b/flang/test/Lower/Intrinsics/spread.f90
index 4cd823d837453..d58725aba6987 100644
--- a/flang/test/Lower/Intrinsics/spread.f90
+++ b/flang/test/Lower/Intrinsics/spread.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module spread_mod
 
diff --git a/flang/test/Lower/Intrinsics/storage_size.f90 b/flang/test/Lower/Intrinsics/storage_size.f90
index 668322f45bd85..b0c9d51f95328 100644
--- a/flang/test/Lower/Intrinsics/storage_size.f90
+++ b/flang/test/Lower/Intrinsics/storage_size.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module storage_size_test
   type :: p1
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90
new file mode 100644
index 0000000000000..aa835f82b2730
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90
@@ -0,0 +1,36 @@
+! Test delayed privatization for allocatables: `firstprivate`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+subroutine delayed_privatization_allocatable
+  implicit none
+  integer, allocatable :: var1
+
+!$omp parallel firstprivate(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
+
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CHECK: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+
+! CHECK-NEXT:  %[[PRIV_BASE_VAL:.*]] = fir.load %[[PRIV_PRIV_ARG]]
+! CHECK-NEXT:  %[[PRIV_BASE_BOX:.*]] = fir.box_addr %[[PRIV_BASE_VAL]]
+! CHECK-NEXT:  %[[PRIV_BASE_ADDR:.*]] = fir.convert %[[PRIV_BASE_BOX]]
+! CHECK-NEXT:  %[[C0:.*]] = arith.constant 0 : i64
+! CHECK-NEXT:  %[[COPY_COND:.*]] = arith.cmpi ne, %[[PRIV_BASE_ADDR]], %[[C0]] : i64
+
+! CHECK-NEXT:  fir.if %[[COPY_COND]] {
+! CHECK-NEXT:    %[[ORIG_BASE_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]]
+! CHECK-NEXT:    %[[ORIG_BASE_ADDR:.*]] = fir.box_addr %[[ORIG_BASE_VAL]]
+! CHECK-NEXT:    %[[ORIG_BASE_LD:.*]] = fir.load %[[ORIG_BASE_ADDR]]
+! CHECK-NEXT:    hlfir.assign %[[ORIG_BASE_LD]] to %[[PRIV_BASE_BOX]] temporary_lhs
+! CHECK-NEXT:  }
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
new file mode 100644
index 0000000000000..cc1818b00b809
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
@@ -0,0 +1,43 @@
+! Test delayed privatization for allocatables: `private`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+subroutine delayed_privatization_allocatable
+  implicit none
+  integer, allocatable :: var1
+
+!$omp parallel private(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
+
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_allocatableEvar1"}
+
+! CHECK-NEXT:   %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK-NEXT:   %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CHECK-NEXT:   %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
+! CHECK-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
+! CHECK-NEXT:   %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+
+! CHECK-NEXT:   fir.if %[[ALLOC_COND]] {
+! CHECK-NEXT:     %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFdelayed_privatization_allocatableEvar1.alloc"}
+! CHECK-NEXT:     %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+! CHECK-NEXT:     fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK-NEXT:   } else {
+! CHECK-NEXT:     %[[ZERO_BITS:.*]] = fir.zero_bits !fir.heap<i32>
+! CHECK-NEXT:     %[[ZERO_BOX:.*]] = fir.embox %[[ZERO_BITS]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+! CHECK-NEXT:     fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK-NEXT:   }
+
+! CHECK-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]]
+! CHECK-NEXT:   omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+
+! CHECK-NEXT: }
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-pointer.f90 b/flang/test/Lower/OpenMP/delayed-privatization-pointer.f90
new file mode 100644
index 0000000000000..796e4720c8c95
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-pointer.f90
@@ -0,0 +1,31 @@
+! Test delayed privatization for pointers: `private`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+subroutine delayed_privatization_pointer
+  implicit none
+  integer, pointer :: var1
+
+!$omp parallel firstprivate(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.ptr<i32>>>]] alloc {
+
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_pointerEvar1"}
+! CHECK-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]]
+! CHECK-NEXT:   omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+
+! CHECK-NEXT: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! CHECK-NEXT:    %[[ORIG_BASE_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]]
+ ! CHECK-NEXT:   fir.store %[[ORIG_BASE_VAL]] to %[[PRIV_PRIV_ARG]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
+! CHECK-NEXT:   omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]])
+! CHECK-NEXT: }
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90
new file mode 100644
index 0000000000000..86db2ffbcfeba
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90
@@ -0,0 +1,36 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+module m1
+  intrinsic max
+end module m1
+program main
+  use m1, ren=>max
+  n=0
+  !$omp parallel reduction(ren:n)
+  print *, "par"
+  !$omp end parallel
+end program main
+
+! test that we understood that this should be a max reduction
+
+! CHECK-LABEL:   omp.reduction.declare @max_i_32 : i32 init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32):
+! CHECK:           %[[VAL_1:.*]] = arith.constant -2147483648 : i32
+! CHECK:           omp.yield(%[[VAL_1]] : i32)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32):
+! CHECK:           %[[VAL_2:.*]] = arith.maxsi %[[VAL_0]], %[[VAL_1]] : i32
+! CHECK:           omp.yield(%[[VAL_2]] : i32)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFEn"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_1]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel reduction(@max_i_32 %[[VAL_1]]#0 -> %[[VAL_3:.*]] : !fir.ref<i32>) {
+! ...
+! CHECK:             omp.terminator
+
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
index 9289973bae202..86fb067fc48eb 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
@@ -17,8 +17,16 @@ program reduce
 
 end program
 
-! TODO: the reduction is not curently lowered correctly. This test is checking
-! that we do not crash and we still produce the same broken IR as before.
+! CHECK-LABEL:   omp.reduction.declare @min_i_32 : i32 init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 2147483647 : i32
+! CHECK:           omp.yield(%[[VAL_1]] : i32)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32):
+! CHECK:           %[[VAL_2:.*]] = arith.minsi %[[VAL_0]], %[[VAL_1]] : i32
+! CHECK:           omp.yield(%[[VAL_2]] : i32)
+! CHECK:         }
 
 ! CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
 ! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<i32>
@@ -31,10 +39,11 @@ program reduce
 ! CHECK:             %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:             %[[VAL_7:.*]] = arith.constant 10 : i32
 ! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop  for  (%[[VAL_9:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:               fir.store %[[VAL_9]] to %[[VAL_5]]#1 : !fir.ref<i32>
-! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
-! CHECK:               hlfir.assign %[[VAL_10]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:             omp.wsloop reduction(@min_i_32 %[[VAL_3]]#0 -> %[[VAL_9:.*]] : !fir.ref<i32>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_12:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK:               hlfir.assign %[[VAL_12]] to %[[VAL_11]]#0 : i32, !fir.ref<i32>
 ! CHECK:               omp.yield
 ! CHECK:             }
 ! CHECK:             omp.terminator
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index 6ba0b13b24b6c..10d7d957a2573 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -1,5 +1,5 @@
-! RUN: bbc --use-desc-for-alloc=false -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
-! RUN: bbc --use-desc-for-alloc=false -polymorphic-type -emit-hlfir %s -o - | tco | FileCheck %s --check-prefix=LLVM
+! RUN: bbc --use-desc-for-alloc=false -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc --use-desc-for-alloc=false -emit-hlfir %s -o - | tco | FileCheck %s --check-prefix=LLVM
 
 module poly
   type p1
diff --git a/flang/test/Lower/allocatable-return.f90 b/flang/test/Lower/allocatable-return.f90
index 92c5f8ad78d7c..363d16237b9bf 100644
--- a/flang/test/Lower/allocatable-return.f90
+++ b/flang/test/Lower/allocatable-return.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false --polymorphic-type -I nowhere %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -I nowhere %s -o - | FileCheck %s
 
 ! Test allocatable return.
 ! Allocatable arrays must have default runtime lbounds after the return.
diff --git a/flang/test/Lower/assumed-type.f90 b/flang/test/Lower/assumed-type.f90
index 99472a255adb7..44ce41d757379 100644
--- a/flang/test/Lower/assumed-type.f90
+++ b/flang/test/Lower/assumed-type.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module assumed_type_test
 
diff --git a/flang/test/Lower/default-initialization.f90 b/flang/test/Lower/default-initialization.f90
index e692b9b08446e..7a6133452b3a2 100644
--- a/flang/test/Lower/default-initialization.f90
+++ b/flang/test/Lower/default-initialization.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of local and dummy variables (dynamic initialization)
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module test_dinit
   type t
diff --git a/flang/test/Lower/derived-type-finalization.f90 b/flang/test/Lower/derived-type-finalization.f90
index 5c4531884213d..e7ade0d8145bb 100644
--- a/flang/test/Lower/derived-type-finalization.f90
+++ b/flang/test/Lower/derived-type-finalization.f90
@@ -1,5 +1,5 @@
 ! Test derived type finalization
-! RUN: bbc --use-desc-for-alloc=false -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 ! Missing tests:
 ! - finalization within BLOCK construct
diff --git a/flang/test/Lower/dispatch-table.f90 b/flang/test/Lower/dispatch-table.f90
index 0df4981b3eaf8..023dbcbedeaf4 100644
--- a/flang/test/Lower/dispatch-table.f90
+++ b/flang/test/Lower/dispatch-table.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir %s -o - | FileCheck %s
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
 
 ! Tests the generation of fir.type_info operations.
 
diff --git a/flang/test/Lower/dispatch.f90 b/flang/test/Lower/dispatch.f90
index 1aad4a4b8e46f..60364076e633b 100644
--- a/flang/test/Lower/dispatch.f90
+++ b/flang/test/Lower/dispatch.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
 
 ! Tests the different possible type involving polymorphic entities.
 
diff --git a/flang/test/Lower/intentout-deallocate.f90 b/flang/test/Lower/intentout-deallocate.f90
index 30502fd5d722a..8e7ccbcc9fdb9 100644
--- a/flang/test/Lower/intentout-deallocate.f90
+++ b/flang/test/Lower/intentout-deallocate.f90
@@ -1,6 +1,6 @@
 ! Test correct deallocation of intent(out) allocatables.
-! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s --check-prefixes=CHECK,FIR
-! RUN: bbc -emit-hlfir -polymorphic-type %s -o - -I nw | FileCheck %s --check-prefixes=CHECK,HLFIR
+! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false %s -o - | FileCheck %s --check-prefixes=CHECK,FIR
+! RUN: bbc -emit-hlfir %s -o - -I nw | FileCheck %s --check-prefixes=CHECK,HLFIR
 
 module mod1
   type, bind(c) :: t1
diff --git a/flang/test/Lower/io-derived-type-2.f90 b/flang/test/Lower/io-derived-type-2.f90
index c2f1ff1850725..eeea2ae156f94 100644
--- a/flang/test/Lower/io-derived-type-2.f90
+++ b/flang/test/Lower/io-derived-type-2.f90
@@ -1,6 +1,6 @@
 ! Check that InputDerivedType/OutputDeriverType APIs are used
 ! for io of derived types.
-! RUN: bbc -polymorphic-type -emit-fir -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -o - %s | FileCheck %s
 
 module p
   type :: person
diff --git a/flang/test/Lower/io-derived-type.f90 b/flang/test/Lower/io-derived-type.f90
index 84f57f46289bd..08b1207f55ad1 100644
--- a/flang/test/Lower/io-derived-type.f90
+++ b/flang/test/Lower/io-derived-type.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 module m
   type t
diff --git a/flang/test/Lower/nullify-polymorphic.f90 b/flang/test/Lower/nullify-polymorphic.f90
index ee271b815bdd5..5cb966810f1b9 100644
--- a/flang/test/Lower/nullify-polymorphic.f90
+++ b/flang/test/Lower/nullify-polymorphic.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
 
 module poly
   type p1
diff --git a/flang/test/Lower/pass-null-for-class-arg.f90 b/flang/test/Lower/pass-null-for-class-arg.f90
index d3d657f29a36c..f60cad30b1bf6 100644
--- a/flang/test/Lower/pass-null-for-class-arg.f90
+++ b/flang/test/Lower/pass-null-for-class-arg.f90
@@ -1,5 +1,5 @@
-! RUN: bbc -emit-fir -polymorphic-type %s -o - | FileCheck %s --check-prefix=FIR
-! RUN: bbc -emit-fir -polymorphic-type -hlfir %s -o - | FileCheck %s --check-prefix=HLFIR
+! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefix=FIR
+! RUN: bbc -emit-fir -hlfir %s -o - | FileCheck %s --check-prefix=HLFIR
 
 subroutine test
   interface
diff --git a/flang/test/Lower/pointer-association-polymorphic.f90 b/flang/test/Lower/pointer-association-polymorphic.f90
index bde96429c0ee9..6c56db892d1b8 100644
--- a/flang/test/Lower/pointer-association-polymorphic.f90
+++ b/flang/test/Lower/pointer-association-polymorphic.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module poly
   type p1
diff --git a/flang/test/Lower/pointer-disassociate.f90 b/flang/test/Lower/pointer-disassociate.f90
index 8d71905831655..e341bca5cd89b 100644
--- a/flang/test/Lower/pointer-disassociate.f90
+++ b/flang/test/Lower/pointer-disassociate.f90
@@ -1,5 +1,5 @@
 ! Test lowering of pointer disassociation
-! RUN: bbc -emit-fir -hlfir=false --polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 
 ! -----------------------------------------------------------------------------
diff --git a/flang/test/Lower/polymorphic-temp.f90 b/flang/test/Lower/polymorphic-temp.f90
index 46cf14dcf93f4..8633620e8430e 100644
--- a/flang/test/Lower/polymorphic-temp.f90
+++ b/flang/test/Lower/polymorphic-temp.f90
@@ -1,5 +1,5 @@
 ! Test creation of temporary from polymorphic enities
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module poly_tmp
   type p1
diff --git a/flang/test/Lower/polymorphic-types.f90 b/flang/test/Lower/polymorphic-types.f90
index 0288fea6eb38b..a06e0a29b6ae8 100644
--- a/flang/test/Lower/polymorphic-types.f90
+++ b/flang/test/Lower/polymorphic-types.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 ! Tests the different possible type involving polymorphic entities. 
 
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index 15d8a86e4ef47..e031b4805dc5b 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -1,4 +1,4 @@
-! RUN: bbc --use-desc-for-alloc=false -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 ! Tests various aspect of the lowering of polymorphic entities.
 
diff --git a/flang/test/Lower/select-type-2.f90 b/flang/test/Lower/select-type-2.f90
index 3769132b01c1f..7b4cf96069e53 100644
--- a/flang/test/Lower/select-type-2.f90
+++ b/flang/test/Lower/select-type-2.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | fir-opt --fir-polymorphic-op | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | fir-opt --fir-polymorphic-op | FileCheck %s
 module select_type_2
   type p1
     integer :: a
diff --git a/flang/test/Lower/select-type.f90 b/flang/test/Lower/select-type.f90
index ef91fb913a1c8..3243a813e9d59 100644
--- a/flang/test/Lower/select-type.f90
+++ b/flang/test/Lower/select-type.f90
@@ -1,5 +1,5 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | fir-opt --fir-polymorphic-op | FileCheck --check-prefix=CFG %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | fir-opt --fir-polymorphic-op | FileCheck --check-prefix=CFG %s
 module select_type_lower_test
   type p1
     integer :: a
diff --git a/flang/test/Parser/cuf-sanity-tree.CUF b/flang/test/Parser/cuf-sanity-tree.CUF
index f6cf9bbdd6b0c..dc12759d3ce52 100644
--- a/flang/test/Parser/cuf-sanity-tree.CUF
+++ b/flang/test/Parser/cuf-sanity-tree.CUF
@@ -144,11 +144,11 @@ include "cuf-sanity-common"
 !CHECK: | | | | | | EndDoStmt -> 
 !CHECK: | | | | ExecutionPartConstruct -> ExecutableConstruct -> CUFKernelDoConstruct
 !CHECK: | | | | | Directive
-!CHECK: | | | | | | Scalar -> Integer -> Expr = '1_4'
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Integer -> Expr = '1_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
-!CHECK: | | | | | | Scalar -> Integer -> Expr = '2_4'
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Integer -> Expr = '2_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2'
-!CHECK: | | | | | | Scalar -> Integer -> Expr = '3_4'
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Integer -> Expr = '3_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '3'
 !CHECK: | | | | | | Scalar -> Integer -> Expr = '1_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
diff --git a/flang/test/Semantics/OpenMP/reduction11.f90 b/flang/test/Semantics/OpenMP/reduction11.f90
new file mode 100644
index 0000000000000..3893fe70b407f
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/reduction11.f90
@@ -0,0 +1,22 @@
+! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols -o - %s 2>&1 | FileCheck %s
+! Check intrinsic reduction symbols (in this case "max" are marked as INTRINSIC
+
+! CHECK: MainProgram scope: omp_reduction
+program omp_reduction
+  ! CHECK: i size=4 offset=0: ObjectEntity type: INTEGER(4)
+  integer i
+  ! CHECK: k size=4 offset=4: ObjectEntity type: INTEGER(4) init:10_4
+  integer :: k = 10
+  ! CHECK: m size=4 offset=8: ObjectEntity type: INTEGER(4) init:12_4
+  integer :: m = 12
+
+  ! CHECK: OtherConstruct scope
+  ! CHECK: i (OmpPrivate, OmpPreDetermined): HostAssoc
+  ! CHECK: k (OmpReduction): HostAssoc
+  ! CHECK: max, INTRINSIC: ProcEntity
+  !$omp parallel do  reduction(max:k)
+  do i=1,10
+    k = i
+  end do
+  !$omp end parallel do
+end program omp_reduction
diff --git a/flang/test/Semantics/call41.f90 b/flang/test/Semantics/call41.f90
new file mode 100644
index 0000000000000..a4c7514d99ba5
--- /dev/null
+++ b/flang/test/Semantics/call41.f90
@@ -0,0 +1,12 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1 -Werror
+module m
+ contains
+  subroutine unlimited(x)
+    class(*), intent(in) :: x
+  end
+  subroutine test
+    !PORTABILITY: passing Hollerith to unlimited polymorphic as if it were CHARACTER
+    call unlimited(6HHERMAN)
+    call unlimited('abc') ! ok
+  end
+end
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index dd70c3b1ff5ef..4bc93132044fd 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -10,6 +10,15 @@ module m
 end
 
 program main
+  !$cuf kernel do <<< *, * >>> ! ok
+  do j = 1, 0
+  end do
+  !$cuf kernel do <<< (*), (*) >>> ! ok
+  do j = 1, 0
+  end do
+  !$cuf kernel do <<< (1,*), (2,*) >>> ! ok
+  do j = 1, 0
+  end do
   !ERROR: !$CUF KERNEL DO (1) must be followed by a DO construct with tightly nested outer levels of counted DO loops
   !$cuf kernel do <<< 1, 2 >>>
   do while (.false.)
diff --git a/flang/test/Semantics/declarations02.f90 b/flang/test/Semantics/declarations02.f90
index 439527a0edb6a..f39c233c1c3a4 100644
--- a/flang/test/Semantics/declarations02.f90
+++ b/flang/test/Semantics/declarations02.f90
@@ -10,6 +10,20 @@ module m
   integer, parameter :: x3 = 1
   bind(c) :: x3
 
+  !ERROR: 'x4' may not have both the ALLOCATABLE and PARAMETER attributes
+  !ERROR: 'x4' may not have both the ASYNCHRONOUS and PARAMETER attributes
+  !ERROR: 'x4' may not have both the SAVE and PARAMETER attributes
+  !ERROR: 'x4' may not have both the TARGET and PARAMETER attributes
+  !ERROR: 'x4' may not have both the VOLATILE and PARAMETER attributes
+  !ERROR: The entity 'x4' with an explicit SAVE attribute must be a variable, procedure pointer, or COMMON block
+  !ERROR: An entity may not have the ASYNCHRONOUS attribute unless it is a variable
+  integer, parameter :: x4 = 1
+  allocatable x4
+  asynchronous x4
+  save x4
+  target x4
+  volatile x4
+
   type :: my_type1
     integer :: x4
   end type
diff --git a/flang/test/Semantics/doconcurrent08.f90 b/flang/test/Semantics/doconcurrent08.f90
index 41cd71e233d0d..52b382741d073 100644
--- a/flang/test/Semantics/doconcurrent08.f90
+++ b/flang/test/Semantics/doconcurrent08.f90
@@ -209,6 +209,8 @@ module m2
   type :: impureFinal
    contains
     final :: impureSub
+    final :: impureSubRank1
+    final :: impureSubRank2
   end type
 
   type :: pureFinal
@@ -222,16 +224,27 @@ impure subroutine impureSub(x)
     type(impureFinal), intent(in) :: x
   end subroutine
 
+  impure subroutine impureSubRank1(x)
+    type(impureFinal), intent(in) :: x(:)
+  end subroutine
+
+  impure subroutine impureSubRank2(x)
+    type(impureFinal), intent(in) :: x(:,:)
+  end subroutine
+
   pure subroutine pureSub(x)
     type(pureFinal), intent(in) :: x
   end subroutine
 
   subroutine s4()
     type(impureFinal), allocatable :: ifVar, ifvar1
+    type(impureFinal), allocatable :: ifArr1(:), ifArr2(:,:)
+    type(impureFinal) :: if0
     type(pureFinal), allocatable :: pfVar
     allocate(ifVar)
     allocate(ifVar1)
     allocate(pfVar)
+    allocate(ifArr1(5), ifArr2(5,5))
 
     ! OK for an ordinary DO loop
     do i = 1,10
@@ -239,11 +252,9 @@ subroutine s4()
     end do
 
     ! OK to invoke a PURE FINAL procedure in a DO CONCURRENT
-    ! This case does not work currently because the compiler's test for
-    ! HasImpureFinal() in .../lib/Semantics/tools.cc doesn't work correctly
-!    do concurrent (i = 1:10)
-!      if (i .eq. 1) deallocate(pfVar)
-!    end do
+    do concurrent (i = 1:10)
+      if (i .eq. 1) deallocate(pfVar)
+    end do
 
     ! Error to invoke an IMPURE FINAL procedure in a DO CONCURRENT
     do concurrent (i = 1:10)
@@ -271,6 +282,34 @@ subroutine s4()
         ifvar = ifvar1
       end if
     end do
+
+    do concurrent (i = 1:5)
+      if (i .eq. 1) then
+        !ERROR: Deallocation of an entity with an IMPURE FINAL procedure 'impuresub' caused by assignment not allowed in DO CONCURRENT
+        ifArr1(i) = if0
+      end if
+    end do
+
+    do concurrent (i = 1:5)
+      if (i .eq. 1) then
+        !ERROR: Deallocation of an entity with an IMPURE FINAL procedure 'impuresubrank1' caused by assignment not allowed in DO CONCURRENT
+        ifArr1 = if0
+      end if
+    end do
+
+    do concurrent (i = 1:5)
+      if (i .eq. 1) then
+        !ERROR: Deallocation of an entity with an IMPURE FINAL procedure 'impuresubrank1' caused by assignment not allowed in DO CONCURRENT
+        ifArr2(i,:) = if0
+      end if
+    end do
+
+    do concurrent (i = 1:5)
+      if (i .eq. 1) then
+        !ERROR: Deallocation of an entity with an IMPURE FINAL procedure 'impuresubrank2' caused by assignment not allowed in DO CONCURRENT
+        ifArr2(:,:) = if0
+      end if
+    end do
   end subroutine s4
 
 end module m2
diff --git a/flang/test/Semantics/forall02.f90 b/flang/test/Semantics/forall02.f90
new file mode 100644
index 0000000000000..c4f4311a175a3
--- /dev/null
+++ b/flang/test/Semantics/forall02.f90
@@ -0,0 +1,67 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+
+module m1
+  type :: impureFinal
+  contains
+    final :: impureSub
+    final :: impureSubRank1
+    final :: impureSubRank2
+  end type
+
+ contains
+
+  impure subroutine impureSub(x)
+    type(impureFinal), intent(in) :: x
+  end subroutine
+
+  impure subroutine impureSubRank1(x)
+    type(impureFinal), intent(in) :: x(:)
+  end subroutine
+
+  impure subroutine impureSubRank2(x)
+    type(impureFinal), intent(in) :: x(:,:)
+  end subroutine
+
+  subroutine s1()
+    implicit none
+    integer :: i
+    type(impureFinal), allocatable :: ifVar, ifvar1
+    type(impureFinal), allocatable :: ifArr1(:), ifArr2(:,:)
+    type(impureFinal) :: if0
+    integer a(10)
+    allocate(ifVar)
+    allocate(ifVar1)
+    allocate(ifArr1(5), ifArr2(5,5))
+
+    ! Error to invoke an IMPURE FINAL procedure in a FORALL
+    forall (i = 1:10)
+      !WARNING: FORALL index variable 'i' not used on left-hand side of assignment
+      !ERROR: Impure procedure 'impuresub' is referenced by finalization in a FORALL
+      ifvar = ifvar1
+    end forall
+
+    forall (i = 1:5)
+      !ERROR: Impure procedure 'impuresub' is referenced by finalization in a FORALL
+      ifArr1(i) = if0
+    end forall
+
+    forall (i = 1:5)
+      !WARNING: FORALL index variable 'i' not used on left-hand side of assignment
+      !ERROR: Impure procedure 'impuresubrank1' is referenced by finalization in a FORALL
+      ifArr1 = if0
+    end forall
+
+    forall (i = 1:5)
+      !ERROR: Impure procedure 'impuresubrank1' is referenced by finalization in a FORALL
+      ifArr2(i,:) = if0
+    end forall
+
+    forall (i = 1:5)
+      !WARNING: FORALL index variable 'i' not used on left-hand side of assignment
+      !ERROR: Impure procedure 'impuresubrank2' is referenced by finalization in a FORALL
+      ifArr2(:,:) = if0
+    end forall
+  end subroutine
+
+end module m1
+
diff --git a/flang/test/Semantics/modfile63.f90 b/flang/test/Semantics/modfile63.f90
index aaf1f7beaa48f..0783121017243 100644
--- a/flang/test/Semantics/modfile63.f90
+++ b/flang/test/Semantics/modfile63.f90
@@ -1,12 +1,10 @@
 ! RUN: %flang_fc1 -fsyntax-only -I%S/Inputs/dir1 %s
 ! RUN: not %flang_fc1 -fsyntax-only -I%S/Inputs/dir2 %s 2>&1 | FileCheck --check-prefix=ERROR %s
 ! RUN: %flang_fc1 -Werror -fsyntax-only -I%S/Inputs/dir1 -I%S/Inputs/dir2 %s
-! RUN: not %flang_fc1 -Werror -fsyntax-only -I%S/Inputs/dir2 -I%S/Inputs/dir1 %s 2>&1 | FileCheck  --check-prefix=WARNING %s
 
 ! Inputs/dir1 and Inputs/dir2 each have identical copies of modfile63b.mod.
 ! modfile63b.mod depends on Inputs/dir1/modfile63a.mod - the version in
-! Inputs/dir2/modfile63a.mod has a distinct checksum and should be
-! ignored with a warning.
+! Inputs/dir2/modfile63a.mod has a distinct checksum.
 
 ! If it becomes necessary to recompile those modules, just use the
 ! module files as Fortran source.
@@ -15,5 +13,4 @@
 call s2
 end
 
-! ERROR: Could not find a module file for 'modfile63a' in the module search path with the expected checksum
-! WARNING: Module file for 'modfile63a' appears later in the module search path than conflicting modules with different checksums
+! ERROR: Cannot read module file for module 'modfile63a': File is not the right module file for 'modfile63a':
diff --git a/flang/test/Semantics/resolve11.f90 b/flang/test/Semantics/resolve11.f90
index 33ce88342b49b..db508f062d1d1 100644
--- a/flang/test/Semantics/resolve11.f90
+++ b/flang/test/Semantics/resolve11.f90
@@ -49,3 +49,40 @@ logical function gt(x, y)
   !ERROR: The accessibility of 'OPERATOR(.GT.)' has already been specified as PUBLIC
   private :: operator(.gt.)
 end
+
+module m4
+  private
+  type, public :: foo
+  end type
+  interface foo
+    procedure fun
+  end interface
+ contains
+  function fun
+  end
+end
+
+subroutine s4
+  !ERROR: 'fun' is PRIVATE in 'm4'
+  use m4, only: foo, fun
+  type(foo) x ! ok
+  print *, foo() ! ok
+end
+
+module m5
+  public
+  type, private :: foo
+  end type
+  interface foo
+    procedure fun
+  end interface
+ contains
+  function fun
+  end
+end
+
+subroutine s5
+  !ERROR: 'foo' is PRIVATE in 'm5'
+  use m5, only: foo, fun
+  print *, fun() ! ok
+end
diff --git a/flang/test/Semantics/separate-mp05.f90 b/flang/test/Semantics/separate-mp05.f90
index 5b7e2523a2286..ad002142fd285 100644
--- a/flang/test/Semantics/separate-mp05.f90
+++ b/flang/test/Semantics/separate-mp05.f90
@@ -7,8 +7,10 @@ module m
   !DEF: /m/smp MODULE, PUBLIC, PURE (Function) Subprogram REAL(4)
   !DEF: /m/smp/f EXTERNAL, PURE (Function) Subprogram REAL(4)
   !DEF: /m/smp/x INTENT(IN) ObjectEntity REAL(4)
-  !DEF: /m/smp/res (Implicit) ObjectEntity REAL(4)
+  !DEF: /m/smp/res ObjectEntity REAL(4)
   pure module function smp(f, x) result(res)
+  !REF: /m/smp/res
+  real res
    interface
     !REF: /m/smp/f
     !DEF: /m/smp/f/x INTENT(IN) ObjectEntity REAL(4)
@@ -32,7 +34,7 @@ pure function f(x) result(r)
 contains
  !DEF: /m/sm/smp MODULE, PUBLIC, PURE (Function) Subprogram REAL(4)
  module procedure smp
-  !DEF: /m/sm/smp/res (Implicit) ObjectEntity REAL(4)
+  !DEF: /m/sm/smp/res ObjectEntity REAL(4)
   !DEF: /m/sm/smp/f EXTERNAL, PURE (Function) Subprogram REAL(4)
   !DEF: /m/sm/smp/x INTENT(IN) ObjectEntity REAL(4)
   res = f(x)
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index 73d740ff43942..a0870d3649c2e 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -191,11 +191,6 @@ static llvm::cl::opt<bool> enableOpenACC("fopenacc",
                                          llvm::cl::desc("enable openacc"),
                                          llvm::cl::init(false));
 
-static llvm::cl::opt<bool> enablePolymorphic(
-    "polymorphic-type",
-    llvm::cl::desc("enable polymorphic type lowering (experimental)"),
-    llvm::cl::init(false));
-
 static llvm::cl::opt<bool> enableNoPPCNativeVecElemOrder(
     "fno-ppc-native-vector-element-order",
     llvm::cl::desc("no PowerPC native vector element order."),
@@ -351,7 +346,6 @@ static mlir::LogicalResult convertFortranSourceToMLIR(
   std::string targetTriple = targetMachine.getTargetTriple().normalize();
   // Use default lowering options for bbc.
   Fortran::lower::LoweringOptions loweringOptions{};
-  loweringOptions.setPolymorphicTypeImpl(enablePolymorphic);
   loweringOptions.setNoPPCNativeVecElemOrder(enableNoPPCNativeVecElemOrder);
   loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR);
   std::vector<Fortran::lower::EnvironmentDefault> envDefaults = {};
diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt
index ba6c6642c0b62..a249583168313 100644
--- a/flang/tools/f18/CMakeLists.txt
+++ b/flang/tools/f18/CMakeLists.txt
@@ -63,7 +63,7 @@ if (NOT CMAKE_CROSSCOMPILING)
       COMMAND ${CMAKE_COMMAND} -E make_directory ${FLANG_INTRINSIC_MODULES_DIR}
       COMMAND flang-new -cpp -fsyntax-only ${opts} -module-dir ${FLANG_INTRINSIC_MODULES_DIR}
         ${FLANG_SOURCE_DIR}/module/${filename}.f90
-      DEPENDS flang-new ${FLANG_SOURCE_DIR}/module/${filename}.f90 ${depends}
+      DEPENDS flang-new ${FLANG_SOURCE_DIR}/module/${filename}.f90 ${FLANG_SOURCE_DIR}/module/__fortran_builtins.f90 ${depends}
     )
     add_custom_command(OUTPUT ${base}.f18.mod
       DEPENDS ${base}.mod
diff --git a/flang/unittests/Runtime/NumericalFormatTest.cpp b/flang/unittests/Runtime/NumericalFormatTest.cpp
index 37eecd7708a1e..dee4dda4a2286 100644
--- a/flang/unittests/Runtime/NumericalFormatTest.cpp
+++ b/flang/unittests/Runtime/NumericalFormatTest.cpp
@@ -902,6 +902,14 @@ TEST(IOApiTests, EditDoubleInputValues) {
           0}, // max finite
       {"(EX22.0)", "0X.8P1025             ", 0x7ff0000000000000, ovf}, // +Inf
       {"(EX22.0)", "-0X.8P1025            ", 0xfff0000000000000, ovf}, // -Inf
+      {"(RC,EX22.0)", "0X1.0000000000000P0   ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X1.00000000000008P0  ", 0x3ff0000000000001, 0},
+      {"(RC,EX22.0)", "0X1.000000000000008P0 ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X1.00000000000004P0  ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X.80000000000000P1   ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X.80000000000004P1   ", 0x3ff0000000000001, 0},
+      {"(RC,EX22.0)", "0X.800000000000004P1  ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X.80000000000002P1   ", 0x3ff0000000000000, 0},
       {"(RZ,F7.0)", " 2.e308", 0x7fefffffffffffff, 0}, // +HUGE()
       {"(RD,F7.0)", " 2.e308", 0x7fefffffffffffff, 0}, // +HUGE()
       {"(RU,F7.0)", " 2.e308", 0x7ff0000000000000, ovf}, // +Inf
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 6edf5c656193d..7afb3c5f0faa9 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -134,7 +134,6 @@ option(LLVM_LIBC_FULL_BUILD "Build and test LLVM libc as if it is the full libc"
 option(LLVM_LIBC_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR "Build LLVM libc tests assuming our implementation-defined behavior" ON)
 option(LLVM_LIBC_ENABLE_LINTING "Enables linting of libc source files" OFF)
 
-option(LIBC_GPU_BUILD "Build libc for the GPU. All CPU build options will be ignored." OFF)
 set(LIBC_TARGET_TRIPLE "" CACHE STRING "The target triple for the libc build.")
 
 option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." OFF)
diff --git a/libc/config/baremetal/api.td b/libc/config/baremetal/api.td
index 80d0e0ba22ca5..25aa06aacb642 100644
--- a/libc/config/baremetal/api.td
+++ b/libc/config/baremetal/api.td
@@ -52,6 +52,14 @@ def IntTypesAPI : PublicAPI<"inttypes.h"> {
   let Types = ["imaxdiv_t"];
 }
 
+def MathAPI : PublicAPI<"math.h"> {
+  let Types = ["double_t", "float_t"];
+}
+
+def StdIOAPI : PublicAPI<"stdio.h"> {
+  let Types = ["size_t"];
+}
+
 def StdlibAPI : PublicAPI<"stdlib.h"> {
   let Types = [
     "div_t",
@@ -60,7 +68,6 @@ def StdlibAPI : PublicAPI<"stdlib.h"> {
     "size_t",
     "__bsearchcompare_t",
     "__qsortcompare_t",
-    "__atexithandler_t",
   ];
 }
 
@@ -68,6 +75,15 @@ def StringAPI : PublicAPI<"string.h"> {
   let Types = ["size_t"];
 }
 
+def TimeAPI : PublicAPI<"time.h"> {
+  let Types = [
+    "clock_t",
+    "time_t",
+    "struct tm",
+    "struct timespec",
+  ];
+}
+
 def UCharAPI : PublicAPI<"uchar.h"> {
   let Types = ["mbstate_t"];
 }
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index 589ec5237e98d..17ce56e228a6a 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -79,6 +79,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.inttypes.strtoumax
 
     # stdio.h entrypoints
+    libc.src.stdio.remove
     libc.src.stdio.sprintf
     libc.src.stdio.snprintf
     libc.src.stdio.vsprintf
@@ -170,7 +171,6 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.llabs
     libc.src.stdlib.lldiv
     libc.src.stdlib.qsort
-    libc.src.stdlib.qsort_r
     libc.src.stdlib.rand
     libc.src.stdlib.srand
     libc.src.stdlib.strtod
@@ -180,6 +180,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.strtoll
     libc.src.stdlib.strtoul
     libc.src.stdlib.strtoull
+
+    # time.h entrypoints
+    libc.src.time.difftime
 )
 
 set(TARGET_LIBM_ENTRYPOINTS
diff --git a/libc/config/baremetal/arm/headers.txt b/libc/config/baremetal/arm/headers.txt
index 962981f8a2087..3608364e45bde 100644
--- a/libc/config/baremetal/arm/headers.txt
+++ b/libc/config/baremetal/arm/headers.txt
@@ -13,5 +13,6 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.string
     libc.include.strings
     libc.include.sys_queue
+    libc.include.time
     libc.include.uchar
 )
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index 09de1b416e0e9..39756e1ee29f5 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -79,6 +79,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.inttypes.strtoumax
 
     # stdio.h entrypoints
+    libc.src.stdio.remove
     libc.src.stdio.sprintf
     libc.src.stdio.snprintf
     libc.src.stdio.vsprintf
@@ -170,7 +171,6 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.llabs
     libc.src.stdlib.lldiv
     libc.src.stdlib.qsort
-    libc.src.stdlib.qsort_r
     libc.src.stdlib.rand
     libc.src.stdlib.srand
     libc.src.stdlib.strtod
@@ -180,6 +180,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.strtoll
     libc.src.stdlib.strtoul
     libc.src.stdlib.strtoull
+
+    # time.h entrypoints
+    libc.src.time.difftime
 )
 
 set(TARGET_LIBM_ENTRYPOINTS
diff --git a/libc/config/baremetal/riscv/headers.txt b/libc/config/baremetal/riscv/headers.txt
index 962981f8a2087..3608364e45bde 100644
--- a/libc/config/baremetal/riscv/headers.txt
+++ b/libc/config/baremetal/riscv/headers.txt
@@ -13,5 +13,6 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.string
     libc.include.strings
     libc.include.sys_queue
+    libc.include.time
     libc.include.uchar
 )
diff --git a/libc/config/gpu/api.td b/libc/config/gpu/api.td
index 607b8b6d5900c..adaf5bfd747ac 100644
--- a/libc/config/gpu/api.td
+++ b/libc/config/gpu/api.td
@@ -63,7 +63,6 @@ def StdIOAPI : PublicAPI<"stdio.h"> {
     SimpleMacroDef<"_IOFBF", "0">,
     SimpleMacroDef<"_IOLBF", "1">,
     SimpleMacroDef<"_IONBF", "2">,
-    SimpleMacroDef<"EOF", "-1">,
   ];
   let Types = ["size_t", "FILE"];
 }
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 7d69099e3cb9d..dbf81c284e784 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -216,6 +216,8 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.mman.mlockall
     libc.src.sys.mman.munlockall
     libc.src.sys.mman.msync
+    libc.src.sys.mman.shm_open
+    libc.src.sys.mman.shm_unlink
 
     # sys/random.h entrypoints
     libc.src.sys.random.getrandom
@@ -415,9 +417,15 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.nextafter
     libc.src.math.nextafterf
     libc.src.math.nextafterl
+    libc.src.math.nextdown
+    libc.src.math.nextdownf
+    libc.src.math.nextdownl
     libc.src.math.nexttoward
     libc.src.math.nexttowardf
     libc.src.math.nexttowardl
+    libc.src.math.nextup
+    libc.src.math.nextupf
+    libc.src.math.nextupl
     libc.src.math.powf
     libc.src.math.remainderf
     libc.src.math.remainder
@@ -468,7 +476,10 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.lrintf128
     libc.src.math.lroundf128
     libc.src.math.modff128
+    libc.src.math.nanf128
     libc.src.math.nextafterf128
+    libc.src.math.nextdownf128
+    libc.src.math.nextupf128
     libc.src.math.rintf128
     libc.src.math.roundf128
     libc.src.math.sqrtf128
@@ -539,6 +550,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.ferror_unlocked
     libc.src.stdio.fgetc
     libc.src.stdio.fflush
+    libc.src.stdio.fileno
     libc.src.stdio.fopen
     libc.src.stdio.fputc
     libc.src.stdio.fputs
diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td
index 75432a2a29865..e9e82c5d47894 100644
--- a/libc/config/linux/api.td
+++ b/libc/config/linux/api.td
@@ -76,7 +76,6 @@ def StdIOAPI : PublicAPI<"stdio.h"> {
     SimpleMacroDef<"_IOFBF", "0">,
     SimpleMacroDef<"_IOLBF", "1">,
     SimpleMacroDef<"_IONBF", "2">,
-    SimpleMacroDef<"EOF", "-1">,
   ];
   let Types = ["size_t", "FILE", "cookie_io_functions_t"];
 }
@@ -118,7 +117,7 @@ def SchedAPI : PublicAPI<"sched.h"> {
 }
 
 def SysMManAPI : PublicAPI<"sys/mman.h"> {
-  let Types = ["off_t", "size_t"];
+  let Types = ["off_t", "size_t", "mode_t"];
 }
 
 def SignalAPI : PublicAPI<"signal.h"> {
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index bf1559b2f0236..3bc5d8efc9d26 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -285,9 +285,15 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.nextafter
     libc.src.math.nextafterf
     libc.src.math.nextafterl
+    libc.src.math.nextdown
+    libc.src.math.nextdownf
+    libc.src.math.nextdownl
     libc.src.math.nexttoward
     libc.src.math.nexttowardf
     libc.src.math.nexttowardl
+    libc.src.math.nextup
+    libc.src.math.nextupf
+    libc.src.math.nextupl
     libc.src.math.powf
     libc.src.math.remainder
     libc.src.math.remainderf
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index b1c9dd0428eea..b42a55a4d712e 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -221,6 +221,8 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.mman.mlockall
     libc.src.sys.mman.munlockall
     libc.src.sys.mman.msync
+    libc.src.sys.mman.shm_open
+    libc.src.sys.mman.shm_unlink
 
     # sys/random.h entrypoints
     libc.src.sys.random.getrandom
@@ -423,9 +425,15 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.nextafter
     libc.src.math.nextafterf
     libc.src.math.nextafterl
+    libc.src.math.nextdown
+    libc.src.math.nextdownf
+    libc.src.math.nextdownl
     libc.src.math.nexttoward
     libc.src.math.nexttowardf
     libc.src.math.nexttowardl
+    libc.src.math.nextup
+    libc.src.math.nextupf
+    libc.src.math.nextupl
     libc.src.math.powf
     libc.src.math.remainderf
     libc.src.math.remainder
@@ -476,7 +484,10 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.lrintf128
     libc.src.math.lroundf128
     libc.src.math.modff128
+    libc.src.math.nanf128
     libc.src.math.nextafterf128
+    libc.src.math.nextdownf128
+    libc.src.math.nextupf128
     libc.src.math.rintf128
     libc.src.math.roundf128
     libc.src.math.sqrtf128
@@ -562,6 +573,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.fgetc_unlocked
     libc.src.stdio.fgets
     libc.src.stdio.fflush
+    libc.src.stdio.fileno
     libc.src.stdio.fopen
     libc.src.stdio.fputc
     libc.src.stdio.fputs
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 4fb31c593b9dc..e8cf11266624a 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -208,6 +208,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.sscanf
     libc.src.stdio.scanf
     libc.src.stdio.fscanf
+    libc.src.stdio.fileno
 
     # sys/epoll.h entrypoints
     libc.src.sys.epoll.epoll_wait
@@ -229,6 +230,8 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.mman.mlockall
     libc.src.sys.mman.munlockall
     libc.src.sys.mman.msync
+    libc.src.sys.mman.shm_open
+    libc.src.sys.mman.shm_unlink
 
     # sys/random.h entrypoints
     libc.src.sys.random.getrandom
@@ -426,9 +429,15 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.nextafter
     libc.src.math.nextafterf
     libc.src.math.nextafterl
+    libc.src.math.nextdown
+    libc.src.math.nextdownf
+    libc.src.math.nextdownl
     libc.src.math.nexttoward
     libc.src.math.nexttowardf
     libc.src.math.nexttowardl
+    libc.src.math.nextup
+    libc.src.math.nextupf
+    libc.src.math.nextupl
     libc.src.math.powf
     libc.src.math.remainderf
     libc.src.math.remainder
@@ -481,7 +490,10 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.lrintf128
     libc.src.math.lroundf128
     libc.src.math.modff128
+    libc.src.math.nanf128
     libc.src.math.nextafterf128
+    libc.src.math.nextdownf128
+    libc.src.math.nextupf128
     libc.src.math.rintf128
     libc.src.math.roundf128
     libc.src.math.sqrtf128
diff --git a/libc/docs/dev/printf_behavior.rst b/libc/docs/dev/printf_behavior.rst
index 00d6c83f4b0d3..9548bfda57aa7 100644
--- a/libc/docs/dev/printf_behavior.rst
+++ b/libc/docs/dev/printf_behavior.rst
@@ -1,3 +1,5 @@
+.. _printf_behavior:
+
 ====================================
 Printf Behavior Under All Conditions
 ====================================
diff --git a/libc/docs/dev/undefined_behavior.rst b/libc/docs/dev/undefined_behavior.rst
index 6e73a305e8e05..50e8bdde89ddd 100644
--- a/libc/docs/dev/undefined_behavior.rst
+++ b/libc/docs/dev/undefined_behavior.rst
@@ -70,3 +70,14 @@ Design Decisions
 Resizable Tables for hsearch
 ----------------------------
 The POSIX.1 standard does not delineate the behavior consequent to invoking hsearch or hdestroy without prior initialization of the hash table via hcreate. Furthermore, the standard does not specify the outcomes of successive invocations of hsearch absent intervening hdestroy calls. Libraries such as MUSL and Glibc do not apply checks to these scenarios, potentially leading to memory corruption or leakage. Conversely, FreeBSD's libc and Bionic automatically initialize the hash table to a minimal size if it is found uninitialized, and proceeding to destroy the table only if initialization has occurred. This approach also avoids redundant table allocation if an initialized hash table is already present. Given that the hash table starts with a minimal size, resizing becomes necessary to accommodate additional user insertions. LLVM's libc mirrors the approach of FreeBSD's libc and Bionic, owing to its enhanced robustness and user-friendliness. Notably, such resizing behavior itself aligns with POSIX.1 standards, which explicitly permit implementations to modify the capacity of the hash table.
+
+Path without Leading Slashs in shm_open
+----------------------------------------
+POSIX.1 leaves that when the name of a shared memory object does not begin with a slash, the behavior is implementation defined. In such cases, the shm_open in LLVM libc is implemented to behave as if the name began with a slash.
+
+Handling of NULL arguments to the 's' format specifier
+------------------------------------------------------
+The C standard does not specify behavior for ``printf("%s", NULL)``. We will
+print the string literal ``(null)`` unless using the 
+``LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS`` option described in :ref:`printf 
+behavior<printf_behavior>`.
diff --git a/libc/docs/fullbuild_mode.rst b/libc/docs/fullbuild_mode.rst
index 0af923de42f84..b1151017fbc79 100644
--- a/libc/docs/fullbuild_mode.rst
+++ b/libc/docs/fullbuild_mode.rst
@@ -6,7 +6,7 @@ Fullbuild Mode
 
 The *fullbuild* mode of LLVM's libc is the mode in which it is to be used as
 the only libc (as opposed to the :ref:`overlay_mode` in which it is used along
-with the system libc.) In to order use it as the only libc, one will have to
+with the system libc.) In order to use it as the only libc, one will have to
 build and install not only the static archives like ``libc.a`` from LLVM's libc,
 but also the start-up objects like ``crt1.o`` and the public headers.
 
diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst
index dab21e1324d28..6d94134a407d3 100644
--- a/libc/docs/gpu/building.rst
+++ b/libc/docs/gpu/building.rst
@@ -220,11 +220,15 @@ targets. This section will briefly describe their purpose.
   be used to enable host services for anyone looking to interface with the
   :ref:`RPC client<libc_gpu_rpc>`.
 
+.. _gpu_cmake_options:
+
 CMake options
 =============
 
 This section briefly lists a few of the CMake variables that specifically
-control the GPU build of the C library.
+control the GPU build of the C library. These options can be passed individually
+to each target using ``-DRUNTIMES_<target>_<variable>=<value>`` when using a
+standard runtime build.
 
 **LLVM_LIBC_FULL_BUILD**:BOOL
   This flag controls whether or not the libc build will generate its own
diff --git a/libc/docs/gpu/testing.rst b/libc/docs/gpu/testing.rst
index 9842a67528361..9f17159fb6d5e 100644
--- a/libc/docs/gpu/testing.rst
+++ b/libc/docs/gpu/testing.rst
@@ -1,9 +1,9 @@
 .. _libc_gpu_testing:
 
 
-============================
-Testing the GPU libc library
-============================
+=========================
+Testing the GPU C library
+=========================
 
 .. note::
    Running GPU tests with high parallelism is likely to cause spurious failures,
@@ -14,24 +14,134 @@ Testing the GPU libc library
   :depth: 4
   :local:
 
-Testing Infrastructure
+Testing infrastructure
 ======================
 
-The testing support in LLVM's libc implementation for GPUs is designed to mimic
-the standard unit tests as much as possible. We use the :ref:`libc_gpu_rpc`
-support to provide the necessary utilities like printing from the GPU. Execution
-is performed by emitting a ``_start`` kernel from the GPU
-that is then called by an external loader utility. This is an example of how
-this can be done manually:
+The LLVM C library supports different kinds of :ref:`tests <build_and_test>`
+depending on the build configuration. The GPU target is considered a full build
+and therefore provides all of its own utilities to build and run the generated
+tests. Currently the GPU supports two kinds of tests.
+
+#. **Hermetic tests** - These are unit tests built with a test suite similar to
+   Google's ``gtest`` infrastructure. These use the same infrastructure as unit
+   tests except that the entire environment is self-hosted. This allows us to
+   run them on the GPU using our custom utilities. These are used to test the
+   majority of functional implementations.
+
+#. **Integration tests** - These are lightweight tests that simply call a
+   ``main`` function and checks if it returns non-zero. These are primarily used
+   to test interfaces that are sensitive to threading.
+
+The GPU uses the same testing infrastructure as the other supported ``libc``
+targets. We do this by treating the GPU as a standard hosted environment capable
+of launching a ``main`` function. Effectively, this means building our own
+startup libraries and loader.
+
+Testing utilities
+=================
+
+We provide two utilities to execute arbitrary programs on the GPU. That is the
+``loader`` and the ``start`` object.
+
+Startup object
+--------------
+
+This object mimics the standard object used by existing C library
+implementations. Its job is to perform the necessary setup prior to calling the
+``main`` function. In the GPU case, this means exporting GPU kernels that will
+perform the necessary operations. Here we use ``_begin`` and ``_end`` to handle
+calling global constructors and destructors while ``_start`` begins the standard
+execution. The following code block shows the implementation for AMDGPU
+architectures.
+
+.. code-block:: c++
+
+  extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+  _begin(int argc, char **argv, char **env) {
+    LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks);
+    LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
+  }
+
+  extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+  _start(int argc, char **argv, char **envp, int *ret) {
+    __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+  }
+
+  extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+  _end(int retval) {
+    LIBC_NAMESPACE::exit(retval);
+  }
+
+Loader runtime
+--------------
+
+The startup object provides a GPU executable with callable kernels for the
+respective runtime. We can then define a minimal runtime that will launch these
+kernels on the given device. Currently we provide the ``amdhsa-loader`` and
+``nvptx-loader`` targeting the AMD HSA runtime and CUDA driver runtime
+respectively. By default these will launch with a single thread on the GPU.
 
 .. code-block:: sh
 
-   $> clang++ crt1.o test.cpp --target=amdgcn-amd-amdhsa -mcpu=gfx90a -flto
-   $> ./amdhsa_loader --threads 1 --blocks 1 a.out
+   $> clang++ crt1.o test.cpp --target=amdgcn-amd-amdhsa -mcpu=native -flto
+   $> amdhsa_loader --threads 1 --blocks 1 ./a.out
    Test Passed!
 
-Unlike the exported ``libcgpu.a``, the testing architecture can only support a
-single architecture at a time. This is either detected automatically, or set
-manually by the user using ``LIBC_GPU_TEST_ARCHITECTURE``. The latter is useful
-in cases where the user does not build LLVM's libc on machine with the GPU to
-use for testing.
+The loader utility will forward any arguments passed after the executable image
+to the program on the GPU as well as any set environment variables. The number
+of threads and blocks to be set can be controlled with ``--threads`` and
+``--blocks``. These also accept additional ``x``, ``y``, ``z`` variants for
+multidimensional grids.
+
+Running tests
+=============
+
+Tests will only be built and run if a GPU target architecture is set and the
+corresponding loader utility was built. These can be overridden with the
+``LIBC_GPU_TEST_ARCHITECTURE`` and ``LIBC_GPU_LOADER_EXECUTABLE`` :ref:`CMake
+options <gpu_cmake_options>`. Once built, they can be run like any other tests.
+The CMake target depends on how the library was built.
+
+#. **Cross build** - If the C library was built using ``LLVM_ENABLE_PROJECTS``
+   or a runtimes cross build, then the standard targets will be present in the
+   base CMake build directory.
+
+   #. All tests - You can run all supported tests with the command:
+
+      .. code-block:: sh
+
+        $> ninja check-libc
+
+   #. Hermetic tests - You can run hermetic with tests the command:
+
+      .. code-block:: sh
+
+        $> ninja libc-hermetic-tests
+
+   #. Integration tests - You can run integration tests by the command:
+
+      .. code-block:: sh
+
+        $> ninja libc-integration-tests
+
+#. **Runtimes build** - If the library was built using ``LLVM_ENABLE_RUNTIMES``
+   then the actual ``libc`` build will be in a separate directory.
+
+   #. All tests - You can run all supported tests with the command:
+
+      .. code-block:: sh
+
+        $> ninja check-libc-amdgcn-amd-amdhsa
+        $> ninja check-libc-nvptx64-nvidia-cuda
+
+   #. Specific tests - You can use the same targets as above by entering the
+      runtimes build directory.
+
+      .. code-block:: sh
+
+        $> ninja -C runtimes/runtimes-amdgcn-amd-amdhsa-bins check-libc
+        $> ninja -C runtimes/runtimes-nvptx64-nvidia-cuda-bins check-libc
+        $> cd runtimes/runtimes-amdgcn-amd-amdhsa-bins && ninja check-libc
+        $> cd runtimes/runtimes-nvptx64-nvidia-cuda-bins && ninja check-libc
+
+Tests can also be built and run manually using the respective loader utility.
diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst
index 11a00cd620d86..1a9446eeb1130 100644
--- a/libc/docs/gpu/using.rst
+++ b/libc/docs/gpu/using.rst
@@ -159,17 +159,21 @@ GPUs.
   }
 
 We can then compile this for both NVPTX and AMDGPU into LLVM-IR using the
-following commands.
+following commands. This will yield valid LLVM-IR for the given target just like
+if we were using CUDA, OpenCL, or OpenMP.
 
 .. code-block:: sh
 
   $> clang id.c --target=amdgcn-amd-amdhsa -mcpu=native -nogpulib -flto -c
   $> clang id.c --target=nvptx64-nvidia-cuda -march=native -nogpulib -flto -c
 
-We use this support to treat the GPU as a hosted environment by providing a C
-library and startup object just like a standard C library running on the host
-machine. Then, in order to execute these programs, we provide a loader utility
-to launch the executable on the GPU similar to a cross-compiling emulator.
+We can also use this support to treat the GPU as a hosted environment by
+providing a C library and startup object just like a standard C library running
+on the host machine. Then, in order to execute these programs, we provide a
+loader utility to launch the executable on the GPU similar to a cross-compiling
+emulator. This is how we run :ref:`unit tests <libc_gpu_testing>` targeting the
+GPU. This is clearly not the most efficient way to use a GPU, but it provides a
+simple method to test execution on a GPU for debugging or development.
 
 Building for AMDGPU targets
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index ed54a7d091ecb..d337d060fb5dd 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -259,6 +259,8 @@ Basic Operations
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nanl         | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nanf128      | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nearbyint    | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nearbyintf   | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
@@ -273,12 +275,28 @@ Basic Operations
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nextafterf128| |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextdown     | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextdownf    | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextdownl    | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextdownf128 | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nexttoward   | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nexttowardf  | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nexttowardl  | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextup       | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextupf      | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextupl      | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextupf128   | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | remainder    | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | remainderf   | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
diff --git a/libc/fuzzing/CMakeLists.txt b/libc/fuzzing/CMakeLists.txt
index a3ef888167ee3..82487688af116 100644
--- a/libc/fuzzing/CMakeLists.txt
+++ b/libc/fuzzing/CMakeLists.txt
@@ -1,7 +1,8 @@
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer")
 add_custom_target(libc-fuzzer)
 
-add_subdirectory(math)
+# TODO(#85680): Re-enable math fuzzing after headers are sorted out
+# add_subdirectory(math)
 add_subdirectory(stdlib)
 add_subdirectory(stdio)
 add_subdirectory(string)
diff --git a/libc/fuzzing/math/RemQuoDiff.h b/libc/fuzzing/math/RemQuoDiff.h
index 2a96224961767..95a9866f29dbb 100644
--- a/libc/fuzzing/math/RemQuoDiff.h
+++ b/libc/fuzzing/math/RemQuoDiff.h
@@ -11,7 +11,7 @@
 
 #include "src/__support/FPUtil/FPBits.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/libc/fuzzing/math/SingleInputSingleOutputDiff.h b/libc/fuzzing/math/SingleInputSingleOutputDiff.h
index 8c1e4b0d0290e..bdf54ee9af6c4 100644
--- a/libc/fuzzing/math/SingleInputSingleOutputDiff.h
+++ b/libc/fuzzing/math/SingleInputSingleOutputDiff.h
@@ -10,9 +10,7 @@
 #define LLVM_LIBC_FUZZING_MATH_SINGLE_INPUT_SINGLE_OUTPUT_DIFF_H
 
 #include "fuzzing/math/Compare.h"
-#include "src/__support/FPUtil/FPBits.h"
 
-#include <math.h>
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/libc/fuzzing/math/TwoInputSingleOutputDiff.h b/libc/fuzzing/math/TwoInputSingleOutputDiff.h
index e3e3fe703b139..f017dbd63f4af 100644
--- a/libc/fuzzing/math/TwoInputSingleOutputDiff.h
+++ b/libc/fuzzing/math/TwoInputSingleOutputDiff.h
@@ -10,9 +10,7 @@
 #define LLVM_LIBC_FUZZING_MATH_TWO_INPUT_SINGLE_OUTPUT_DIFF_H
 
 #include "fuzzing/math/Compare.h"
-#include "src/__support/FPUtil/FPBits.h"
 
-#include <math.h>
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
index b773043bda67d..b000321854d16 100644
--- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
+++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
@@ -16,7 +16,7 @@
 
 #include "src/__support/FPUtil/FPBits.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/libc/include/llvm-libc-macros/stdio-macros.h b/libc/include/llvm-libc-macros/stdio-macros.h
index db747c5d5d675..4664801c5731f 100644
--- a/libc/include/llvm-libc-macros/stdio-macros.h
+++ b/libc/include/llvm-libc-macros/stdio-macros.h
@@ -9,6 +9,10 @@
 #ifndef LLVM_LIBC_MACROS_STDIO_MACROS_H
 #define LLVM_LIBC_MACROS_STDIO_MACROS_H
 
+#ifndef EOF
+#define EOF (-1)
+#endif
+
 #define BUFSIZ 1024
 
 #endif // LLVM_LIBC_MACROS_STDIO_MACROS_H
diff --git a/libc/spec/posix.td b/libc/spec/posix.td
index 591919aac95dd..3b793ea90ffd3 100644
--- a/libc/spec/posix.td
+++ b/libc/spec/posix.td
@@ -246,6 +246,7 @@ def POSIX : StandardSpec<"POSIX"> {
       [
         SizeTType,
         OffTType,
+        ModeTType,
       ],
       [], // Enumerations
       [
@@ -310,6 +311,16 @@ def POSIX : StandardSpec<"POSIX"> {
           RetValSpec<IntType>,
           [ArgSpec<VoidPtr>, ArgSpec<SizeTType>, ArgSpec<IntType>]
         >,
+        FunctionSpec<
+          "shm_open",
+          RetValSpec<IntType>,
+          [ArgSpec<ConstCharPtr>, ArgSpec<IntType>, ArgSpec<ModeTType>]
+        >,
+        FunctionSpec<
+          "shm_unlink",
+          RetValSpec<IntType>,
+          [ArgSpec<ConstCharPtr>]
+        >,
       ]
   >;
 
@@ -1158,6 +1169,11 @@ def POSIX : StandardSpec<"POSIX"> {
               RetValSpec<IntType>,
               [ArgSpec<VoidType>]
           >,
+          FunctionSpec<
+            "fileno",
+            RetValSpec<IntType>,
+            [ArgSpec<FILEPtr>]
+          >,
       ]
   >;
 
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index afe01b1bb6856..84d28cc335030 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -536,6 +536,16 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"nexttoward", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<LongDoubleType>]>,
           FunctionSpec<"nexttowardl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
 
+          FunctionSpec<"nextdown", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
+          FunctionSpec<"nextdownf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+          FunctionSpec<"nextdownl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"nextdownf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+
+          FunctionSpec<"nextup", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
+          FunctionSpec<"nextupf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+          FunctionSpec<"nextupl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"nextupf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+
           FunctionSpec<"powf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
           FunctionSpec<"pow", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
 
@@ -559,6 +569,7 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"nanf", RetValSpec<FloatType>, [ArgSpec<ConstCharPtr>]>,
           FunctionSpec<"nan", RetValSpec<DoubleType>, [ArgSpec<ConstCharPtr>]>,
           FunctionSpec<"nanl", RetValSpec<LongDoubleType>, [ArgSpec<ConstCharPtr>]>,
+          GuardedFunctionSpec<"nanf128", RetValSpec<Float128Type>, [ArgSpec<ConstCharPtr>], "LIBC_TYPES_HAS_FLOAT128">,
       ]
   >;
 
diff --git a/libc/src/__support/CPP/array.h b/libc/src/__support/CPP/array.h
index fb5a79225beb7..4e69ba003e800 100644
--- a/libc/src/__support/CPP/array.h
+++ b/libc/src/__support/CPP/array.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_ARRAY_H
 #define LLVM_LIBC_SRC___SUPPORT_CPP_ARRAY_H
 
+#include "src/__support/CPP/iterator.h" // reverse_iterator
 #include "src/__support/macros/attributes.h"
 #include <stddef.h> // For size_t.
 
@@ -23,6 +24,8 @@ template <class T, size_t N> struct array {
   using value_type = T;
   using iterator = T *;
   using const_iterator = const T *;
+  using reverse_iterator = cpp::reverse_iterator<iterator>;
+  using const_reverse_iterator = cpp::reverse_iterator<const_iterator>;
 
   LIBC_INLINE constexpr T *data() { return Data; }
   LIBC_INLINE constexpr const T *data() const { return Data; }
@@ -45,9 +48,29 @@ template <class T, size_t N> struct array {
 
   LIBC_INLINE constexpr iterator begin() { return Data; }
   LIBC_INLINE constexpr const_iterator begin() const { return Data; }
+  LIBC_INLINE constexpr const_iterator cbegin() const { return begin(); }
 
   LIBC_INLINE constexpr iterator end() { return Data + N; }
   LIBC_INLINE constexpr const_iterator end() const { return Data + N; }
+  LIBC_INLINE constexpr const_iterator cend() const { return end(); }
+
+  LIBC_INLINE constexpr reverse_iterator rbegin() {
+    return reverse_iterator{end()};
+  }
+  LIBC_INLINE constexpr const_reverse_iterator rbegin() const {
+    return const_reverse_iterator{end()};
+  }
+  LIBC_INLINE constexpr const_reverse_iterator crbegin() const {
+    return rbegin();
+  }
+
+  LIBC_INLINE constexpr reverse_iterator rend() {
+    return reverse_iterator{begin()};
+  }
+  LIBC_INLINE constexpr const_reverse_iterator rend() const {
+    return const_reverse_iterator{begin()};
+  }
+  LIBC_INLINE constexpr const_reverse_iterator crend() const { return rend(); }
 };
 
 } // namespace cpp
diff --git a/libc/src/__support/CPP/iterator.h b/libc/src/__support/CPP/iterator.h
new file mode 100644
index 0000000000000..c5bfb1912c7b7
--- /dev/null
+++ b/libc/src/__support/CPP/iterator.h
@@ -0,0 +1,63 @@
+//===-- Standalone implementation of iterator -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_CPP_ITERATOR_H
+#define LLVM_LIBC_SRC___SUPPORT_CPP_ITERATOR_H
+
+#include "src/__support/CPP/type_traits/enable_if.h"
+#include "src/__support/CPP/type_traits/is_convertible.h"
+#include "src/__support/CPP/type_traits/is_same.h"
+#include "src/__support/macros/attributes.h"
+
+namespace LIBC_NAMESPACE {
+namespace cpp {
+
+template <typename T> struct iterator_traits;
+template <typename T> struct iterator_traits<T *> {
+  using reference = T &;
+};
+
+template <typename Iter> class reverse_iterator {
+  Iter current;
+
+public:
+  using reference = typename iterator_traits<Iter>::reference;
+
+  LIBC_INLINE reverse_iterator() : current() {}
+  LIBC_INLINE constexpr explicit reverse_iterator(Iter it) : current(it) {}
+
+  template <typename Other,
+            cpp::enable_if_t<!cpp::is_same_v<Iter, Other> &&
+                                 cpp::is_convertible_v<const Other &, Iter>,
+                             int> = 0>
+  LIBC_INLINE constexpr explicit reverse_iterator(const Other &it)
+      : current(it) {}
+
+  LIBC_INLINE constexpr reference operator*() const {
+    Iter tmp = current;
+    return *--tmp;
+  }
+  LIBC_INLINE constexpr reverse_iterator operator--() {
+    ++current;
+    return *this;
+  }
+  LIBC_INLINE constexpr reverse_iterator &operator++() {
+    --current;
+    return *this;
+  }
+  LIBC_INLINE constexpr reverse_iterator operator++(int) {
+    reverse_iterator tmp(*this);
+    --current;
+    return tmp;
+  }
+};
+
+} // namespace cpp
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_CPP_ITERATOR_H
diff --git a/libc/src/__support/CPP/string_view.h b/libc/src/__support/CPP/string_view.h
index d23aa261ca061..8aa96fa87f698 100644
--- a/libc/src/__support/CPP/string_view.h
+++ b/libc/src/__support/CPP/string_view.h
@@ -179,7 +179,8 @@ class string_view {
   LIBC_INLINE char back() const { return Data[Len - 1]; }
 
   // Finds the first occurence of c in this view, starting at position From.
-  LIBC_INLINE size_t find_first_of(const char c, size_t From = 0) const {
+  LIBC_INLINE constexpr size_t find_first_of(const char c,
+                                             size_t From = 0) const {
     for (size_t Pos = From; Pos < size(); ++Pos)
       if ((*this)[Pos] == c)
         return Pos;
@@ -187,13 +188,29 @@ class string_view {
   }
 
   // Finds the last occurence of c in this view, ending at position End.
-  LIBC_INLINE size_t find_last_of(const char c, size_t End = npos) const {
+  LIBC_INLINE constexpr size_t find_last_of(const char c,
+                                            size_t End = npos) const {
     End = End >= size() ? size() : End + 1;
     for (; End > 0; --End)
       if ((*this)[End - 1] == c)
         return End - 1;
     return npos;
   }
+
+  // Finds the first character not equal to c in this view, starting at position
+  // From.
+  LIBC_INLINE constexpr size_t find_first_not_of(const char c,
+                                                 size_t From = 0) const {
+    for (size_t Pos = From; Pos < size(); ++Pos)
+      if ((*this)[Pos] != c)
+        return Pos;
+    return npos;
+  }
+
+  // Check if this view contains the given character.
+  LIBC_INLINE constexpr bool contains(char c) const {
+    return find_first_of(c) != npos;
+  }
 };
 
 } // namespace cpp
diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index 6810659733de2..a6a533dcfdf4a 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -9,13 +9,12 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_FENVIMPL_H
 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_FENVIMPL_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
 #include "src/__support/macros/properties/architectures.h"
 #include "src/errno/libc_errno.h"
-
 #include <fenv.h>
-#include <math.h>
 
 #if defined(LIBC_TARGET_ARCH_IS_AARCH64)
 #if defined(__APPLE__)
diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h
index f148d984f5f35..2c90b4888c2e5 100644
--- a/libc/src/__support/FPUtil/ManipulationFunctions.h
+++ b/libc/src/__support/FPUtil/ManipulationFunctions.h
@@ -15,6 +15,7 @@
 #include "dyadic_float.h"
 #include "rounding_mode.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/limits.h" // INT_MAX, INT_MIN
 #include "src/__support/CPP/type_traits.h"
@@ -22,8 +23,6 @@
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
 
-#include <math.h>
-
 namespace LIBC_NAMESPACE {
 namespace fputil {
 
@@ -230,11 +229,36 @@ LIBC_INLINE T nextafter(T from, U to) {
   return from_bits.get_val();
 }
 
+template <bool IsDown, typename T,
+          cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
+LIBC_INLINE constexpr T nextupdown(T x) {
+  constexpr Sign sign = IsDown ? Sign::NEG : Sign::POS;
+
+  FPBits<T> xbits(x);
+  if (xbits.is_nan() || xbits == FPBits<T>::max_normal(sign) ||
+      xbits == FPBits<T>::inf(sign))
+    return x;
+
+  using StorageType = typename FPBits<T>::StorageType;
+  if (x != T(0)) {
+    if (xbits.sign() == sign) {
+      xbits = FPBits<T>(StorageType(xbits.uintval() + 1));
+    } else {
+      xbits = FPBits<T>(StorageType(xbits.uintval() - 1));
+    }
+  } else {
+    xbits = FPBits<T>::min_subnormal(sign);
+  }
+
+  return xbits.get_val();
+}
+
 } // namespace fputil
 } // namespace LIBC_NAMESPACE
 
 #ifdef LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80
 #include "x86_64/NextAfterLongDouble.h"
+#include "x86_64/NextUpDownLongDouble.h"
 #endif // LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80
 
 #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_MANIPULATIONFUNCTIONS_H
diff --git a/libc/src/__support/FPUtil/NearestIntegerOperations.h b/libc/src/__support/FPUtil/NearestIntegerOperations.h
index 19ae75ea78891..e890e38ba4ae7 100644
--- a/libc/src/__support/FPUtil/NearestIntegerOperations.h
+++ b/libc/src/__support/FPUtil/NearestIntegerOperations.h
@@ -13,11 +13,10 @@
 #include "FPBits.h"
 #include "rounding_mode.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/common.h"
 
-#include <math.h>
-
 namespace LIBC_NAMESPACE {
 namespace fputil {
 
diff --git a/libc/src/__support/FPUtil/x86_64/NextUpDownLongDouble.h b/libc/src/__support/FPUtil/x86_64/NextUpDownLongDouble.h
new file mode 100644
index 0000000000000..1bc849500be04
--- /dev/null
+++ b/libc/src/__support/FPUtil/x86_64/NextUpDownLongDouble.h
@@ -0,0 +1,60 @@
+//===-- nextupdown implementation for x86 long double numbers ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_X86_64_NEXTUPDOWNLONGDOUBLE_H
+#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_X86_64_NEXTUPDOWNLONGDOUBLE_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/properties/architectures.h"
+
+#if !defined(LIBC_TARGET_ARCH_IS_X86)
+#error "Invalid include"
+#endif
+
+namespace LIBC_NAMESPACE::fputil {
+
+template <bool IsDown>
+LIBC_INLINE constexpr long double nextupdown(long double x) {
+  constexpr Sign sign = IsDown ? Sign::NEG : Sign::POS;
+
+  using FPBits_t = FPBits<long double>;
+  FPBits_t xbits(x);
+  if (xbits.is_nan() || xbits == FPBits_t::max_normal(sign) ||
+      xbits == FPBits_t::inf(sign))
+    return x;
+
+  if (x == 0.0l)
+    return FPBits_t::min_subnormal(sign).get_val();
+
+  using StorageType = typename FPBits_t::StorageType;
+
+  if (xbits.sign() == sign) {
+    if (xbits.get_mantissa() == FPBits_t::FRACTION_MASK) {
+      xbits.set_mantissa(0);
+      xbits.set_biased_exponent(xbits.get_biased_exponent() + 1);
+    } else {
+      xbits = FPBits_t(StorageType(xbits.uintval() + 1));
+    }
+
+    return xbits.get_val();
+  }
+
+  if (xbits.get_mantissa() == 0) {
+    xbits.set_mantissa(FPBits_t::FRACTION_MASK);
+    xbits.set_biased_exponent(xbits.get_biased_exponent() - 1);
+  } else {
+    xbits = FPBits_t(StorageType(xbits.uintval() - 1));
+  }
+
+  return xbits.get_val();
+}
+
+} // namespace LIBC_NAMESPACE::fputil
+
+#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_X86_64_NEXTUPDOWNLONGDOUBLE_H
diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h
index d92d61ed094eb..df01e081e3c19 100644
--- a/libc/src/__support/UInt.h
+++ b/libc/src/__support/UInt.h
@@ -993,6 +993,68 @@ struct is_big_int<BigInt<Bits, Signed, T>> : cpp::true_type {};
 template <class T>
 LIBC_INLINE_VAR constexpr bool is_big_int_v = is_big_int<T>::value;
 
+// extensions of type traits to include BigInt
+
+// is_integral_or_big_int
+template <typename T>
+struct is_integral_or_big_int
+    : cpp::bool_constant<(cpp::is_integral_v<T> || is_big_int_v<T>)> {};
+
+template <typename T>
+LIBC_INLINE_VAR constexpr bool is_integral_or_big_int_v =
+    is_integral_or_big_int<T>::value;
+
+// make_big_int_unsigned
+template <typename T> struct make_big_int_unsigned;
+
+template <size_t Bits, bool Signed, typename T>
+struct make_big_int_unsigned<BigInt<Bits, Signed, T>>
+    : cpp::type_identity<BigInt<Bits, false, T>> {};
+
+template <typename T>
+using make_big_int_unsigned_t = typename make_big_int_unsigned<T>::type;
+
+// make_big_int_signed
+template <typename T> struct make_big_int_signed;
+
+template <size_t Bits, bool Signed, typename T>
+struct make_big_int_signed<BigInt<Bits, Signed, T>>
+    : cpp::type_identity<BigInt<Bits, true, T>> {};
+
+template <typename T>
+using make_big_int_signed_t = typename make_big_int_signed<T>::type;
+
+// make_integral_or_big_int_unsigned
+template <typename T, class = void> struct make_integral_or_big_int_unsigned;
+
+template <typename T>
+struct make_integral_or_big_int_unsigned<
+    T, cpp::enable_if_t<cpp::is_integral_v<T>>> : cpp::make_unsigned<T> {};
+
+template <typename T>
+struct make_integral_or_big_int_unsigned<T, cpp::enable_if_t<is_big_int_v<T>>>
+    : make_big_int_unsigned<T> {};
+
+template <typename T>
+using make_integral_or_big_int_unsigned_t =
+    typename make_integral_or_big_int_unsigned<T>::type;
+
+// make_integral_or_big_int_signed
+template <typename T, class = void> struct make_integral_or_big_int_signed;
+
+template <typename T>
+struct make_integral_or_big_int_signed<T,
+                                       cpp::enable_if_t<cpp::is_integral_v<T>>>
+    : cpp::make_signed<T> {};
+
+template <typename T>
+struct make_integral_or_big_int_signed<T, cpp::enable_if_t<is_big_int_v<T>>>
+    : make_big_int_signed<T> {};
+
+template <typename T>
+using make_integral_or_big_int_signed_t =
+    typename make_integral_or_big_int_signed<T>::type;
+
 namespace cpp {
 
 // Specialization of cpp::bit_cast ('bit.h') from T to BigInt.
diff --git a/libc/src/__support/blockstore.h b/libc/src/__support/blockstore.h
index dc5fdd1b92fc2..ac0eb22692b40 100644
--- a/libc/src/__support/blockstore.h
+++ b/libc/src/__support/blockstore.h
@@ -16,7 +16,6 @@
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE {
-namespace cpp {
 
 // The difference between BlockStore a traditional vector types is that,
 // when more capacity is desired, a new block is added instead of allocating
@@ -203,7 +202,6 @@ void BlockStore<T, BLOCK_SIZE, REVERSE_ORDER>::destroy(
 template <typename T, size_t BLOCK_SIZE>
 using ReverseOrderBlockStore = BlockStore<T, BLOCK_SIZE, true>;
 
-} // namespace cpp
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_SRC___SUPPORT_BLOCKSTORE_H
diff --git a/libc/src/__support/integer_to_string.h b/libc/src/__support/integer_to_string.h
index ac0bdd688aeab..f72d00d1a7456 100644
--- a/libc/src/__support/integer_to_string.h
+++ b/libc/src/__support/integer_to_string.h
@@ -67,7 +67,7 @@
 #include "src/__support/CPP/span.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/CPP/type_traits.h"
-#include "src/__support/UInt.h" // is_big_int
+#include "src/__support/UInt.h" // make_integral_or_big_int_unsigned_t
 #include "src/__support/common.h"
 
 namespace LIBC_NAMESPACE {
@@ -150,18 +150,6 @@ template <bool forward> class StringBufferWriterImpl {
 using StringBufferWriter = StringBufferWriterImpl<true>;
 using BackwardStringBufferWriter = StringBufferWriterImpl<false>;
 
-template <typename T, class = void> struct IntegerWriterUnsigned {};
-
-template <typename T>
-struct IntegerWriterUnsigned<T, cpp::enable_if_t<cpp::is_integral_v<T>>> {
-  using type = cpp::make_unsigned_t<T>;
-};
-
-template <typename T>
-struct IntegerWriterUnsigned<T, cpp::enable_if_t<is_big_int_v<T>>> {
-  using type = typename T::unsigned_type;
-};
-
 } // namespace details
 
 namespace radix {
@@ -222,7 +210,7 @@ template <typename T, typename Fmt = radix::Dec> class IntegerToString {
   // An internal stateless structure that handles the number formatting logic.
   struct IntegerWriter {
     static_assert(cpp::is_integral_v<T> || is_big_int_v<T>);
-    using UNSIGNED_T = typename details::IntegerWriterUnsigned<T>::type;
+    using UNSIGNED_T = make_integral_or_big_int_unsigned_t<T>;
 
     LIBC_INLINE static char digit_char(uint8_t digit) {
       if (digit < 10)
diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
index 073e1dc672172..2cf2cfb027243 100644
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@@ -1056,17 +1056,17 @@ hexadecimal_string_to_float(const char *__restrict src,
   return output;
 }
 
-LIBC_INLINE uint64_t
+template <class T>
+LIBC_INLINE typename fputil::FPBits<T>::StorageType
 nan_mantissa_from_ncharseq(const cpp::string_view ncharseq) {
-  uint64_t nan_mantissa = 0;
+  using FPBits = typename fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+
+  StorageType nan_mantissa = 0;
 
   if (ncharseq.data() != nullptr && isdigit(ncharseq[0])) {
-    // This is to prevent errors when StorageType is larger than 64
-    // bits, since strtointeger only supports up to 64 bits. This is
-    // actually more than is required by the specification, which says
-    // for the input type "NAN(n-char-sequence)" that "the meaning of
-    // the n-char sequence is implementation-defined."
-    auto strtoint_result = strtointeger<uint64_t>(ncharseq.data(), 0);
+    StrToNumResult<StorageType> strtoint_result =
+        strtointeger<StorageType>(ncharseq.data(), 0);
     if (!strtoint_result.has_error())
       nan_mantissa = strtoint_result.value;
 
@@ -1172,9 +1172,8 @@ LIBC_INLINE StrToNumResult<T> strtofloatingpoint(const char *__restrict src) {
           ++index;
         if (src[index] == ')') {
           ++index;
-          auto nan_mantissa_result = nan_mantissa_from_ncharseq(
+          nan_mantissa = nan_mantissa_from_ncharseq<T>(
               cpp::string_view(src + (left_paren + 1), index - left_paren - 2));
-          nan_mantissa = static_cast<StorageType>(nan_mantissa_result);
         } else {
           index = left_paren;
         }
@@ -1221,11 +1220,8 @@ template <class T> LIBC_INLINE StrToNumResult<T> strtonan(const char *arg) {
   while (isalnum(arg[index]) || arg[index] == '_')
     ++index;
 
-  if (arg[index] == '\0') {
-    auto nan_mantissa_result =
-        nan_mantissa_from_ncharseq(cpp::string_view(arg, index));
-    nan_mantissa = static_cast<StorageType>(nan_mantissa_result);
-  }
+  if (arg[index] == '\0')
+    nan_mantissa = nan_mantissa_from_ncharseq<T>(cpp::string_view(arg, index));
 
   result = FPBits::quiet_nan(fputil::Sign::POS, nan_mantissa);
   return {result.get_val(), 0, error};
diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h
index b87808993fee5..02c71d40a1c0a 100644
--- a/libc/src/__support/str_to_integer.h
+++ b/libc/src/__support/str_to_integer.h
@@ -11,6 +11,7 @@
 
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
+#include "src/__support/UInt128.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/str_to_num_result.h"
@@ -75,8 +76,12 @@ template <class T>
 LIBC_INLINE StrToNumResult<T>
 strtointeger(const char *__restrict src, int base,
              const size_t src_len = cpp::numeric_limits<size_t>::max()) {
-  // TODO: Rewrite to support numbers longer than long long
-  unsigned long long result = 0;
+  using ResultType = typename cpp::conditional_t<(cpp::is_same_v<T, UInt128> ||
+                                                  cpp::is_same_v<T, Int128>),
+                                                 UInt128, unsigned long long>;
+
+  ResultType result = 0;
+
   bool is_number = false;
   size_t src_cur = 0;
   int error_val = 0;
@@ -101,15 +106,16 @@ strtointeger(const char *__restrict src, int base,
   if (base == 16 && is_hex_start(src + src_cur, src_len - src_cur))
     src_cur = src_cur + 2;
 
-  constexpr bool IS_UNSIGNED = (cpp::numeric_limits<T>::min() == 0);
+  constexpr bool IS_UNSIGNED = cpp::is_unsigned_v<T>;
   const bool is_positive = (result_sign == '+');
-  unsigned long long constexpr NEGATIVE_MAX =
-      !IS_UNSIGNED
-          ? static_cast<unsigned long long>(cpp::numeric_limits<T>::max()) + 1
-          : cpp::numeric_limits<T>::max();
-  unsigned long long const abs_max =
+
+  ResultType constexpr NEGATIVE_MAX =
+      !IS_UNSIGNED ? static_cast<ResultType>(cpp::numeric_limits<T>::max()) + 1
+                   : cpp::numeric_limits<T>::max();
+  ResultType const abs_max =
       (is_positive ? cpp::numeric_limits<T>::max() : NEGATIVE_MAX);
-  unsigned long long const abs_max_div_by_base = abs_max / base;
+  ResultType const abs_max_div_by_base = abs_max / base;
+
   while (src_cur < src_len && isalnum(src[src_cur])) {
     int cur_digit = b36_char_to_int(src[src_cur]);
     if (cur_digit >= base)
@@ -149,10 +155,12 @@ strtointeger(const char *__restrict src, int base,
       return {cpp::numeric_limits<T>::min(), str_len, error_val};
   }
 
-  return {is_positive
-              ? static_cast<T>(result)
-              : static_cast<T>(-static_cast<cpp::make_unsigned_t<T>>(result)),
-          str_len, error_val};
+  return {
+      is_positive
+          ? static_cast<T>(result)
+          : static_cast<T>(
+                -static_cast<make_integral_or_big_int_unsigned_t<T>>(result)),
+      str_len, error_val};
 }
 
 } // namespace internal
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 750fd5f0e3a9b..5e2e6e699d0e0 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -190,6 +190,7 @@ add_math_entrypoint_object(modff128)
 add_math_entrypoint_object(nan)
 add_math_entrypoint_object(nanf)
 add_math_entrypoint_object(nanl)
+add_math_entrypoint_object(nanf128)
 
 add_math_entrypoint_object(nearbyint)
 add_math_entrypoint_object(nearbyintf)
@@ -204,6 +205,16 @@ add_math_entrypoint_object(nexttoward)
 add_math_entrypoint_object(nexttowardf)
 add_math_entrypoint_object(nexttowardl)
 
+add_math_entrypoint_object(nextdown)
+add_math_entrypoint_object(nextdownf)
+add_math_entrypoint_object(nextdownl)
+add_math_entrypoint_object(nextdownf128)
+
+add_math_entrypoint_object(nextup)
+add_math_entrypoint_object(nextupf)
+add_math_entrypoint_object(nextupl)
+add_math_entrypoint_object(nextupf128)
+
 add_math_entrypoint_object(pow)
 add_math_entrypoint_object(powf)
 
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 667381d615d1e..176c32e47768d 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1780,6 +1780,19 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  nanf128
+  SRCS
+    nanf128.cpp
+  HDRS
+    ../nanf128.h
+  DEPENDS
+    libc.src.__support.str_to_float
+    libc.src.errno.errno
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   nextafter
   SRCS
@@ -1865,6 +1878,104 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  nextdown
+  SRCS
+    nextdown.cpp
+  HDRS
+    ../nextdown.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextdownl
+  SRCS
+    nextdownl.cpp
+  HDRS
+    ../nextdownl.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextdownf
+  SRCS
+    nextdownf.cpp
+  HDRS
+    ../nextdownf.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextdownf128
+  SRCS
+    nextdownf128.cpp
+  HDRS
+    ../nextdownf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextup
+  SRCS
+    nextup.cpp
+  HDRS
+    ../nextup.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextupl
+  SRCS
+    nextupl.cpp
+  HDRS
+    ../nextupl.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextupf
+  SRCS
+    nextupf.cpp
+  HDRS
+    ../nextupf.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextupf128
+  SRCS
+    nextupf128.cpp
+  HDRS
+    ../nextupf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   fmod
   SRCS
@@ -2044,16 +2155,9 @@ add_object_library(
   SRCS
     inv_trigf_utils.cpp
   DEPENDS
-    .math_utils
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.FPUtil.multiply_add
     libc.src.__support.FPUtil.polyeval
     libc.src.__support.common
-    libc.include.errno
-    libc.src.errno.errno
-    libc.include.math
 )
 
 add_entrypoint_object(
@@ -2100,8 +2204,11 @@ add_entrypoint_object(
     ../atanf.h
   DEPENDS
     .inv_trigf_utils
-    .math_utils
+    libc.src.__support.FPUtil.except_value_utils
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
     libc.src.__support.FPUtil.rounding_mode
     libc.src.__support.macros.optimization
   COMPILE_OPTIONS
diff --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp
index e0f8a1bfc2ecc..5f66ea52d0d7a 100644
--- a/libc/src/math/generic/atanf.cpp
+++ b/libc/src/math/generic/atanf.cpp
@@ -7,11 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/atanf.h"
-#include "math_utils.h"
+#include "inv_trigf_utils.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/math/generic/inv_trigf_utils.h"
 
 namespace LIBC_NAMESPACE {
 
@@ -19,48 +22,102 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
   using FPBits = typename fputil::FPBits<float>;
   using Sign = fputil::Sign;
 
-  // x == 0.0
-  if (LIBC_UNLIKELY(x == 0.0f))
-    return x;
+  constexpr double FINAL_SIGN[2] = {1.0, -1.0};
+  constexpr double SIGNED_PI_OVER_2[2] = {0x1.921fb54442d18p0,
+                                          -0x1.921fb54442d18p0};
 
-  FPBits xbits(x);
-  Sign sign = xbits.sign();
-  xbits.set_sign(Sign::POS);
+  FPBits x_bits(x);
+  Sign sign = x_bits.sign();
+  x_bits.set_sign(Sign::POS);
+  uint32_t x_abs = x_bits.uintval();
 
-  if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
-    if (xbits.is_inf())
-      return static_cast<float>(
-          opt_barrier(sign.is_neg() ? -M_MATH_PI_2 : M_MATH_PI_2));
-    else
+  // x is inf or nan, |x| < 2^-4 or |x|= > 16.
+  if (LIBC_UNLIKELY(x_abs <= 0x3d80'0000U || x_abs >= 0x4180'0000U)) {
+    double x_d = static_cast<double>(x);
+    double const_term = 0.0;
+    if (LIBC_UNLIKELY(x_abs >= 0x4180'0000)) {
+      // atan(+-Inf) = +-pi/2.
+      if (x_bits.is_inf()) {
+        volatile double sign_pi_over_2 = SIGNED_PI_OVER_2[sign.is_neg()];
+        return static_cast<float>(sign_pi_over_2);
+      }
+      if (x_bits.is_nan())
+        return x;
+      // x >= 16
+      x_d = -1.0 / x_d;
+      const_term = SIGNED_PI_OVER_2[sign.is_neg()];
+    }
+    // 0 <= x < 1/16;
+    if (LIBC_UNLIKELY(x_bits.is_zero()))
       return x;
-  }
-  // |x| == 0.06905200332403183
-  if (LIBC_UNLIKELY(xbits.uintval() == 0x3d8d6b23U)) {
-    if (fputil::fenv_is_round_to_nearest()) {
-      // 0.06894256919622421
-      FPBits br(0x3d8d31c3U);
-      br.set_sign(sign);
-      return br.get_val();
+    // x <= 2^-12;
+    if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
+#if defined(LIBC_TARGET_CPU_HAS_FMA)
+      return fputil::multiply_add(x, -0x1.0p-25f, x);
+#else
+      double x_d = static_cast<double>(x);
+      return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
+#endif // LIBC_TARGET_CPU_HAS_FMA
     }
+    // Use Taylor polynomial:
+    //   atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
+    double x2 = x_d * x_d;
+    double x4 = x2 * x2;
+    double c0 = fputil::multiply_add(x2, ATAN_COEFFS[0][1], ATAN_COEFFS[0][0]);
+    double c1 = fputil::multiply_add(x2, ATAN_COEFFS[0][3], ATAN_COEFFS[0][2]);
+    double c2 = fputil::multiply_add(x2, ATAN_COEFFS[0][5], ATAN_COEFFS[0][4]);
+    double p = fputil::polyeval(x4, c0, c1, c2);
+    double r = fputil::multiply_add(x_d, p, const_term);
+    return static_cast<float>(r);
   }
 
-  // |x| == 1.8670953512191772
-  if (LIBC_UNLIKELY(xbits.uintval() == 0x3feefcfbU)) {
-    int rounding_mode = fputil::quick_get_round();
-    if (sign.is_neg()) {
-      if (rounding_mode == FE_DOWNWARD) {
-        // -1.0790828466415405
-        return FPBits(0xbf8a1f63U).get_val();
-      }
-    } else {
-      if (rounding_mode == FE_UPWARD) {
-        // 1.0790828466415405
-        return FPBits(0x3f8a1f63U).get_val();
-      }
+  // Range reduction steps:
+  // 1)  atan(x) = sign(x) * atan(|x|)
+  // 2)  If |x| > 1, atan(|x|) = pi/2 - atan(1/|x|)
+  // 3)  For 1/16 < x <= 1, we find k such that: |x - k/16| <= 1/32.
+  // 4)  Then we use polynomial approximation:
+  //   atan(x) ~ atan((k/16) + (x - (k/16)) * Q(x - k/16)
+  //           = P(x - k/16)
+  double x_d, const_term, final_sign;
+  int idx;
+
+  if (x_abs > 0x3f80'0000U) {
+    // Exceptional value:
+    if (LIBC_UNLIKELY(x_abs == 0x3ffe'2ec1U)) { // |x| = 0x1.fc5d82p+0
+      return sign.is_pos() ? fputil::round_result_slightly_up(0x1.1ab2fp0f)
+                           : fputil::round_result_slightly_down(-0x1.1ab2fp0f);
     }
+    // |x| > 1, we need to invert x, so we will perform range reduction in
+    // double precision.
+    x_d = 1.0 / static_cast<double>(x_bits.get_val());
+    double k_d = fputil::nearest_integer(x_d * 0x1.0p4);
+    x_d = fputil::multiply_add(k_d, -0x1.0p-4, x_d);
+    idx = static_cast<int>(k_d);
+    final_sign = FINAL_SIGN[sign.is_pos()];
+    // Adjust constant term of the polynomial by +- pi/2.
+    const_term = fputil::multiply_add(final_sign, ATAN_COEFFS[idx][0],
+                                      SIGNED_PI_OVER_2[sign.is_neg()]);
+  } else {
+    // Exceptional value:
+    if (LIBC_UNLIKELY(x_abs == 0x3dbb'6ac7U)) { // |x| = 0x1.76d58ep-4
+      return sign.is_pos()
+                 ? fputil::round_result_slightly_up(0x1.75cb06p-4f)
+                 : fputil::round_result_slightly_down(-0x1.75cb06p-4f);
+    }
+    // Perform range reduction in single precision.
+    float x_f = x_bits.get_val();
+    float k_f = fputil::nearest_integer(x_f * 0x1.0p4f);
+    x_f = fputil::multiply_add(k_f, -0x1.0p-4f, x_f);
+    x_d = static_cast<double>(x_f);
+    idx = static_cast<int>(k_f);
+    final_sign = FINAL_SIGN[sign.is_neg()];
+    const_term = final_sign * ATAN_COEFFS[idx][0];
   }
 
-  return static_cast<float>(atan_eval(x));
+  double p = atan_eval(x_d, idx);
+  double r = fputil::multiply_add(final_sign * x_d, p, const_term);
+
+  return static_cast<float>(r);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/inv_trigf_utils.cpp b/libc/src/math/generic/inv_trigf_utils.cpp
index 8013c0470affb..93d5bcbf7b567 100644
--- a/libc/src/math/generic/inv_trigf_utils.cpp
+++ b/libc/src/math/generic/inv_trigf_utils.cpp
@@ -10,19 +10,71 @@
 
 namespace LIBC_NAMESPACE {
 
-// N[Table[ArcTan[x], {x, 1/16, 16/16, 1/16}], 40]
-alignas(64) const double ATAN_T[ATAN_T_SIZE] = {
-    0x1.ff55bb72cfdeap-5, 0x1.fd5ba9aac2f6ep-4, 0x1.7b97b4bce5b02p-3,
-    0x1.f5b75f92c80ddp-3, 0x1.362773707ebccp-2, 0x1.6f61941e4def1p-2,
-    0x1.a64eec3cc23fdp-2, 0x1.dac670561bb4fp-2, 0x1.0657e94db30d0p-1,
-    0x1.1e00babdefeb4p-1, 0x1.345f01cce37bbp-1, 0x1.4978fa3269ee1p-1,
-    0x1.5d58987169b18p-1, 0x1.700a7c5784634p-1, 0x1.819d0b7158a4dp-1,
-    0x1.921fb54442d18p-1};
-
-// for(int i = 0; i < 5; i++)
-//     printf("%.13a,\n", (-2 * (i % 2) + 1) * 1.0 / (2 * i + 1));
-alignas(64) const double ATAN_K[5] = {
-    0x1.0000000000000p+0, -0x1.5555555555555p-2, 0x1.999999999999ap-3,
-    -0x1.2492492492492p-3, 0x1.c71c71c71c71cp-4};
+// Polynomial approximation for 0 <= x <= 1:
+//   atan(x) ~ atan((i/16) + (x - (i/16)) * Q(x - i/16)
+//           = P(x - i/16)
+// Generated by Sollya with:
+// > for i from 1 to 16 do {
+//     mid_point = i/16;
+//     P = fpminimax(atan(mid_point + x), 7, [|D...|], [-1/32, 1/32]);
+//     print("{", coeff(P, 0), ",", coeff(P, 1), ",", coeff(P, 2), ",",
+//           coeff(P, 3), ",", coeff(P, 4), ",", coeff(P, 5), ",", coeff(P, 6),
+//           ",", coeff(P, 7), "},");
+//   };
+// For i = 0, ATAN_COEFFS[0][j] = (-1)^j * (1/(2*j + 1)) is the odd coefficients
+// of the Taylor polynomial of atan(x).
+double ATAN_COEFFS[17][8] = {
+    {0x1.0000000000000p+0, -0x1.5555555555555p-2, 0x1.999999999999ap-3,
+     -0x1.2492492492492p-3, 0x1.c71c71c71c71cp-4, -0x1.745d1745d1746p-4,
+     0x1.3b13b13b13b14p-4, -0x1.1111111111111p-4},
+    {0x1.ff55bb72cfdb1p-5, 0x1.fe01fe01fe1bp-1, -0x1.fc05f80821d1ap-5,
+     -0x1.4d6930419fc5fp-2, 0x1.f61b9f6d69313p-5, 0x1.8208a32f4346cp-3,
+     -0x1.ecb8fc53d04efp-5, -0x1.060710cb59cbcp-3},
+    {0x1.fd5ba9aac2f3cp-4, 0x1.f81f81f81f96ap-1, -0x1.f05e09cf4c1b2p-4,
+     -0x1.368c3aac7543ep-2, 0x1.d9b14bddfac55p-4, 0x1.4048e55ec725ep-3,
+     -0x1.b98ca3c1594b5p-4, -0x1.664eabaabbc16p-4},
+    {0x1.7b97b4bce5ae7p-3, 0x1.ee9c7f8458f06p-1, -0x1.665c226c8dc69p-3,
+     -0x1.1344bb77961b7p-2, 0x1.42ac97745d3ccp-3, 0x1.c32e142047ec1p-4,
+     -0x1.137ae41ab96cbp-3, -0x1.1a6ae8c09a4b6p-5},
+    {0x1.f5b75f92c80c6p-3, 0x1.e1e1e1e1e1ed4p-1, -0x1.c5894d101ad4p-3,
+     -0x1.ce6de02b38c38p-3, 0x1.78a3920c336b9p-3, 0x1.dd5ff94a9d499p-5,
+     -0x1.1ac2d3f9d072ep-3, 0x1.0af9735dff373p-6},
+    {0x1.362773707ebc5p-2, 0x1.d272ca3fc5b8bp-1, -0x1.0997e8ae90cb6p-2,
+     -0x1.6cf6667146798p-3, 0x1.8dd1dff17f3d3p-3, 0x1.24860eced656fp-7,
+     -0x1.f4220e8f18ed5p-4, 0x1.b700aed7cdc34p-5},
+    {0x1.6f61941e4deeep-2, 0x1.c0e070381c115p-1, -0x1.2726dd1347c7ep-2,
+     -0x1.09f37b3ad010dp-3, 0x1.85eaca5196f5cp-3, -0x1.04d640117852ap-5,
+     -0x1.802c2956871c7p-4, 0x1.2992b45df0ee7p-4},
+    {0x1.a64eec3cc23fep-2, 0x1.adbe87f94906bp-1, -0x1.3b9d8eab5eae5p-2,
+     -0x1.57c09646faabbp-4, 0x1.6795330e73aep-3, -0x1.f2d89a702a652p-5,
+     -0x1.f3afd90a9d4d7p-5, 0x1.3261723d3f153p-4},
+    {0x1.dac670561bb53p-2, 0x1.999999999998fp-1, -0x1.47ae147afd8cap-2,
+     -0x1.5d867c3dfd72ap-5, 0x1.3a92a76cba833p-3, -0x1.3ec460286928ap-4,
+     -0x1.ed02ff86892acp-6, 0x1.0a674c8f05727p-4},
+    {0x1.0657e94db30d2p-1, 0x1.84f00c27805ffp-1, -0x1.4c62cb564f677p-2,
+     -0x1.e6495b262dfe7p-8, 0x1.063c34eca262bp-3, -0x1.58b78dc79b5aep-4,
+     -0x1.4623815233be1p-8, 0x1.93afe94328089p-5},
+    {0x1.1e00babdefeb6p-1, 0x1.702e05c0b8159p-1, -0x1.4af2b78236bd6p-2,
+     0x1.5d0b7ea46ed08p-6, 0x1.a124870236935p-4, -0x1.519e1ec133a88p-4,
+     0x1.a54632a3f48c7p-7, 0x1.099ca0945096dp-5},
+    {0x1.345f01cce37bdp-1, 0x1.5babcc647fa7ep-1, -0x1.449db09443a67p-2,
+     0x1.655caac78a0fcp-5, 0x1.3bbbdb0d09efap-4, -0x1.34a306c27e021p-4,
+     0x1.83fe749c7966p-6, 0x1.2057cc96d9edcp-6},
+    {0x1.4978fa3269ee2p-1, 0x1.47ae147ae146bp-1, -0x1.3a92a305652e1p-2,
+     0x1.ec21b5172657fp-5, 0x1.c2f8c45d2f4eep-5, -0x1.0ba99c4aeb8acp-4,
+     0x1.d716a4af4d1d6p-6, 0x1.97fba0a9696dep-8},
+    {0x1.5d58987169b19p-1, 0x1.34679ace0133cp-1, -0x1.2ddfb03920e2fp-2,
+     0x1.2491307c0fa0bp-4, 0x1.29c7eca0136fp-5, -0x1.bca792caa6f1cp-5,
+     0x1.e5d92545576bcp-6, -0x1.8ca76fcf5ccd2p-10},
+    {0x1.700a7c5784634p-1, 0x1.21fb78121fb71p-1, -0x1.1f6a8499ea541p-2,
+     0x1.41b15e5e77bcfp-4, 0x1.59bc9bf54fb02p-6, -0x1.63b54ff058e0fp-5,
+     0x1.c8da01221306fp-6, -0x1.906b2c274c39cp-8},
+    {0x1.819d0b7158a4dp-1, 0x1.107fbbe01107cp-1, -0x1.0feeb40897d4ep-2,
+     0x1.50e5afb95f5d6p-4, 0x1.2a7c2f0c7495dp-7, -0x1.12bd2bb5062cdp-5,
+     0x1.93e8ceb89afebp-6, -0x1.10da9b8c6b731p-7},
+    {0x1.921fb54442d18p-1, 0x1.fffffffffffebp-2, -0x1.fffffffffcbbcp-3,
+     0x1.555555564e2fep-4, -0x1.20b17d5dd89dcp-30, -0x1.9999c5ad71711p-6,
+     0x1.5558b76e7aaf9p-6, -0x1.236e803c6c1f6p-7},
+};
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/inv_trigf_utils.h b/libc/src/math/generic/inv_trigf_utils.h
index 20f912de2ac00..c621f27e10146 100644
--- a/libc/src/math/generic/inv_trigf_utils.h
+++ b/libc/src/math/generic/inv_trigf_utils.h
@@ -9,85 +9,32 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_INV_TRIGF_UTILS_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_INV_TRIGF_UTILS_H
 
-#include "math_utils.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/common.h"
 
-#include <errno.h>
-
 namespace LIBC_NAMESPACE {
 
 // PI and PI / 2
 constexpr double M_MATH_PI = 0x1.921fb54442d18p+1;
 constexpr double M_MATH_PI_2 = 0x1.921fb54442d18p+0;
 
-// atan table size
-constexpr int ATAN_T_BITS = 4;
-constexpr int ATAN_T_SIZE = 1 << ATAN_T_BITS;
-
-// N[Table[ArcTan[x], {x, 1/8, 8/8, 1/8}], 40]
-extern const double ATAN_T[ATAN_T_SIZE];
-extern const double ATAN_K[5];
-
-// The main idea of the function is to use formula
-// atan(u) + atan(v) = atan((u+v)/(1-uv))
-
-// x should be positive, normal finite value
-LIBC_INLINE double atan_eval(double x) {
-  using FPB = fputil::FPBits<double>;
-  using Sign = fputil::Sign;
-  // Added some small value to umin and umax mantissa to avoid possible rounding
-  // errors.
-  FPB::StorageType umin =
-      FPB::create_value(Sign::POS, FPB::EXP_BIAS - ATAN_T_BITS - 1,
-                        0x100000000000UL)
-          .uintval();
-  FPB::StorageType umax =
-      FPB::create_value(Sign::POS, FPB::EXP_BIAS + ATAN_T_BITS,
-                        0xF000000000000UL)
-          .uintval();
-
-  FPB bs(x);
-  auto x_abs = bs.abs().uintval();
-
-  if (x_abs <= umin) {
-    double pe = LIBC_NAMESPACE::fputil::polyeval(
-        x * x, 0.0, ATAN_K[1], ATAN_K[2], ATAN_K[3], ATAN_K[4]);
-    return fputil::multiply_add(pe, x, x);
-  }
-
-  if (x_abs >= umax) {
-    double one_over_x_m = -1.0 / x;
-    double one_over_x2 = one_over_x_m * one_over_x_m;
-    double pe = LIBC_NAMESPACE::fputil::polyeval(
-        one_over_x2, ATAN_K[0], ATAN_K[1], ATAN_K[2], ATAN_K[3]);
-    return fputil::multiply_add(pe, one_over_x_m,
-                                bs.is_neg() ? (-M_MATH_PI_2) : (M_MATH_PI_2));
-  }
+extern double ATAN_COEFFS[17][8];
 
-  double pos_x = FPB(x_abs).get_val();
-  bool one_over_x = pos_x > 1.0;
-  if (one_over_x) {
-    pos_x = 1.0 / pos_x;
-  }
+// For |x| <= 1/32 and 1 <= i <= 16, return Q(x) such that:
+//   Q(x) ~ (atan(x + i/16) - atan(i/16)) / x.
+LIBC_INLINE double atan_eval(double x, int i) {
+  double x2 = x * x;
 
-  double near_x = fputil::nearest_integer(pos_x * ATAN_T_SIZE);
-  int val = static_cast<int>(near_x);
-  near_x *= 1.0 / ATAN_T_SIZE;
+  double c0 = fputil::multiply_add(x, ATAN_COEFFS[i][2], ATAN_COEFFS[i][1]);
+  double c1 = fputil::multiply_add(x, ATAN_COEFFS[i][4], ATAN_COEFFS[i][3]);
+  double c2 = fputil::multiply_add(x, ATAN_COEFFS[i][6], ATAN_COEFFS[i][5]);
 
-  double v = (pos_x - near_x) / fputil::multiply_add(near_x, pos_x, 1.0);
-  double v2 = v * v;
-  double pe = LIBC_NAMESPACE::fputil::polyeval(v2, ATAN_K[0], ATAN_K[1],
-                                               ATAN_K[2], ATAN_K[3], ATAN_K[4]);
-  double result;
-  if (one_over_x)
-    result = M_MATH_PI_2 - fputil::multiply_add(pe, v, ATAN_T[val - 1]);
-  else
-    result = fputil::multiply_add(pe, v, ATAN_T[val - 1]);
-  return bs.is_neg() ? -result : result;
+  double x4 = x2 * x2;
+  double d1 = fputil::multiply_add(x2, c1, c0);
+  double d2 = fputil::multiply_add(x2, ATAN_COEFFS[i][7], c2);
+  double p = fputil::multiply_add(x4, d2, d1);
+  return p;
 }
 
 // > Q = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20|],
diff --git a/libc/src/math/generic/math_utils.h b/libc/src/math/generic/math_utils.h
index e884fe2deae28..cced761fc8c82 100644
--- a/libc/src/math/generic/math_utils.h
+++ b/libc/src/math/generic/math_utils.h
@@ -9,13 +9,12 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_MATH_UTILS_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_MATH_UTILS_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/common.h"
 #include "src/errno/libc_errno.h"
 
-#include <math.h>
-
 #include <stdint.h>
 
 // TODO: evaluate which functions from this file are actually used.
diff --git a/libc/src/math/generic/nanf128.cpp b/libc/src/math/generic/nanf128.cpp
new file mode 100644
index 0000000000000..f087c9f074fde
--- /dev/null
+++ b/libc/src/math/generic/nanf128.cpp
@@ -0,0 +1,23 @@
+//===-- Implementation of nanf128 function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nanf128.h"
+#include "src/__support/common.h"
+#include "src/__support/str_to_float.h"
+#include "src/errno/libc_errno.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, nanf128, (const char *arg)) {
+  auto result = internal::strtonan<float128>(arg);
+  if (result.has_error())
+    libc_errno = result.error;
+  return result.value;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextdown.cpp b/libc/src/math/generic/nextdown.cpp
new file mode 100644
index 0000000000000..51dee483b70e6
--- /dev/null
+++ b/libc/src/math/generic/nextdown.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextdown function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextdown.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, nextdown, (double x)) {
+  return fputil::nextupdown</*IsDown=*/true>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextdownf.cpp b/libc/src/math/generic/nextdownf.cpp
new file mode 100644
index 0000000000000..857b412d64c3c
--- /dev/null
+++ b/libc/src/math/generic/nextdownf.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextdownf function ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextdownf.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, nextdownf, (float x)) {
+  return fputil::nextupdown</*IsDown=*/true>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextdownf128.cpp b/libc/src/math/generic/nextdownf128.cpp
new file mode 100644
index 0000000000000..2585a13df3a3c
--- /dev/null
+++ b/libc/src/math/generic/nextdownf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextdownf128 function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextdownf128.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, nextdownf128, (float128 x)) {
+  return fputil::nextupdown</*IsDown=*/true>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextdownl.cpp b/libc/src/math/generic/nextdownl.cpp
new file mode 100644
index 0000000000000..06a09c9daea59
--- /dev/null
+++ b/libc/src/math/generic/nextdownl.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextdownl function ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextdownl.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long double, nextdownl, (long double x)) {
+  return fputil::nextupdown</*IsDown=*/true>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextup.cpp b/libc/src/math/generic/nextup.cpp
new file mode 100644
index 0000000000000..d75a336eefcc8
--- /dev/null
+++ b/libc/src/math/generic/nextup.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextup function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextup.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, nextup, (double x)) {
+  return fputil::nextupdown</*IsDown=*/false>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextupf.cpp b/libc/src/math/generic/nextupf.cpp
new file mode 100644
index 0000000000000..3b18dae4488d4
--- /dev/null
+++ b/libc/src/math/generic/nextupf.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextupf function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextupf.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, nextupf, (float x)) {
+  return fputil::nextupdown</*IsDown=*/false>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextupf128.cpp b/libc/src/math/generic/nextupf128.cpp
new file mode 100644
index 0000000000000..7d862c30976cb
--- /dev/null
+++ b/libc/src/math/generic/nextupf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextupf128 function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextupf128.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, nextupf128, (float128 x)) {
+  return fputil::nextupdown</*IsDown=*/false>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextupl.cpp b/libc/src/math/generic/nextupl.cpp
new file mode 100644
index 0000000000000..ccc52445eec5c
--- /dev/null
+++ b/libc/src/math/generic/nextupl.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextupl function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextupl.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long double, nextupl, (long double x)) {
+  return fputil::nextupdown</*IsDown=*/false>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nanf128.h b/libc/src/math/nanf128.h
new file mode 100644
index 0000000000000..b06d14e2f945e
--- /dev/null
+++ b/libc/src/math/nanf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for nanf128 -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NANF128_H
+#define LLVM_LIBC_SRC_MATH_NANF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 nanf128(const char *arg);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NANF128_H
diff --git a/libc/src/math/nextdown.h b/libc/src/math/nextdown.h
new file mode 100644
index 0000000000000..8049b170ee721
--- /dev/null
+++ b/libc/src/math/nextdown.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextdown ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTDOWN_H
+#define LLVM_LIBC_SRC_MATH_NEXTDOWN_H
+
+namespace LIBC_NAMESPACE {
+
+double nextdown(double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTDOWN_H
diff --git a/libc/src/math/nextdownf.h b/libc/src/math/nextdownf.h
new file mode 100644
index 0000000000000..0a2f23480574d
--- /dev/null
+++ b/libc/src/math/nextdownf.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextdownf ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTDOWNF_H
+#define LLVM_LIBC_SRC_MATH_NEXTDOWNF_H
+
+namespace LIBC_NAMESPACE {
+
+float nextdownf(float x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTDOWNF_H
diff --git a/libc/src/math/nextdownf128.h b/libc/src/math/nextdownf128.h
new file mode 100644
index 0000000000000..0a3043bb431d8
--- /dev/null
+++ b/libc/src/math/nextdownf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for nextdownf128 ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTDOWNF128_H
+#define LLVM_LIBC_SRC_MATH_NEXTDOWNF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 nextdownf128(float128 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTDOWNF128_H
diff --git a/libc/src/math/nextdownl.h b/libc/src/math/nextdownl.h
new file mode 100644
index 0000000000000..9cb274a89b1c5
--- /dev/null
+++ b/libc/src/math/nextdownl.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextdownl ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTDOWNL_H
+#define LLVM_LIBC_SRC_MATH_NEXTDOWNL_H
+
+namespace LIBC_NAMESPACE {
+
+long double nextdownl(long double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTDOWNL_H
diff --git a/libc/src/math/nextup.h b/libc/src/math/nextup.h
new file mode 100644
index 0000000000000..97ae82270b06e
--- /dev/null
+++ b/libc/src/math/nextup.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextup ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTUP_H
+#define LLVM_LIBC_SRC_MATH_NEXTUP_H
+
+namespace LIBC_NAMESPACE {
+
+double nextup(double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTUP_H
diff --git a/libc/src/math/nextupf.h b/libc/src/math/nextupf.h
new file mode 100644
index 0000000000000..ffc0fa168a108
--- /dev/null
+++ b/libc/src/math/nextupf.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextupf -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTUPF_H
+#define LLVM_LIBC_SRC_MATH_NEXTUPF_H
+
+namespace LIBC_NAMESPACE {
+
+float nextupf(float x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTUPF_H
diff --git a/libc/src/math/nextupf128.h b/libc/src/math/nextupf128.h
new file mode 100644
index 0000000000000..b4429922e4beb
--- /dev/null
+++ b/libc/src/math/nextupf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for nextupf128 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTUPF128_H
+#define LLVM_LIBC_SRC_MATH_NEXTUPF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 nextupf128(float128 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTUPF128_H
diff --git a/libc/src/math/nextupl.h b/libc/src/math/nextupl.h
new file mode 100644
index 0000000000000..cbc6a168e4bea
--- /dev/null
+++ b/libc/src/math/nextupl.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextupl -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTUPL_H
+#define LLVM_LIBC_SRC_MATH_NEXTUPL_H
+
+namespace LIBC_NAMESPACE {
+
+long double nextupl(long double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTUPL_H
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index bb8e41606c5df..ece93fd56ef0c 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -236,6 +236,16 @@ add_entrypoint_object(
     libc.src.stdio.printf_core.vfprintf_internal
 )
 
+add_stdio_entrypoint_object(
+  fileno
+  SRCS
+    fileno.cpp
+  HDRS
+    fileno.h
+  DEPENDS
+    libc.src.stdio.fileno
+)
+
 add_subdirectory(printf_core)
 add_subdirectory(scanf_core)
 
diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt
new file mode 100644
index 0000000000000..65089274050bd
--- /dev/null
+++ b/libc/src/stdio/baremetal/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_entrypoint_object(
+  remove
+  SRCS
+    remove.cpp
+  HDRS
+    ../remove.h
+  DEPENDS
+    libc.include.stdio
+)
diff --git a/libc/src/stdio/baremetal/remove.cpp b/libc/src/stdio/baremetal/remove.cpp
new file mode 100644
index 0000000000000..f4f2cdca7c69b
--- /dev/null
+++ b/libc/src/stdio/baremetal/remove.cpp
@@ -0,0 +1,19 @@
+//===-- Linux implementation of remove ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/remove.h"
+
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+// TODO: This is a temporary workaround for issue #85335.
+
+LLVM_LIBC_FUNCTION(int, remove, (const char *)) { return -1; }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/fileno.h b/libc/src/stdio/fileno.h
new file mode 100644
index 0000000000000..d41f112226c51
--- /dev/null
+++ b/libc/src/stdio/fileno.h
@@ -0,0 +1,21 @@
+//===-- Implementation header of fileno --------------------------*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_FILENO_H
+#define LLVM_LIBC_SRC_STDIO_FILENO_H
+
+#include "include/llvm-libc-types/FILE.h"
+
+namespace LIBC_NAMESPACE {
+
+int fileno(::FILE *f);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDIO_FILENO_H
diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt
index 4e4a709e94061..0aa213caba7b8 100644
--- a/libc/src/stdio/generic/CMakeLists.txt
+++ b/libc/src/stdio/generic/CMakeLists.txt
@@ -70,6 +70,18 @@ add_entrypoint_object(
     libc.src.__support.File.platform_file
 )
 
+add_entrypoint_object(
+  fileno
+  SRCS
+    fileno.cpp
+  HDRS
+    ../fileno.h
+  DEPENDS
+    libc.include.stdio
+    libc.src.__support.File.file
+    libc.src.__support.File.platform_file
+)
+
 add_entrypoint_object(
   fflush
   SRCS
diff --git a/libc/src/stdio/generic/fileno.cpp b/libc/src/stdio/generic/fileno.cpp
new file mode 100644
index 0000000000000..663ba92663762
--- /dev/null
+++ b/libc/src/stdio/generic/fileno.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of fileno
+//-------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/fileno.h"
+
+#include "include/llvm-libc-types/FILE.h"
+#include "src/__support/File/file.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, fileno, (::FILE * stream)) {
+  return get_fileno(reinterpret_cast<LIBC_NAMESPACE::File *>(stream));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/linux/CMakeLists.txt b/libc/src/stdio/linux/CMakeLists.txt
index 3b5a9960dc125..774f24b2db0bd 100644
--- a/libc/src/stdio/linux/CMakeLists.txt
+++ b/libc/src/stdio/linux/CMakeLists.txt
@@ -6,6 +6,7 @@ add_entrypoint_object(
     ../remove.h
   DEPENDS
     libc.include.fcntl
+    libc.include.stdio
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
diff --git a/libc/src/stdio/printf_core/string_converter.h b/libc/src/stdio/printf_core/string_converter.h
index 04dc5a06da222..9e05591079faa 100644
--- a/libc/src/stdio/printf_core/string_converter.h
+++ b/libc/src/stdio/printf_core/string_converter.h
@@ -25,7 +25,7 @@ LIBC_INLINE int convert_string(Writer *writer, const FormatSection &to_conv) {
 
 #ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
   if (str_ptr == nullptr) {
-    str_ptr = "null";
+    str_ptr = "(null)";
   }
 #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
 
diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp
index 1513b7969f0db..741ea4f25103a 100644
--- a/libc/src/stdlib/atexit.cpp
+++ b/libc/src/stdlib/atexit.cpp
@@ -36,7 +36,7 @@ struct AtExitUnit {
 //        mutexes simply passthrough. We will need a lock free stack.
 using ExitCallbackList = FixedVector<AtExitUnit, 64>;
 #elif defined(LIBC_COPT_PUBLIC_PACKAGING)
-using ExitCallbackList = cpp::ReverseOrderBlockStore<AtExitUnit, 32>;
+using ExitCallbackList = ReverseOrderBlockStore<AtExitUnit, 32>;
 #else
 // BlockStore uses dynamic memory allocation. To avoid dynamic memory
 // allocation in tests, we use a fixed size callback list when built for
diff --git a/libc/src/sys/mman/CMakeLists.txt b/libc/src/sys/mman/CMakeLists.txt
index b49f73873c200..9c74202a09f03 100644
--- a/libc/src/sys/mman/CMakeLists.txt
+++ b/libc/src/sys/mman/CMakeLists.txt
@@ -85,3 +85,17 @@ add_entrypoint_object(
   DEPENDS
     .${LIBC_TARGET_OS}.msync
 )
+
+add_entrypoint_object(
+  shm_open
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.shm_open
+)
+
+add_entrypoint_object(
+  shm_unlink
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.shm_unlink
+)
diff --git a/libc/src/sys/mman/linux/CMakeLists.txt b/libc/src/sys/mman/linux/CMakeLists.txt
index 04086ee5d3325..00f4f0e64ec06 100644
--- a/libc/src/sys/mman/linux/CMakeLists.txt
+++ b/libc/src/sys/mman/linux/CMakeLists.txt
@@ -152,3 +152,40 @@ add_entrypoint_object(
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
+
+add_header_library(
+  shm_common
+  HDRS
+    shm_common.h
+  DEPENDS
+    libc.src.__support.CPP.array
+    libc.src.__support.CPP.string_view
+    libc.src.__support.CPP.optional
+    libc.src.__support.common
+    libc.src.errno.errno
+    libc.src.string.memory_utils.inline_memcpy
+)
+
+add_entrypoint_object(
+  shm_open
+  SRCS
+    shm_open.cpp
+  HDRS
+    ../shm_open.h
+  DEPENDS
+    libc.src.fcntl.open
+    libc.include.llvm-libc-macros.fcntl_macros
+    libc.include.llvm-libc-types.mode_t
+    .shm_common
+)
+
+add_entrypoint_object(
+  shm_unlink
+  SRCS
+    shm_unlink.cpp
+  HDRS
+    ../shm_unlink.h
+  DEPENDS
+    libc.src.unistd.unlink
+    .shm_common
+)
diff --git a/libc/src/sys/mman/linux/shm_common.h b/libc/src/sys/mman/linux/shm_common.h
new file mode 100644
index 0000000000000..6f2a3fdc71b64
--- /dev/null
+++ b/libc/src/sys/mman/linux/shm_common.h
@@ -0,0 +1,53 @@
+//===---------- Shared implementations for shm_open/shm_unlink ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/errno/libc_errno.h"
+#include "src/string/memory_utils/inline_memcpy.h"
+
+// TODO: Get PATH_MAX via https://github.com/llvm/llvm-project/issues/85121
+#include <linux/limits.h>
+
+namespace LIBC_NAMESPACE {
+
+namespace shm_common {
+
+LIBC_INLINE_VAR constexpr cpp::string_view SHM_PREFIX = "/dev/shm/";
+using SHMPath = cpp::array<char, NAME_MAX + SHM_PREFIX.size() + 1>;
+
+LIBC_INLINE cpp::optional<SHMPath> translate_name(cpp::string_view name) {
+  // trim leading slashes
+  size_t offset = name.find_first_not_of('/');
+  if (offset == cpp::string_view::npos) {
+    libc_errno = EINVAL;
+    return cpp::nullopt;
+  }
+  name = name.substr(offset);
+
+  // check the name
+  if (name.size() > NAME_MAX) {
+    libc_errno = ENAMETOOLONG;
+    return cpp::nullopt;
+  }
+  if (name == "." || name == ".." || name.contains('/')) {
+    libc_errno = EINVAL;
+    return cpp::nullopt;
+  }
+
+  // prepend the prefix
+  SHMPath buffer;
+  inline_memcpy(buffer.data(), SHM_PREFIX.data(), SHM_PREFIX.size());
+  inline_memcpy(buffer.data() + SHM_PREFIX.size(), name.data(), name.size());
+  buffer[SHM_PREFIX.size() + name.size()] = '\0';
+  return buffer;
+}
+} // namespace shm_common
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/sys/mman/linux/shm_open.cpp b/libc/src/sys/mman/linux/shm_open.cpp
new file mode 100644
index 0000000000000..0d39b8b4e53db
--- /dev/null
+++ b/libc/src/sys/mman/linux/shm_open.cpp
@@ -0,0 +1,25 @@
+//===---------- Linux implementation of the shm_open function -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/sys/mman/shm_open.h"
+#include "llvm-libc-macros/fcntl-macros.h"
+#include "src/fcntl/open.h"
+#include "src/sys/mman/linux/shm_common.h"
+
+namespace LIBC_NAMESPACE {
+
+static constexpr int DEFAULT_OFLAGS = O_NOFOLLOW | O_CLOEXEC | O_NONBLOCK;
+
+LLVM_LIBC_FUNCTION(int, shm_open, (const char *name, int oflags, mode_t mode)) {
+  using namespace shm_common;
+  if (cpp::optional<SHMPath> buffer = translate_name(name))
+    return open(buffer->data(), oflags | DEFAULT_OFLAGS, mode);
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/sys/mman/linux/shm_unlink.cpp b/libc/src/sys/mman/linux/shm_unlink.cpp
new file mode 100644
index 0000000000000..32f48d3e3e718
--- /dev/null
+++ b/libc/src/sys/mman/linux/shm_unlink.cpp
@@ -0,0 +1,22 @@
+//===---------- Linux implementation of the shm_unlink function -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/sys/mman/shm_unlink.h"
+#include "src/sys/mman/linux/shm_common.h"
+#include "src/unistd/unlink.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, shm_unlink, (const char *name)) {
+  using namespace shm_common;
+  if (cpp::optional<SHMPath> buffer = translate_name(name))
+    return unlink(buffer->data());
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/sys/mman/shm_open.h b/libc/src/sys/mman/shm_open.h
new file mode 100644
index 0000000000000..91796d7b5c050
--- /dev/null
+++ b/libc/src/sys/mman/shm_open.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for shm_open function -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SYS_MMAN_SHM_OPEN_H
+#define LLVM_LIBC_SRC_SYS_MMAN_SHM_OPEN_H
+
+#include <llvm-libc-types/mode_t.h>
+
+namespace LIBC_NAMESPACE {
+
+int shm_open(const char *name, int oflag, mode_t mode);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_SYS_MMAN_SHM_OPEN_H
diff --git a/libc/src/sys/mman/shm_unlink.h b/libc/src/sys/mman/shm_unlink.h
new file mode 100644
index 0000000000000..c38c06adbaa29
--- /dev/null
+++ b/libc/src/sys/mman/shm_unlink.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for shm_unlink function ------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SYS_MMAN_SHM_UNLINK_H
+#define LLVM_LIBC_SRC_SYS_MMAN_SHM_UNLINK_H
+
+namespace LIBC_NAMESPACE {
+
+int shm_unlink(const char *name);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_SYS_MMAN_SHM_UNLINK_H
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 4525b9e830195..43000efa09a39 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -17,7 +17,7 @@
 #include "test/UnitTest/StringUtils.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace LIBC_NAMESPACE {
 namespace testing {
@@ -102,8 +102,10 @@ template <typename T> struct FPTest : public Test {
   const T inf = FPBits::inf(Sign::POS).get_val();                              \
   const T neg_inf = FPBits::inf(Sign::NEG).get_val();                          \
   const T min_normal = FPBits::min_normal().get_val();                         \
-  const T max_normal = FPBits::max_normal().get_val();                         \
-  const T min_denormal = FPBits::min_subnormal().get_val();                    \
+  const T max_normal = FPBits::max_normal(Sign::POS).get_val();                \
+  const T neg_max_normal = FPBits::max_normal(Sign::NEG).get_val();            \
+  const T min_denormal = FPBits::min_subnormal(Sign::POS).get_val();           \
+  const T neg_min_denormal = FPBits::min_subnormal(Sign::NEG).get_val();       \
   const T max_denormal = FPBits::max_subnormal().get_val();
 
 #define EXPECT_FP_EQ(expected, actual)                                         \
diff --git a/libc/test/src/__support/CPP/CMakeLists.txt b/libc/test/src/__support/CPP/CMakeLists.txt
index f94429e03b3cb..74aa0c705ec46 100644
--- a/libc/test/src/__support/CPP/CMakeLists.txt
+++ b/libc/test/src/__support/CPP/CMakeLists.txt
@@ -1,5 +1,15 @@
 add_custom_target(libc-cpp-utils-tests)
 
+add_libc_test(
+  array_test
+  SUITE
+    libc-cpp-utils-tests
+  SRCS
+    array_test.cpp
+  DEPENDS
+    libc.src.__support.CPP.array
+  )
+
 add_libc_test(
   bit_test
   SUITE
diff --git a/libc/test/src/__support/CPP/array_test.cpp b/libc/test/src/__support/CPP/array_test.cpp
new file mode 100644
index 0000000000000..f2d7bff636e42
--- /dev/null
+++ b/libc/test/src/__support/CPP/array_test.cpp
@@ -0,0 +1,66 @@
+//===-- Unittests for Array -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/array.h"
+#include "test/UnitTest/Test.h"
+
+using LIBC_NAMESPACE::cpp::array;
+
+TEST(LlvmLibcArrayTest, Basic) {
+  array<int, 3> a = {0, 1, 2};
+
+  ASSERT_EQ(a.data(), &a.front());
+  ASSERT_EQ(a.front(), 0);
+  ASSERT_EQ(a.back(), 2);
+  ASSERT_EQ(a.size(), size_t{3});
+  ASSERT_EQ(a[1], 1);
+  ASSERT_FALSE(a.empty());
+  ASSERT_NE(a.begin(), a.end());
+  ASSERT_EQ(*a.begin(), a.front());
+
+  auto it = a.rbegin();
+  ASSERT_EQ(*it, 2);
+  ASSERT_EQ(*(++it), 1);
+  ASSERT_EQ(*(++it), 0);
+
+  for (int &x : a)
+    ASSERT_GE(x, 0);
+}
+
+// Test const_iterator and const variant methods.
+TEST(LlvmLibcArrayTest, Const) {
+  const array<int, 3> z = {3, 4, 5};
+
+  ASSERT_EQ(3, z.front());
+  ASSERT_EQ(4, z[1]);
+  ASSERT_EQ(5, z.back());
+  ASSERT_EQ(3, *z.data());
+
+  // begin, cbegin, end, cend
+  array<int, 3>::const_iterator it2 = z.begin();
+  ASSERT_EQ(*it2, z.front());
+  it2 = z.cbegin();
+  ASSERT_EQ(*it2, z.front());
+  it2 = z.end();
+  ASSERT_NE(it2, z.begin());
+  it2 = z.cend();
+  ASSERT_NE(it2, z.begin());
+
+  // rbegin, crbegin, rend, crend
+  array<int, 3>::const_reverse_iterator it = z.rbegin();
+  ASSERT_EQ(*it, z.back());
+  it = z.crbegin();
+  ASSERT_EQ(*it, z.back());
+  it = z.rend();
+  ASSERT_EQ(*--it, z.front());
+  it = z.crend();
+  ASSERT_EQ(*--it, z.front());
+
+  for (const int &x : z)
+    ASSERT_GE(x, 0);
+}
diff --git a/libc/test/src/__support/CPP/stringview_test.cpp b/libc/test/src/__support/CPP/stringview_test.cpp
index 33eb8ab476572..6b68f2a1c47a9 100644
--- a/libc/test/src/__support/CPP/stringview_test.cpp
+++ b/libc/test/src/__support/CPP/stringview_test.cpp
@@ -174,3 +174,111 @@ TEST(LlvmLibcStringViewTest, FindLastOf) {
   ASSERT_EQ(Empty1.find_last_of('a', 0), string_view::npos);
   ASSERT_EQ(Empty1.find_last_of('a', 123), string_view::npos);
 }
+
+TEST(LlvmLibcStringViewTest, FindFirstNotOf) {
+  string_view Tmp("abada");
+
+  EXPECT_EQ(Tmp.find_first_not_of('a'), size_t(1));
+  EXPECT_EQ(Tmp.find_first_not_of('a', 123), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('a', 5), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('a', 4), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('a', 3), size_t(3));
+  EXPECT_EQ(Tmp.find_first_not_of('a', 2), size_t(3));
+  EXPECT_EQ(Tmp.find_first_not_of('a', 1), size_t(1));
+  EXPECT_EQ(Tmp.find_first_not_of('a', 0), size_t(1));
+
+  EXPECT_EQ(Tmp.find_first_not_of('b'), size_t(0));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 123), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('b', 5), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('b', 4), size_t(4));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 3), size_t(3));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 2), size_t(2));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 1), size_t(2));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 0), size_t(0));
+
+  EXPECT_EQ(Tmp.find_first_not_of('d'), size_t(0));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 123), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('d', 5), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('d', 4), size_t(4));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 3), size_t(4));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 2), size_t(2));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 1), size_t(1));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 0), size_t(0));
+
+  EXPECT_EQ(Tmp.find_first_not_of('e'), size_t(0));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 123), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('e', 5), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('e', 4), size_t(4));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 3), size_t(3));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 2), size_t(2));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 1), size_t(1));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 0), size_t(0));
+
+  string_view Empty;
+  EXPECT_EQ(Empty.find_first_not_of('a'), string_view::npos);
+  EXPECT_EQ(Empty.find_first_not_of('a', 0), string_view::npos);
+  EXPECT_EQ(Empty.find_first_not_of('a', 123), string_view::npos);
+
+  string_view Empty1("");
+  EXPECT_EQ(Empty1.find_first_not_of('a'), string_view::npos);
+  EXPECT_EQ(Empty1.find_first_not_of('a', 0), string_view::npos);
+  EXPECT_EQ(Empty1.find_first_not_of('a', 123), string_view::npos);
+
+  string_view Full("aaaaaaa");
+  EXPECT_EQ(Full.find_first_not_of('a'), string_view::npos);
+  EXPECT_EQ(Full.find_first_not_of('a', 0), string_view::npos);
+  EXPECT_EQ(Full.find_first_not_of('a', 123), string_view::npos);
+
+  EXPECT_EQ(Full.find_first_not_of('b'), size_t(0));
+  EXPECT_EQ(Full.find_first_not_of('b', 0), size_t(0));
+  EXPECT_EQ(Full.find_first_not_of('b', 123), string_view::npos);
+}
+
+TEST(LlvmLibcStringViewTest, Contains) {
+  string_view Empty;
+  for (char c = 'a'; c < 'z'; ++c)
+    EXPECT_FALSE(Empty.contains(c));
+
+  string_view Tmp("abada");
+  EXPECT_TRUE(Tmp.contains('a'));
+  EXPECT_TRUE(Tmp.contains('b'));
+  EXPECT_FALSE(Tmp.contains('c'));
+  EXPECT_TRUE(Tmp.contains('d'));
+  EXPECT_FALSE(Tmp.contains('e'));
+
+  EXPECT_TRUE(Tmp.substr(1).contains('a'));
+  EXPECT_TRUE(Tmp.substr(1).contains('b'));
+  EXPECT_FALSE(Tmp.substr(1).contains('c'));
+  EXPECT_TRUE(Tmp.substr(1).contains('d'));
+  EXPECT_FALSE(Tmp.substr(1).contains('e'));
+
+  EXPECT_TRUE(Tmp.substr(2).contains('a'));
+  EXPECT_FALSE(Tmp.substr(2).contains('b'));
+  EXPECT_FALSE(Tmp.substr(2).contains('c'));
+  EXPECT_TRUE(Tmp.substr(2).contains('d'));
+  EXPECT_FALSE(Tmp.substr(2).contains('e'));
+
+  EXPECT_TRUE(Tmp.substr(3).contains('a'));
+  EXPECT_FALSE(Tmp.substr(3).contains('b'));
+  EXPECT_FALSE(Tmp.substr(3).contains('c'));
+  EXPECT_TRUE(Tmp.substr(3).contains('d'));
+  EXPECT_FALSE(Tmp.substr(3).contains('e'));
+
+  EXPECT_TRUE(Tmp.substr(4).contains('a'));
+  EXPECT_FALSE(Tmp.substr(4).contains('b'));
+  EXPECT_FALSE(Tmp.substr(4).contains('c'));
+  EXPECT_FALSE(Tmp.substr(4).contains('d'));
+  EXPECT_FALSE(Tmp.substr(4).contains('e'));
+
+  EXPECT_FALSE(Tmp.substr(5).contains('a'));
+  EXPECT_FALSE(Tmp.substr(5).contains('b'));
+  EXPECT_FALSE(Tmp.substr(5).contains('c'));
+  EXPECT_FALSE(Tmp.substr(5).contains('d'));
+  EXPECT_FALSE(Tmp.substr(5).contains('e'));
+
+  EXPECT_FALSE(Tmp.substr(6).contains('a'));
+  EXPECT_FALSE(Tmp.substr(6).contains('b'));
+  EXPECT_FALSE(Tmp.substr(6).contains('c'));
+  EXPECT_FALSE(Tmp.substr(6).contains('d'));
+  EXPECT_FALSE(Tmp.substr(6).contains('e'));
+}
diff --git a/libc/test/src/__support/blockstore_test.cpp b/libc/test/src/__support/blockstore_test.cpp
index f62857275fe4c..5fe8fef1b6edc 100644
--- a/libc/test/src/__support/blockstore_test.cpp
+++ b/libc/test/src/__support/blockstore_test.cpp
@@ -19,7 +19,7 @@ class LlvmLibcBlockStoreTest : public LIBC_NAMESPACE::testing::Test {
 public:
   template <size_t BLOCK_SIZE, size_t ELEMENT_COUNT, bool REVERSE>
   void populate_and_iterate() {
-    LIBC_NAMESPACE::cpp::BlockStore<Element, BLOCK_SIZE, REVERSE> block_store;
+    LIBC_NAMESPACE::BlockStore<Element, BLOCK_SIZE, REVERSE> block_store;
     for (int i = 0; i < int(ELEMENT_COUNT); ++i)
       ASSERT_TRUE(block_store.push_back({i, 2 * i, 3 * unsigned(i)}));
     auto end = block_store.end();
@@ -38,12 +38,12 @@ class LlvmLibcBlockStoreTest : public LIBC_NAMESPACE::testing::Test {
       }
     }
     ASSERT_EQ(i, int(ELEMENT_COUNT));
-    LIBC_NAMESPACE::cpp::BlockStore<Element, BLOCK_SIZE, REVERSE>::destroy(
+    LIBC_NAMESPACE::BlockStore<Element, BLOCK_SIZE, REVERSE>::destroy(
         &block_store);
   }
 
   template <bool REVERSE> void back_test() {
-    using LIBC_NAMESPACE::cpp::BlockStore;
+    using LIBC_NAMESPACE::BlockStore;
     BlockStore<int, 4, REVERSE> block_store;
     for (int i = 0; i < 20; i++)
       ASSERT_TRUE(block_store.push_back(i));
@@ -53,7 +53,7 @@ class LlvmLibcBlockStoreTest : public LIBC_NAMESPACE::testing::Test {
   }
 
   template <bool REVERSE> void empty_test() {
-    using LIBC_NAMESPACE::cpp::BlockStore;
+    using LIBC_NAMESPACE::BlockStore;
     BlockStore<int, 2, REVERSE> block_store;
 
     ASSERT_TRUE(block_store.empty());
diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp
index 851656e9fcbbb..5764324ca2881 100644
--- a/libc/test/src/__support/uint_test.cpp
+++ b/libc/test/src/__support/uint_test.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/UInt.h"
 #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
 
+#include "include/llvm-libc-macros/math-macros.h" // HUGE_VALF, HUGE_VALF
 #include "test/UnitTest/Test.h"
-#include <math.h> // HUGE_VALF, HUGE_VALF
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 7b952901da76e..026bcd12928bd 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1518,6 +1518,8 @@ add_fp_unittest(
   DEPENDS
     libc.include.math
     libc.src.math.generic.explogxf
+    libc.src.math.fabs
+    libc.src.math.fabsf
     libc.src.__support.FPUtil.fp_bits
 )
 
@@ -1643,20 +1645,6 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
-add_fp_unittest(
-  inv_trigf_utils_test
-  NEED_MPFR
-  SUITE
-    libc-math-unittests
-  HDRS
-    in_float_range_test_helper.h
-  SRCS
-    inv_trigf_utils_test.cpp
-  DEPENDS
-    libc.src.math.generic.inv_trigf_utils
-    libc.src.__support.FPUtil.fp_bits
-)
-
 add_fp_unittest(
   scalbn_test
   NEED_MPFR
diff --git a/libc/test/src/math/CeilTest.h b/libc/test/src/math/CeilTest.h
index 5ea4f349d008b..74cc90614dfc2 100644
--- a/libc/test/src/math/CeilTest.h
+++ b/libc/test/src/math/CeilTest.h
@@ -10,7 +10,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/CopySignTest.h b/libc/test/src/math/CopySignTest.h
index 8b81e8d7de252..206626d66f580 100644
--- a/libc/test/src/math/CopySignTest.h
+++ b/libc/test/src/math/CopySignTest.h
@@ -10,7 +10,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FAbsTest.h b/libc/test/src/math/FAbsTest.h
index 54f5f87e08e7e..942991f23be1c 100644
--- a/libc/test/src/math/FAbsTest.h
+++ b/libc/test/src/math/FAbsTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FDimTest.h b/libc/test/src/math/FDimTest.h
index e00b4fd5c42ec..76f0f18bbc68d 100644
--- a/libc/test/src/math/FDimTest.h
+++ b/libc/test/src/math/FDimTest.h
@@ -6,11 +6,11 @@
 //
 //===---------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 template <typename T>
 class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/FMaxTest.h b/libc/test/src/math/FMaxTest.h
index f8046f380f5fd..2c7dc3dc13ec5 100644
--- a/libc/test/src/math/FMaxTest.h
+++ b/libc/test/src/math/FMaxTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FMinTest.h b/libc/test/src/math/FMinTest.h
index 7a6534f320c92..a986d5240d0da 100644
--- a/libc/test/src/math/FMinTest.h
+++ b/libc/test/src/math/FMinTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FModTest.h b/libc/test/src/math/FModTest.h
index 2b1442923268d..96ad299258a17 100644
--- a/libc/test/src/math/FModTest.h
+++ b/libc/test/src/math/FModTest.h
@@ -14,7 +14,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 #define TEST_SPECIAL(x, y, expected, dom_err, expected_exception)              \
   EXPECT_FP_EQ(expected, f(x, y));                                             \
diff --git a/libc/test/src/math/FloorTest.h b/libc/test/src/math/FloorTest.h
index 66b37d69d7ba3..21ae291e61bc7 100644
--- a/libc/test/src/math/FloorTest.h
+++ b/libc/test/src/math/FloorTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FrexpTest.h b/libc/test/src/math/FrexpTest.h
index f3a64ce4aac31..f971b45628f09 100644
--- a/libc/test/src/math/FrexpTest.h
+++ b/libc/test/src/math/FrexpTest.h
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/HypotTest.h b/libc/test/src/math/HypotTest.h
index 8f84024a6ee11..46fcc462a2793 100644
--- a/libc/test/src/math/HypotTest.h
+++ b/libc/test/src/math/HypotTest.h
@@ -14,7 +14,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/ILogbTest.h b/libc/test/src/math/ILogbTest.h
index 3e2db33e2c052..dcc9d554eb3c2 100644
--- a/libc/test/src/math/ILogbTest.h
+++ b/libc/test/src/math/ILogbTest.h
@@ -9,11 +9,11 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/limits.h" // INT_MAX
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
 public:
diff --git a/libc/test/src/math/LdExpTest.h b/libc/test/src/math/LdExpTest.h
index fe84b5f4c192a..738135d6afe27 100644
--- a/libc/test/src/math/LdExpTest.h
+++ b/libc/test/src/math/LdExpTest.h
@@ -15,7 +15,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 #include <stdint.h>
 
 template <typename T>
diff --git a/libc/test/src/math/LogbTest.h b/libc/test/src/math/LogbTest.h
index d64c5c44e4281..3859b56582e5e 100644
--- a/libc/test/src/math/LogbTest.h
+++ b/libc/test/src/math/LogbTest.h
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/ModfTest.h b/libc/test/src/math/ModfTest.h
index a7e5e8810611b..84e26db49695d 100644
--- a/libc/test/src/math/ModfTest.h
+++ b/libc/test/src/math/ModfTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/NextAfterTest.h b/libc/test/src/math/NextAfterTest.h
index 2f1450a16fd1e..d45d819bfdb6c 100644
--- a/libc/test/src/math/NextAfterTest.h
+++ b/libc/test/src/math/NextAfterTest.h
@@ -9,13 +9,13 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 template <typename T>
 class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/RIntTest.h b/libc/test/src/math/RIntTest.h
index 3b16b902bcf5d..d392d4fb14a2e 100644
--- a/libc/test/src/math/RIntTest.h
+++ b/libc/test/src/math/RIntTest.h
@@ -15,8 +15,8 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include <fenv.h>
-#include <math.h>
 #include <stdio.h>
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
diff --git a/libc/test/src/math/RemQuoTest.h b/libc/test/src/math/RemQuoTest.h
index 7b3011d23055d..d61b97554199e 100644
--- a/libc/test/src/math/RemQuoTest.h
+++ b/libc/test/src/math/RemQuoTest.h
@@ -9,12 +9,12 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/RoundTest.h b/libc/test/src/math/RoundTest.h
index b255ecc4fa84e..17da00f869d3b 100644
--- a/libc/test/src/math/RoundTest.h
+++ b/libc/test/src/math/RoundTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h
index 9bd4ba52f61ba..017f5867fc8de 100644
--- a/libc/test/src/math/RoundToIntegerTest.h
+++ b/libc/test/src/math/RoundToIntegerTest.h
@@ -15,8 +15,8 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include <errno.h>
-#include <math.h>
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/SqrtTest.h b/libc/test/src/math/SqrtTest.h
index 75eb411810f5e..9811b2767ee33 100644
--- a/libc/test/src/math/SqrtTest.h
+++ b/libc/test/src/math/SqrtTest.h
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/TruncTest.h b/libc/test/src/math/TruncTest.h
index 6d0ea1182ec11..c3a89dbb837b5 100644
--- a/libc/test/src/math/TruncTest.h
+++ b/libc/test/src/math/TruncTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp
index c273184d58f3e..6f8321bd7182a 100644
--- a/libc/test/src/math/acosf_test.cpp
+++ b/libc/test/src/math/acosf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/acosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp
index a0e845b2b247e..41d1166fb430d 100644
--- a/libc/test/src/math/acoshf_test.cpp
+++ b/libc/test/src/math/acoshf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/acoshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp
index a24fdcc36e140..4e36f03f48955 100644
--- a/libc/test/src/math/asinf_test.cpp
+++ b/libc/test/src/math/asinf_test.cpp
@@ -7,13 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/asinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp
index 3127861c9a1be..9a3bfbed1068d 100644
--- a/libc/test/src/math/asinhf_test.cpp
+++ b/libc/test/src/math/asinhf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/asinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp
index 1fa7165805c7d..e51932fc49552 100644
--- a/libc/test/src/math/atanf_test.cpp
+++ b/libc/test/src/math/atanf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/atanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
@@ -53,8 +53,15 @@ TEST_F(LlvmLibcAtanfTest, InFloatRange) {
 
 // For small values, tanh(x) is x.
 TEST_F(LlvmLibcAtanfTest, SpecialValues) {
-  uint32_t val_arr[] = {0x3d8d6b23U, 0x3feefcfbU, 0xbd8d6b23U,
-                        0xbfeefcfbU, 0x7F800000U, 0xFF800000U};
+  uint32_t val_arr[] = {
+      0x3d8d6b23U, // x = 0x1.1ad646p-4f
+      0x3feefcfbU, // x = 0x1.ddf9f6p+0f
+      0xbd8d6b23U, // x = -0x1.1ad646p-4f
+      0xbfeefcfbU, // x = -0x1.ddf9f6p+0f
+      0x7F800000U, // x = +Inf
+      0xFF800000U, // x = -Inf
+      0xbffe2ec1U, // x = -0x1.fc5d82p+0f
+  };
   for (uint32_t v : val_arr) {
     float x = FPBits(v).get_val();
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan, x,
diff --git a/libc/test/src/math/atanhf_test.cpp b/libc/test/src/math/atanhf_test.cpp
index 1b45436094da4..39c067f3e764c 100644
--- a/libc/test/src/math/atanhf_test.cpp
+++ b/libc/test/src/math/atanhf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/atanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/cos_test.cpp b/libc/test/src/math/cos_test.cpp
index 1f55e9e9a1495..6a1122997c51a 100644
--- a/libc/test/src/math/cos_test.cpp
+++ b/libc/test/src/math/cos_test.cpp
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 using LlvmLibcCosTest = LIBC_NAMESPACE::testing::FPTest<double>;
 
diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp
index 93ab06dc80b26..8a5eb17fdcea5 100644
--- a/libc/test/src/math/cosf_test.cpp
+++ b/libc/test/src/math/cosf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/cosf.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/Test.h"
 #include "test/src/math/sdcomp26094.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp
index 3b8e14fb5f034..8792f56b03461 100644
--- a/libc/test/src/math/coshf_test.cpp
+++ b/libc/test/src/math/coshf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/erff_test.cpp b/libc/test/src/math/erff_test.cpp
index 8ebde4ec24fb5..1e43c206aef0d 100644
--- a/libc/test/src/math/erff_test.cpp
+++ b/libc/test/src/math/erff_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/erff.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt
index 84a5ebc75ef04..df32dd4f943f3 100644
--- a/libc/test/src/math/exhaustive/CMakeLists.txt
+++ b/libc/test/src/math/exhaustive/CMakeLists.txt
@@ -273,6 +273,7 @@ add_fp_unittest(
     fmod_generic_impl_test.cpp
   DEPENDS
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.manipulation_functions
     libc.src.__support.FPUtil.generic.fmod
 )
 
diff --git a/libc/test/src/math/exhaustive/atanf_test.cpp b/libc/test/src/math/exhaustive/atanf_test.cpp
index 508c288b050c5..6f23231d42674 100644
--- a/libc/test/src/math/exhaustive/atanf_test.cpp
+++ b/libc/test/src/math/exhaustive/atanf_test.cpp
@@ -25,7 +25,7 @@ TEST_F(LlvmLibcAtanfExhaustiveTest, PostiveRange) {
 }
 
 // Range: [-Inf, 0];
-static constexpr uint32_t NEG_START = 0xb000'0000U;
+static constexpr uint32_t NEG_START = 0x8000'0000U;
 static constexpr uint32_t NEG_STOP = 0xff80'0000U;
 
 TEST_F(LlvmLibcAtanfExhaustiveTest, NegativeRange) {
diff --git a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
index 25a5e3898599a..c7aec5b7bc21b 100644
--- a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
+++ b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
@@ -6,53 +6,55 @@
 //
 //===----------------------------------------------------------------------===//
 #include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h" // ldexp
 #include "src/__support/FPUtil/generic/FMod.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
 #include <array>
-#include <limits>
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 template <typename T, bool InverseMultiplication>
 class LlvmLibcFModTest : public LIBC_NAMESPACE::testing::Test {
 
-  using U = typename LIBC_NAMESPACE::fputil::FPBits<T>::StorageType;
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
+  using U = typename FPBits::StorageType;
   using DivisionHelper = LIBC_NAMESPACE::cpp::conditional_t<
       InverseMultiplication,
       LIBC_NAMESPACE::fputil::generic::FModDivisionInvMultHelper<U>,
       LIBC_NAMESPACE::fputil::generic::FModDivisionSimpleHelper<U>>;
 
-  static constexpr std::array<T, 11> test_bases = {
+  static constexpr std::array<T, 11> TEST_BASES = {
       T(0.0),
       T(1.0),
       T(3.0),
       T(27.0),
       T(11.0 / 8.0),
       T(2.764443),
-      T(1.0) - std::numeric_limits<T>::epsilon(),
-      T(1.0) + std::numeric_limits<T>::epsilon(),
-      T(M_PI),
-      T(M_SQRT2),
-      T(M_E)};
+      T(1.0) - T(0x1.0p-23) - T(0x1.0p-52) - T(0x1.0p-112),
+      T(1.0) + T(0x1.0p-23) + T(0x1.0p-52) + T(0x1.0p-112),
+      T(3.14159265),
+      T(1.41421356),
+      T(2.71828183)};
 
 public:
   void testExtensive() {
     using FMod = LIBC_NAMESPACE::fputil::generic::FMod<T, U, DivisionHelper>;
-    using nl = std::numeric_limits<T>;
-    int min2 = nl::min_exponent - nl::digits - 5;
-    int max2 = nl::max_exponent + 3;
-    for (T by : test_bases) {
+    int min2 = -(FPBits::MAX_BIASED_EXPONENT + FPBits::SIG_LEN) / 2;
+    int max2 = 3 + FPBits::MAX_BIASED_EXPONENT / 2;
+    for (T by : TEST_BASES) {
       for (int iy = min2; iy < max2; iy++) {
-        T y = by * std::ldexp(2, iy);
-        if (y == 0 || !std::isfinite(y))
+        T y = by * LIBC_NAMESPACE::fputil::ldexp(2.0, iy);
+        FPBits y_bits(y);
+        if (y_bits.is_zero() || !y_bits.is_finite())
           continue;
-        for (T bx : test_bases) {
+        for (T bx : TEST_BASES) {
           for (int ix = min2; ix < max2; ix++) {
-            T x = bx * std::ldexp(2, ix);
-            if (!std::isfinite(x))
+            T x = bx * LIBC_NAMESPACE::fputil::ldexp(2.0, ix);
+            if (!FPBits(x).is_finite())
               continue;
             T result = FMod::eval(x, y);
             mpfr::BinaryInput<T> input{x, y};
diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp
index d71b5a2230374..778189626a617 100644
--- a/libc/test/src/math/exp10_test.cpp
+++ b/libc/test/src/math/exp10_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp
index 4e2d065f1292a..9d44e8f65decc 100644
--- a/libc/test/src/math/exp10f_test.cpp
+++ b/libc/test/src/math/exp10f_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp
index 2f9d7b3a2a6c9..845fda5451d4b 100644
--- a/libc/test/src/math/exp2_test.cpp
+++ b/libc/test/src/math/exp2_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp
index f5ea8554be5c0..f63f091eab9a8 100644
--- a/libc/test/src/math/exp2f_test.cpp
+++ b/libc/test/src/math/exp2f_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
 #include "src/errno/libc_errno.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp
index 006db00b91949..42018e608ae45 100644
--- a/libc/test/src/math/exp_test.cpp
+++ b/libc/test/src/math/exp_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp
index ffd9da500488b..634958bdc43e5 100644
--- a/libc/test/src/math/expf_test.cpp
+++ b/libc/test/src/math/expf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/explogxf_test.cpp b/libc/test/src/math/explogxf_test.cpp
index 24f9f3c0f8e45..a536a9f3ab8de 100644
--- a/libc/test/src/math/explogxf_test.cpp
+++ b/libc/test/src/math/explogxf_test.cpp
@@ -7,12 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "in_float_range_test_helper.h"
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/math/fabs.h"
+#include "src/math/fabsf.h"
 #include "src/math/generic/explogxf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 using LlvmLibcExplogfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
@@ -22,7 +24,7 @@ constexpr int def_count = 100003;
 constexpr float def_prec = 0.500001f;
 
 auto f_normal = [](float x) -> bool {
-  return !(isnan(x) || isinf(x) || fabs(x) < 2E-38);
+  return !(isnan(x) || isinf(x) || LIBC_NAMESPACE::fabs(x) < 2E-38);
 };
 
 TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) {
@@ -32,8 +34,8 @@ TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) {
     return static_cast<float>(result.mh * r);
   };
   auto f_check = [](float x) -> bool {
-    return !(
-        (isnan(x) || isinf(x) || x < -70 || x > 70 || fabsf(x) < 0x1.0p-10));
+    return !((isnan(x) || isinf(x) || x < -70 || x > 70 ||
+              LIBC_NAMESPACE::fabsf(x) < 0x1.0p-10));
   };
   CHECK_DATA(0.0f, neg_inf, mpfr::Operation::Exp, fx, f_check, def_count,
              def_prec);
diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp
index ccc6e36095481..198e6d5cdd8ab 100644
--- a/libc/test/src/math/expm1_test.cpp
+++ b/libc/test/src/math/expm1_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expm1.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp
index 94a9301f8eb22..c72815887ba8b 100644
--- a/libc/test/src/math/expm1f_test.cpp
+++ b/libc/test/src/math/expm1f_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expm1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/fdim_test.cpp b/libc/test/src/math/fdim_test.cpp
index 2f00a30ad1ee6..6c0c3e204c5f9 100644
--- a/libc/test/src/math/fdim_test.cpp
+++ b/libc/test/src/math/fdim_test.cpp
@@ -8,11 +8,11 @@
 
 #include "FDimTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdim.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 using LlvmLibcFDimTest = FDimTestTemplate<double>;
 
diff --git a/libc/test/src/math/fdimf_test.cpp b/libc/test/src/math/fdimf_test.cpp
index 27511baf25b6d..a74011b5a2249 100644
--- a/libc/test/src/math/fdimf_test.cpp
+++ b/libc/test/src/math/fdimf_test.cpp
@@ -8,11 +8,11 @@
 
 #include "FDimTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdimf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 using LlvmLibcFDimTest = FDimTestTemplate<float>;
 
diff --git a/libc/test/src/math/fdiml_test.cpp b/libc/test/src/math/fdiml_test.cpp
index 45aedb0a1cdea..d3f2e68a7c1d7 100644
--- a/libc/test/src/math/fdiml_test.cpp
+++ b/libc/test/src/math/fdiml_test.cpp
@@ -8,11 +8,11 @@
 
 #include "FDimTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdiml.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 using LlvmLibcFDimTest = FDimTestTemplate<long double>;
 
diff --git a/libc/test/src/math/ilogb_test.cpp b/libc/test/src/math/ilogb_test.cpp
index 7011c43386e66..45756ffa3d9a7 100644
--- a/libc/test/src/math/ilogb_test.cpp
+++ b/libc/test/src/math/ilogb_test.cpp
@@ -8,12 +8,12 @@
 
 #include "ILogbTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/math/ilogb.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogb) {
   test_special_numbers<double>(&LIBC_NAMESPACE::ilogb);
diff --git a/libc/test/src/math/ilogbf_test.cpp b/libc/test/src/math/ilogbf_test.cpp
index dcff8eeb15180..ff19dd145a198 100644
--- a/libc/test/src/math/ilogbf_test.cpp
+++ b/libc/test/src/math/ilogbf_test.cpp
@@ -8,12 +8,12 @@
 
 #include "ILogbTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/math/ilogbf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbf) {
   test_special_numbers<float>(&LIBC_NAMESPACE::ilogbf);
diff --git a/libc/test/src/math/ilogbl_test.cpp b/libc/test/src/math/ilogbl_test.cpp
index 29a221ad7f08f..b2c5246669946 100644
--- a/libc/test/src/math/ilogbl_test.cpp
+++ b/libc/test/src/math/ilogbl_test.cpp
@@ -8,12 +8,12 @@
 
 #include "ILogbTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/math/ilogbl.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbl) {
   test_special_numbers<long double>(&LIBC_NAMESPACE::ilogbl);
diff --git a/libc/test/src/math/inv_trigf_utils_test.cpp b/libc/test/src/math/inv_trigf_utils_test.cpp
deleted file mode 100644
index 23420edcd0ca1..0000000000000
--- a/libc/test/src/math/inv_trigf_utils_test.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-//===-- Unittests for supfuncf --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "in_float_range_test_helper.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/math/generic/inv_trigf_utils.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
-
-using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
-
-namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
-
-constexpr int def_count = 100003;
-constexpr float def_prec = 0.500001f;
-
-auto f_normal = [](float x) -> bool { return !(isnan(x) || isinf(x)); };
-
-TEST_F(LlvmLibcAtanfTest, InPositiveRange) {
-  CHECK_DATA(0.0f, inf, mpfr::Operation::Atan, LIBC_NAMESPACE::atan_eval,
-             f_normal, def_count, def_prec);
-}
-
-TEST_F(LlvmLibcAtanfTest, InNegativeRange) {
-  CHECK_DATA(-0.0f, neg_inf, mpfr::Operation::Atan, LIBC_NAMESPACE::atan_eval,
-             f_normal, def_count, def_prec);
-}
diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp
index ed4a24b951595..dc4ac895546c4 100644
--- a/libc/test/src/math/log10_test.cpp
+++ b/libc/test/src/math/log10_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log10f_test.cpp b/libc/test/src/math/log10f_test.cpp
index c38a5159682cf..f8a137e44c351 100644
--- a/libc/test/src/math/log10f_test.cpp
+++ b/libc/test/src/math/log10f_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/log10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp
index d769659356c94..975fb8e05c35e 100644
--- a/libc/test/src/math/log1p_test.cpp
+++ b/libc/test/src/math/log1p_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log1p.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp
index 2f6330ee6ce63..a1108fee48196 100644
--- a/libc/test/src/math/log1pf_test.cpp
+++ b/libc/test/src/math/log1pf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log1pf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp
index f4ee0ff5185b4..8765279005798 100644
--- a/libc/test/src/math/log2_test.cpp
+++ b/libc/test/src/math/log2_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp
index d8b4808f5cda1..c05b6b93cff77 100644
--- a/libc/test/src/math/log2f_test.cpp
+++ b/libc/test/src/math/log2f_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp
index b1f1ab775462b..06a0dc574be51 100644
--- a/libc/test/src/math/log_test.cpp
+++ b/libc/test/src/math/log_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/logf_test.cpp b/libc/test/src/math/logf_test.cpp
index 3ab67ba807823..1ab480744ba59 100644
--- a/libc/test/src/math/logf_test.cpp
+++ b/libc/test/src/math/logf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/logf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp
index 3dffeb603499f..cf674ecf8f99e 100644
--- a/libc/test/src/math/powf_test.cpp
+++ b/libc/test/src/math/powf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/powf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp
index 69981ed22ee69..fa1c5370c30fb 100644
--- a/libc/test/src/math/sin_test.cpp
+++ b/libc/test/src/math/sin_test.cpp
@@ -12,7 +12,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 using LlvmLibcSinTest = LIBC_NAMESPACE::testing::FPTest<double>;
 
diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp
index 2c0c7eaaa25f0..a7372fd53b319 100644
--- a/libc/test/src/math/sincosf_test.cpp
+++ b/libc/test/src/math/sincosf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sincosf.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/Test.h"
 #include "test/src/math/sdcomp26094.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp
index 94c1114e68898..a3c5384e3e626 100644
--- a/libc/test/src/math/sinf_test.cpp
+++ b/libc/test/src/math/sinf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sinf.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/Test.h"
 #include "test/src/math/sdcomp26094.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp
index f14150bb04420..bea976055dbdf 100644
--- a/libc/test/src/math/sinhf_test.cpp
+++ b/libc/test/src/math/sinhf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 293e65abd44f5..85dacce3b21dc 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -1506,6 +1506,22 @@ add_fp_unittest(
   UNIT_TEST_ONLY
 )
 
+add_fp_unittest(
+  nanf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nanf128_test.cpp
+  DEPENDS
+    libc.include.math
+    libc.include.signal
+    libc.src.math.nanf128
+    libc.src.__support.FPUtil.fp_bits
+  # FIXME: The nan tests currently have death tests, which aren't supported for
+  # hermetic tests.
+  UNIT_TEST_ONLY
+)
+
 add_fp_unittest(
   nextafter_test
   SUITE
@@ -1614,6 +1630,118 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  nextdown_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextdown_test.cpp
+  HDRS
+    NextDownTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextdown
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextdownf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextdownf_test.cpp
+  HDRS
+    NextDownTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextdownf
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextdownl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextdownl_test.cpp
+  HDRS
+    NextDownTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextdownl
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextdownf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextdownf128_test.cpp
+  HDRS
+    NextDownTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextdownf128
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextup_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextup_test.cpp
+  HDRS
+    NextUpTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextup
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextupf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextupf_test.cpp
+  HDRS
+    NextUpTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextupf
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextupl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextupl_test.cpp
+  HDRS
+    NextUpTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextupl
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextupf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextupf128_test.cpp
+  HDRS
+    NextUpTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextupf128
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
 # TODO(lntue): The current implementation of fputil::general::fma<float> is only
 # correctly rounded for the default rounding mode round-to-nearest tie-to-even.
 add_fp_unittest(
diff --git a/libc/test/src/math/smoke/CeilTest.h b/libc/test/src/math/smoke/CeilTest.h
index 5248dbca50370..ec70258fddec1 100644
--- a/libc/test/src/math/smoke/CeilTest.h
+++ b/libc/test/src/math/smoke/CeilTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class CeilTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/CopySignTest.h b/libc/test/src/math/smoke/CopySignTest.h
index 9ee34338ba807..70a6a419e0a03 100644
--- a/libc/test/src/math/smoke/CopySignTest.h
+++ b/libc/test/src/math/smoke/CopySignTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T>
 class CopySignTest : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/smoke/FAbsTest.h b/libc/test/src/math/smoke/FAbsTest.h
index cf05882e22f97..9309c2ada4a11 100644
--- a/libc/test/src/math/smoke/FAbsTest.h
+++ b/libc/test/src/math/smoke/FAbsTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class FAbsTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h
index 2b1442923268d..96ad299258a17 100644
--- a/libc/test/src/math/smoke/FModTest.h
+++ b/libc/test/src/math/smoke/FModTest.h
@@ -14,7 +14,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 #define TEST_SPECIAL(x, y, expected, dom_err, expected_exception)              \
   EXPECT_FP_EQ(expected, f(x, y));                                             \
diff --git a/libc/test/src/math/smoke/FloorTest.h b/libc/test/src/math/smoke/FloorTest.h
index 610f5c206ed3a..8886e8e751836 100644
--- a/libc/test/src/math/smoke/FloorTest.h
+++ b/libc/test/src/math/smoke/FloorTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class FloorTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h
index 619879188a3b3..43499267b7113 100644
--- a/libc/test/src/math/smoke/HypotTest.h
+++ b/libc/test/src/math/smoke/HypotTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T>
 class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/smoke/ModfTest.h b/libc/test/src/math/smoke/ModfTest.h
index d7e15a7ed6820..107963665b835 100644
--- a/libc/test/src/math/smoke/ModfTest.h
+++ b/libc/test/src/math/smoke/ModfTest.h
@@ -11,7 +11,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class ModfTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h
index dda86eeeb6e03..23b3b15347407 100644
--- a/libc/test/src/math/smoke/NextAfterTest.h
+++ b/libc/test/src/math/smoke/NextAfterTest.h
@@ -9,13 +9,13 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #define ASSERT_FP_EQ_WITH_EXCEPTION(result, expected, expected_exception)      \
   ASSERT_FP_EQ(result, expected);                                              \
diff --git a/libc/test/src/math/smoke/NextDownTest.h b/libc/test/src/math/smoke/NextDownTest.h
new file mode 100644
index 0000000000000..c678ab1db1def
--- /dev/null
+++ b/libc/test/src/math/smoke/NextDownTest.h
@@ -0,0 +1,43 @@
+//===-- Utility class to test different flavors of nextdown -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTDOWNTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_NEXTDOWNTEST_H
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+template <typename T>
+class NextDownTestTemplate : public LIBC_NAMESPACE::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*NextDownFunc)(T);
+
+  void testNaN(NextDownFunc func) { ASSERT_FP_EQ(func(aNaN), aNaN); }
+
+  void testBoundaries(NextDownFunc func) {
+    ASSERT_FP_EQ(zero, func(min_denormal));
+
+    ASSERT_FP_EQ(neg_min_denormal, func(zero));
+    ASSERT_FP_EQ(neg_min_denormal, func(neg_zero));
+
+    ASSERT_FP_EQ(neg_max_normal, func(neg_max_normal));
+    ASSERT_FP_EQ(neg_inf, func(neg_inf));
+
+    ASSERT_FP_EQ(max_normal, func(inf));
+  }
+};
+
+#define LIST_NEXTDOWN_TESTS(T, func)                                           \
+  using LlvmLibcNextDownTest = NextDownTestTemplate<T>;                        \
+  TEST_F(LlvmLibcNextDownTest, TestNaN) { testNaN(&func); }                    \
+  TEST_F(LlvmLibcNextDownTest, TestBoundaries) { testBoundaries(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_NEXTDOWNTEST_H
diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h
index 42a9a56aeb0a7..caf98262c5d15 100644
--- a/libc/test/src/math/smoke/NextTowardTest.h
+++ b/libc/test/src/math/smoke/NextTowardTest.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTTOWARDTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_NEXTTOWARDTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/BasicOperations.h"
@@ -16,7 +17,6 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include <fenv.h>
-#include <math.h>
 
 #define ASSERT_FP_EQ_WITH_EXCEPTION(result, expected, expected_exception)      \
   ASSERT_FP_EQ(result, expected);                                              \
diff --git a/libc/test/src/math/smoke/NextUpTest.h b/libc/test/src/math/smoke/NextUpTest.h
new file mode 100644
index 0000000000000..ebbdb5c73def9
--- /dev/null
+++ b/libc/test/src/math/smoke/NextUpTest.h
@@ -0,0 +1,43 @@
+//===-- Utility class to test different flavors of nextup -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTUPTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_NEXTUPTEST_H
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+template <typename T>
+class NextUpTestTemplate : public LIBC_NAMESPACE::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*NextUpFunc)(T);
+
+  void testNaN(NextUpFunc func) { ASSERT_FP_EQ(func(aNaN), aNaN); }
+
+  void testBoundaries(NextUpFunc func) {
+    ASSERT_FP_EQ(neg_zero, func(neg_min_denormal));
+
+    ASSERT_FP_EQ(min_denormal, func(zero));
+    ASSERT_FP_EQ(min_denormal, func(neg_zero));
+
+    ASSERT_FP_EQ(max_normal, func(max_normal));
+    ASSERT_FP_EQ(inf, func(inf));
+
+    ASSERT_FP_EQ(neg_max_normal, func(neg_inf));
+  }
+};
+
+#define LIST_NEXTUP_TESTS(T, func)                                             \
+  using LlvmLibcNextUpTest = NextUpTestTemplate<T>;                            \
+  TEST_F(LlvmLibcNextUpTest, TestNaN) { testNaN(&func); }                      \
+  TEST_F(LlvmLibcNextUpTest, TestBoundaries) { testBoundaries(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_NEXTUPTEST_H
diff --git a/libc/test/src/math/smoke/RIntTest.h b/libc/test/src/math/smoke/RIntTest.h
index 4c90dffa39cb1..903fbe9ce3430 100644
--- a/libc/test/src/math/smoke/RIntTest.h
+++ b/libc/test/src/math/smoke/RIntTest.h
@@ -14,8 +14,8 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include <fenv.h>
-#include <math.h>
 #include <stdio.h>
 
 static constexpr int ROUNDING_MODES[4] = {FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO,
diff --git a/libc/test/src/math/smoke/RemQuoTest.h b/libc/test/src/math/smoke/RemQuoTest.h
index 87551faeda9cf..a9fa405b27000 100644
--- a/libc/test/src/math/smoke/RemQuoTest.h
+++ b/libc/test/src/math/smoke/RemQuoTest.h
@@ -9,11 +9,11 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 template <typename T>
 class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/smoke/RoundTest.h b/libc/test/src/math/smoke/RoundTest.h
index d2a5906b1e29e..8cf96f4569034 100644
--- a/libc/test/src/math/smoke/RoundTest.h
+++ b/libc/test/src/math/smoke/RoundTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class RoundTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h
index e86533ca09e17..1b5135d016cc4 100644
--- a/libc/test/src/math/smoke/RoundToIntegerTest.h
+++ b/libc/test/src/math/smoke/RoundToIntegerTest.h
@@ -14,8 +14,8 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include <errno.h>
-#include <math.h>
 
 static constexpr int ROUNDING_MODES[4] = {FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO,
                                           FE_TONEAREST};
diff --git a/libc/test/src/math/smoke/SqrtTest.h b/libc/test/src/math/smoke/SqrtTest.h
index edb6e74236e31..eea5dc1534e0b 100644
--- a/libc/test/src/math/smoke/SqrtTest.h
+++ b/libc/test/src/math/smoke/SqrtTest.h
@@ -10,7 +10,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class SqrtTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/TruncTest.h b/libc/test/src/math/smoke/TruncTest.h
index 71b1ab9df3f08..5612d27fef21d 100644
--- a/libc/test/src/math/smoke/TruncTest.h
+++ b/libc/test/src/math/smoke/TruncTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class TruncTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/acosf_test.cpp b/libc/test/src/math/smoke/acosf_test.cpp
index 864c9ea4d3175..573a2c39492f0 100644
--- a/libc/test/src/math/smoke/acosf_test.cpp
+++ b/libc/test/src/math/smoke/acosf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/acosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/acoshf_test.cpp b/libc/test/src/math/smoke/acoshf_test.cpp
index b3ba740c2024d..f561f23eb99ad 100644
--- a/libc/test/src/math/smoke/acoshf_test.cpp
+++ b/libc/test/src/math/smoke/acoshf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/acoshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/asinf_test.cpp b/libc/test/src/math/smoke/asinf_test.cpp
index e6afb23a397fe..39d25e72c143b 100644
--- a/libc/test/src/math/smoke/asinf_test.cpp
+++ b/libc/test/src/math/smoke/asinf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/asinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/asinhf_test.cpp b/libc/test/src/math/smoke/asinhf_test.cpp
index 7a520cdc050a2..9637bfa539488 100644
--- a/libc/test/src/math/smoke/asinhf_test.cpp
+++ b/libc/test/src/math/smoke/asinhf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/asinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp
index 8cc1b418f869c..abd9835d38a05 100644
--- a/libc/test/src/math/smoke/atanf_test.cpp
+++ b/libc/test/src/math/smoke/atanf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/atanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp
index e1d6d2532d39e..df0746e0c9c3a 100644
--- a/libc/test/src/math/smoke/atanhf_test.cpp
+++ b/libc/test/src/math/smoke/atanhf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/atanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/cosf_test.cpp b/libc/test/src/math/smoke/cosf_test.cpp
index 539c428eadf11..62132990ed547 100644
--- a/libc/test/src/math/smoke/cosf_test.cpp
+++ b/libc/test/src/math/smoke/cosf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/cosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp
index e38802f936474..9d7ef505ae749 100644
--- a/libc/test/src/math/smoke/coshf_test.cpp
+++ b/libc/test/src/math/smoke/coshf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/coshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/erff_test.cpp b/libc/test/src/math/smoke/erff_test.cpp
index 843cc7f4c8c04..24778f8d653ad 100644
--- a/libc/test/src/math/smoke/erff_test.cpp
+++ b/libc/test/src/math/smoke/erff_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/erff.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/exp10_test.cpp b/libc/test/src/math/smoke/exp10_test.cpp
index daa7b5b48842b..fffffeb4c78ab 100644
--- a/libc/test/src/math/smoke/exp10_test.cpp
+++ b/libc/test/src/math/smoke/exp10_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp
index f6533e983dacd..c0dcc12503324 100644
--- a/libc/test/src/math/smoke/exp10f_test.cpp
+++ b/libc/test/src/math/smoke/exp10f_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/exp2_test.cpp b/libc/test/src/math/smoke/exp2_test.cpp
index 7bcc2a3cb6c74..d362d32f678b2 100644
--- a/libc/test/src/math/smoke/exp2_test.cpp
+++ b/libc/test/src/math/smoke/exp2_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp
index 7baab630a4cbc..e2989a6ec4d8a 100644
--- a/libc/test/src/math/smoke/exp2f_test.cpp
+++ b/libc/test/src/math/smoke/exp2f_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
 #include "src/errno/libc_errno.h"
 #include "src/math/exp2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/exp_test.cpp b/libc/test/src/math/smoke/exp_test.cpp
index 455cb2ea13532..a2becc74f526f 100644
--- a/libc/test/src/math/smoke/exp_test.cpp
+++ b/libc/test/src/math/smoke/exp_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp
index 2968704f83e35..42710c5fa404e 100644
--- a/libc/test/src/math/smoke/expf_test.cpp
+++ b/libc/test/src/math/smoke/expf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/expm1_test.cpp b/libc/test/src/math/smoke/expm1_test.cpp
index 4581060075e32..07963ec2d34c8 100644
--- a/libc/test/src/math/smoke/expm1_test.cpp
+++ b/libc/test/src/math/smoke/expm1_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expm1.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp
index 4ef8d50ba551a..82e0b15463504 100644
--- a/libc/test/src/math/smoke/expm1f_test.cpp
+++ b/libc/test/src/math/smoke/expm1f_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expm1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/log10_test.cpp b/libc/test/src/math/smoke/log10_test.cpp
index c1658a3d5c965..36d7534197648 100644
--- a/libc/test/src/math/smoke/log10_test.cpp
+++ b/libc/test/src/math/smoke/log10_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log10f_test.cpp b/libc/test/src/math/smoke/log10f_test.cpp
index 53950233cabf5..53e699417fb7c 100644
--- a/libc/test/src/math/smoke/log10f_test.cpp
+++ b/libc/test/src/math/smoke/log10f_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/log10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log1p_test.cpp b/libc/test/src/math/smoke/log1p_test.cpp
index e1966c28397c0..5fe9c60f90abf 100644
--- a/libc/test/src/math/smoke/log1p_test.cpp
+++ b/libc/test/src/math/smoke/log1p_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log1p.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log1pf_test.cpp b/libc/test/src/math/smoke/log1pf_test.cpp
index 377a46adef9c7..e2fb2f057d2eb 100644
--- a/libc/test/src/math/smoke/log1pf_test.cpp
+++ b/libc/test/src/math/smoke/log1pf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log1pf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log2_test.cpp b/libc/test/src/math/smoke/log2_test.cpp
index 826c51eb8c1b0..fbeba9527bcb7 100644
--- a/libc/test/src/math/smoke/log2_test.cpp
+++ b/libc/test/src/math/smoke/log2_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log2f_test.cpp b/libc/test/src/math/smoke/log2f_test.cpp
index 2387ff59d70d9..46906e78dcaf7 100644
--- a/libc/test/src/math/smoke/log2f_test.cpp
+++ b/libc/test/src/math/smoke/log2f_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/log_test.cpp b/libc/test/src/math/smoke/log_test.cpp
index 423b9d8bb818a..b1e3905994800 100644
--- a/libc/test/src/math/smoke/log_test.cpp
+++ b/libc/test/src/math/smoke/log_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/logf_test.cpp b/libc/test/src/math/smoke/logf_test.cpp
index 21b39ee6896dd..97b6bdde307b3 100644
--- a/libc/test/src/math/smoke/logf_test.cpp
+++ b/libc/test/src/math/smoke/logf_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/logf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/nanf128_test.cpp b/libc/test/src/math/smoke/nanf128_test.cpp
new file mode 100644
index 0000000000000..2a9f57de5b43b
--- /dev/null
+++ b/libc/test/src/math/smoke/nanf128_test.cpp
@@ -0,0 +1,60 @@
+//===-- Unittests for nanf128 ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/UInt128.h"
+#include "src/math/nanf128.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+class LlvmLibcNanf128Test : public LIBC_NAMESPACE::testing::Test {
+public:
+  using FPBits128 = LIBC_NAMESPACE::fputil::FPBits<float128>;
+  using StorageType = FPBits128::StorageType;
+
+  const UInt128 QUIET_NAN = FPBits128::quiet_nan().uintval();
+  const UInt128 ONE = UInt128(1);
+
+  void run_test(const char *input_str, StorageType bits) {
+    float128 result = LIBC_NAMESPACE::nanf128(input_str);
+    auto actual_fp = FPBits128(result);
+    auto expected_fp = FPBits128(bits);
+    EXPECT_EQ(actual_fp.uintval(), expected_fp.uintval());
+  };
+};
+
+TEST_F(LlvmLibcNanf128Test, NCharSeq) {
+  run_test("", QUIET_NAN);
+  run_test("1234", QUIET_NAN | 1234);
+  run_test("0x1234", QUIET_NAN | 0x1234);
+  run_test("2417851639229258349412352", QUIET_NAN | (ONE << 81));
+  run_test("0x200000000000000000000", QUIET_NAN | (ONE << 81));
+  run_test("10384593717069655257060992658440191",
+           QUIET_NAN | FPBits128::SIG_MASK);
+  run_test("0x1ffffffffffffffffffffffffffff", QUIET_NAN | FPBits128::SIG_MASK);
+  run_test("10384593717069655257060992658440192", QUIET_NAN);
+  run_test("0x20000000000000000000000000000", QUIET_NAN);
+  run_test("1a", QUIET_NAN);
+  run_test("10000000000000000000000000000000000000000000000000", QUIET_NAN);
+}
+
+TEST_F(LlvmLibcNanf128Test, RandomString) {
+  run_test(" 1234", QUIET_NAN);
+  run_test("-1234", QUIET_NAN);
+  run_test("asd&f", QUIET_NAN);
+  run_test("123 ", QUIET_NAN);
+  run_test("1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM_",
+           QUIET_NAN);
+}
+
+#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+#include <signal.h>
+TEST_F(LlvmLibcNanf128Test, InvalidInput) {
+  EXPECT_DEATH([] { LIBC_NAMESPACE::nanf128(nullptr); }, WITH_SIGNAL(SIGSEGV));
+}
+#endif // LIBC_HAVE_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nextdown_test.cpp b/libc/test/src/math/smoke/nextdown_test.cpp
new file mode 100644
index 0000000000000..6b0f5c6c5a6aa
--- /dev/null
+++ b/libc/test/src/math/smoke/nextdown_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextdown --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextDownTest.h"
+
+#include "src/math/nextdown.h"
+
+LIST_NEXTDOWN_TESTS(double, LIBC_NAMESPACE::nextdown)
diff --git a/libc/test/src/math/smoke/nextdownf128_test.cpp b/libc/test/src/math/smoke/nextdownf128_test.cpp
new file mode 100644
index 0000000000000..932a8b36a59e4
--- /dev/null
+++ b/libc/test/src/math/smoke/nextdownf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextdownf128 ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextDownTest.h"
+
+#include "src/math/nextdownf128.h"
+
+LIST_NEXTDOWN_TESTS(float128, LIBC_NAMESPACE::nextdownf128)
diff --git a/libc/test/src/math/smoke/nextdownf_test.cpp b/libc/test/src/math/smoke/nextdownf_test.cpp
new file mode 100644
index 0000000000000..3c05c22d82049
--- /dev/null
+++ b/libc/test/src/math/smoke/nextdownf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextdownf -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextDownTest.h"
+
+#include "src/math/nextdownf.h"
+
+LIST_NEXTDOWN_TESTS(float, LIBC_NAMESPACE::nextdownf)
diff --git a/libc/test/src/math/smoke/nextdownl_test.cpp b/libc/test/src/math/smoke/nextdownl_test.cpp
new file mode 100644
index 0000000000000..f1785eb58ce51
--- /dev/null
+++ b/libc/test/src/math/smoke/nextdownl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextdownl -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextDownTest.h"
+
+#include "src/math/nextdownl.h"
+
+LIST_NEXTDOWN_TESTS(long double, LIBC_NAMESPACE::nextdownl)
diff --git a/libc/test/src/math/smoke/nextup_test.cpp b/libc/test/src/math/smoke/nextup_test.cpp
new file mode 100644
index 0000000000000..04c73ac9492f3
--- /dev/null
+++ b/libc/test/src/math/smoke/nextup_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextup ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextUpTest.h"
+
+#include "src/math/nextup.h"
+
+LIST_NEXTUP_TESTS(double, LIBC_NAMESPACE::nextup)
diff --git a/libc/test/src/math/smoke/nextupf128_test.cpp b/libc/test/src/math/smoke/nextupf128_test.cpp
new file mode 100644
index 0000000000000..ddd385a7b159b
--- /dev/null
+++ b/libc/test/src/math/smoke/nextupf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextupf128 ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextUpTest.h"
+
+#include "src/math/nextupf128.h"
+
+LIST_NEXTUP_TESTS(float128, LIBC_NAMESPACE::nextupf128)
diff --git a/libc/test/src/math/smoke/nextupf_test.cpp b/libc/test/src/math/smoke/nextupf_test.cpp
new file mode 100644
index 0000000000000..df73bee011711
--- /dev/null
+++ b/libc/test/src/math/smoke/nextupf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextupf ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextUpTest.h"
+
+#include "src/math/nextupf.h"
+
+LIST_NEXTUP_TESTS(float, LIBC_NAMESPACE::nextupf)
diff --git a/libc/test/src/math/smoke/nextupl_test.cpp b/libc/test/src/math/smoke/nextupl_test.cpp
new file mode 100644
index 0000000000000..50f765633c2a5
--- /dev/null
+++ b/libc/test/src/math/smoke/nextupl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextupl ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextUpTest.h"
+
+#include "src/math/nextupl.h"
+
+LIST_NEXTUP_TESTS(long double, LIBC_NAMESPACE::nextupl)
diff --git a/libc/test/src/math/smoke/powf_test.cpp b/libc/test/src/math/smoke/powf_test.cpp
index 1867dde0ac3be..e9de1554ec614 100644
--- a/libc/test/src/math/smoke/powf_test.cpp
+++ b/libc/test/src/math/smoke/powf_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/powf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/sincosf_test.cpp b/libc/test/src/math/smoke/sincosf_test.cpp
index a1447ad57e116..5952b20fc5bff 100644
--- a/libc/test/src/math/smoke/sincosf_test.cpp
+++ b/libc/test/src/math/smoke/sincosf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sincosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/sinf_test.cpp b/libc/test/src/math/smoke/sinf_test.cpp
index e51726646bc68..9450895041874 100644
--- a/libc/test/src/math/smoke/sinf_test.cpp
+++ b/libc/test/src/math/smoke/sinf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp
index 23334293b73fe..0f005f752e698 100644
--- a/libc/test/src/math/smoke/sinhf_test.cpp
+++ b/libc/test/src/math/smoke/sinhf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/tanf_test.cpp b/libc/test/src/math/smoke/tanf_test.cpp
index 53a153254b311..68bf493f7e822 100644
--- a/libc/test/src/math/smoke/tanf_test.cpp
+++ b/libc/test/src/math/smoke/tanf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/tanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/tanhf_test.cpp b/libc/test/src/math/smoke/tanhf_test.cpp
index 7cb80dca7919e..f1ce8b40d43ac 100644
--- a/libc/test/src/math/smoke/tanhf_test.cpp
+++ b/libc/test/src/math/smoke/tanhf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/tanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/tan_test.cpp b/libc/test/src/math/tan_test.cpp
index 9cdc7c4abd32c..85174db9364e3 100644
--- a/libc/test/src/math/tan_test.cpp
+++ b/libc/test/src/math/tan_test.cpp
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 using LlvmLibcTanTest = LIBC_NAMESPACE::testing::FPTest<double>;
 
diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp
index c8d9d52e3f695..d40bc44d6442f 100644
--- a/libc/test/src/math/tanf_test.cpp
+++ b/libc/test/src/math/tanf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/tanf.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/Test.h"
 #include "test/src/math/sdcomp26094.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp
index 28da7ffbeddd1..ef272b17d68ca 100644
--- a/libc/test/src/math/tanhf_test.cpp
+++ b/libc/test/src/math/tanhf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/tanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 6e1c86e070a82..3ccce16a76a2d 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -14,6 +14,7 @@ add_libc_test(
     libc.src.stdio.feof
     libc.src.stdio.ferror
     libc.src.stdio.fflush
+    libc.src.stdio.fileno
     libc.src.stdio.fopen
     libc.src.stdio.fputs
     libc.src.stdio.fread
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index f5dbc49818390..0fbe19cf08d83 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -11,6 +11,7 @@
 #include "src/stdio/feof.h"
 #include "src/stdio/ferror.h"
 #include "src/stdio/fflush.h"
+#include "src/stdio/fileno.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fputs.h"
 #include "src/stdio/fread.h"
@@ -30,6 +31,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
+  ASSERT_EQ(LIBC_NAMESPACE::fileno(file), 3);
   constexpr char CONTENT[] = "1234567890987654321";
   ASSERT_EQ(sizeof(CONTENT) - 1,
             LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT) - 1, file));
diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp
index ab8bdb23b1187..8dde95d02a96d 100644
--- a/libc/test/src/stdio/sprintf_test.cpp
+++ b/libc/test/src/stdio/sprintf_test.cpp
@@ -121,8 +121,8 @@ TEST(LlvmLibcSPrintfTest, StringConv) {
 
 #ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
   written = LIBC_NAMESPACE::sprintf(buff, "%s", nullptr);
-  EXPECT_EQ(written, 4);
-  ASSERT_STREQ(buff, "null");
+  EXPECT_EQ(written, 6);
+  ASSERT_STREQ(buff, "(null)");
 #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
 }
 
diff --git a/libc/test/src/sys/mman/linux/CMakeLists.txt b/libc/test/src/sys/mman/linux/CMakeLists.txt
index 6f7fc34d8d3ce..0762a2d846a8e 100644
--- a/libc/test/src/sys/mman/linux/CMakeLists.txt
+++ b/libc/test/src/sys/mman/linux/CMakeLists.txt
@@ -127,3 +127,23 @@ add_libc_unittest(
     libc.src.unistd.sysconf
     libc.test.UnitTest.ErrnoSetterMatcher
 )
+
+add_libc_unittest(
+  shm_test
+  SUITE
+    libc_sys_mman_unittests
+  SRCS
+    shm_test.cpp
+  DEPENDS
+    libc.include.sys_mman
+    libc.include.sys_syscall
+    libc.src.errno.errno
+    libc.src.sys.mman.shm_open
+    libc.src.sys.mman.shm_unlink
+    libc.src.sys.mman.mmap
+    libc.src.sys.mman.munmap
+    libc.src.unistd.ftruncate
+    libc.src.unistd.close
+    libc.src.__support.OSUtil.osutil
+    libc.test.UnitTest.ErrnoSetterMatcher
+)
diff --git a/libc/test/src/sys/mman/linux/shm_test.cpp b/libc/test/src/sys/mman/linux/shm_test.cpp
new file mode 100644
index 0000000000000..3b1a2aa33b56a
--- /dev/null
+++ b/libc/test/src/sys/mman/linux/shm_test.cpp
@@ -0,0 +1,77 @@
+//===-- Unittests for shm_open/shm_unlink ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/OSUtil/syscall.h"
+#include "src/sys/mman/mmap.h"
+#include "src/sys/mman/munmap.h"
+#include "src/sys/mman/shm_open.h"
+#include "src/sys/mman/shm_unlink.h"
+#include "src/unistd/close.h"
+#include "src/unistd/ftruncate.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/LibcTest.h"
+#include <asm-generic/fcntl.h>
+#include <sys/syscall.h>
+
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+// since shm_open/shm_unlink are wrappers around open/unlink, we only focus on
+// testing basic cases and name conversions.
+
+TEST(LlvmLibcShmTest, Basic) {
+  const char *name = "/test_shm_open";
+  int fd;
+  ASSERT_THAT(fd = LIBC_NAMESPACE::shm_open(name, O_CREAT | O_RDWR, 0666),
+              returns(GE(0)).with_errno(EQ(0)));
+
+  // check that FD_CLOEXEC is set by default.
+  // TODO: use fcntl when implemented.
+  // https://github.com/llvm/llvm-project/issues/84968
+  long flag = LIBC_NAMESPACE::syscall_impl(SYS_fcntl, fd, F_GETFD);
+  ASSERT_GE(static_cast<int>(flag), 0);
+  EXPECT_NE(static_cast<int>(flag) & FD_CLOEXEC, 0);
+
+  // allocate space using ftruncate
+  ASSERT_THAT(LIBC_NAMESPACE::ftruncate(fd, 4096), Succeeds());
+  // map the shared memory
+  void *addr = LIBC_NAMESPACE::mmap(nullptr, 4096, PROT_READ | PROT_WRITE,
+                                    MAP_SHARED, fd, 0);
+  ASSERT_NE(addr, MAP_FAILED);
+  // just write random data to the shared memory
+  char data[] = "Despite its name, LLVM has little to do with traditional "
+                "virtual machines.";
+  for (size_t i = 0; i < sizeof(data); ++i)
+    static_cast<char *>(addr)[i] = data[i];
+
+  // close fd does not affect the mapping
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds());
+  for (size_t i = 0; i < sizeof(data); ++i)
+    EXPECT_EQ(static_cast<char *>(addr)[i], data[i]);
+
+  // unmap the shared memory
+  ASSERT_THAT(LIBC_NAMESPACE::munmap(addr, 4096), Succeeds());
+  // remove the shared memory
+  ASSERT_THAT(LIBC_NAMESPACE::shm_unlink(name), Succeeds());
+}
+
+TEST(LlvmLibcShmTest, NameConversion) {
+  const char *name = "////test_shm_open";
+  int fd;
+  ASSERT_THAT(fd = LIBC_NAMESPACE::shm_open(name, O_CREAT | O_RDWR, 0666),
+              returns(GE(0)).with_errno(EQ(0)));
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds());
+  ASSERT_THAT(LIBC_NAMESPACE::shm_unlink(name), Succeeds());
+
+  ASSERT_THAT(LIBC_NAMESPACE::shm_open("/123/123", O_CREAT | O_RDWR, 0666),
+              Fails(EINVAL));
+
+  ASSERT_THAT(LIBC_NAMESPACE::shm_open("/.", O_CREAT | O_RDWR, 0666),
+              Fails(EINVAL));
+
+  ASSERT_THAT(LIBC_NAMESPACE::shm_open("/..", O_CREAT | O_RDWR, 0666),
+              Fails(EINVAL));
+}
diff --git a/libc/test/utils/FPUtil/x86_long_double_test.cpp b/libc/test/utils/FPUtil/x86_long_double_test.cpp
index bafbbe2a41075..3b140c6c02667 100644
--- a/libc/test/utils/FPUtil/x86_long_double_test.cpp
+++ b/libc/test/utils/FPUtil/x86_long_double_test.cpp
@@ -9,7 +9,7 @@
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 using FPBits = LIBC_NAMESPACE::fputil::FPBits<long double>;
 
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 7cc18cfb62438..2e1c44e6fd5da 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -14,7 +14,7 @@
 #include "src/__support/FPUtil/fpbits_str.h"
 #include "test/UnitTest/FPMatcher.h"
 
-#include <cmath>
+#include "include/llvm-libc-macros/math-macros.h"
 #include <fenv.h>
 #include <memory>
 #include <stdint.h>
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 6d3d1502d9d24..c789db73619ea 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -280,6 +280,12 @@ add_custom_command(
   DEPENDS ${clc_script_loc} )
 add_custom_target( "generate_convert_clc.cl" DEPENDS convert-clc.cl )
 
+add_custom_command(
+	OUTPUT clspv-convert.cl
+	COMMAND ${Python3_EXECUTABLE} ${script_loc} --clspv > clspv-convert.cl
+	DEPENDS ${script_loc} )
+add_custom_target( "clspv-generate_convert.cl" DEPENDS clspv-convert.cl )
+
 enable_testing()
 
 if (LIBCLC_STANDALONE_BUILD)
diff --git a/libclc/check_external_calls.sh b/libclc/check_external_calls.sh
index 4de31a220d070..25792e249d6b6 100755
--- a/libclc/check_external_calls.sh
+++ b/libclc/check_external_calls.sh
@@ -3,15 +3,15 @@
 FILE=$1
 BIN_DIR=$2
 if [ ! -f $FILE ]; then
-	echo "ERROR: Not a file: $FILE"
-	exit 3
+  echo "ERROR: Not a file: $FILE"
+  exit 3
 fi
 ret=0
 
 DIS="$BIN_DIR/llvm-dis"
 if [ ! -x $DIS ]; then
-	echo "ERROR: Disassembler '$DIS' is not executable"
-	exit 3
+  echo "ERROR: Disassembler '$DIS' is not executable"
+  exit 3
 fi
 
 TMP_FILE=$(mktemp)
@@ -21,10 +21,10 @@ $DIS < $FILE | grep ' call ' | grep -v '@llvm' > "$TMP_FILE"
 COUNT=$(wc -l < "$TMP_FILE")
 
 if [ "$COUNT" -ne "0" ]; then
-	echo "ERROR: $COUNT unresolved calls detected in $FILE"
-	cat $TMP_FILE
-	ret=1
+  echo "ERROR: $COUNT unresolved calls detected in $FILE"
+  cat $TMP_FILE
+  ret=1
 else
-	echo "File $FILE is OK"
+  echo "File $FILE is OK"
 fi
 exit $ret
diff --git a/libclc/cmake/CMakeLLAsmInformation.cmake b/libclc/cmake/CMakeLLAsmInformation.cmake
index 5bd432231b858..4ac6ab13e9b6b 100644
--- a/libclc/cmake/CMakeLLAsmInformation.cmake
+++ b/libclc/cmake/CMakeLLAsmInformation.cmake
@@ -1,7 +1,7 @@
 if(NOT CMAKE_LLAsm_COMPILE_OBJECT)
   set(CMAKE_LLAsm_COMPILE_OBJECT
-	  "${CMAKE_LLAsm_PREPROCESSOR} -E -P <DEFINES> <INCLUDES> <FLAGS> -cl-no-stdinc -x cl <SOURCE> -o <OBJECT>.temp"
-	  "<CMAKE_LLAsm_COMPILER> -o <OBJECT> <OBJECT>.temp")
+    "${CMAKE_LLAsm_PREPROCESSOR} -E -P <DEFINES> <INCLUDES> <FLAGS> -cl-no-stdinc -x cl <SOURCE> -o <OBJECT>.temp"
+    "<CMAKE_LLAsm_COMPILER> -o <OBJECT> <OBJECT>.temp")
 endif()
 
 if(NOT CMAKE_LLAsm_CREATE_STATIC_LIBRARY)
diff --git a/libclc/generic/lib/gen_convert.py b/libclc/generic/lib/gen_convert.py
index 4899a63841c1b..a1220fd3c2664 100644
--- a/libclc/generic/lib/gen_convert.py
+++ b/libclc/generic/lib/gen_convert.py
@@ -21,6 +21,7 @@
 #
 # Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com>
 # Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com>
+# Copyright (c) 2024 Romaric Jodin <rjodin@chromium.org>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -45,6 +46,120 @@
 #
 # convert_<destTypen><_sat><_roundingMode>(<sourceTypen>)
 
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--clspv", action="store_true", help="Generate the clspv variant of the code"
+)
+args = parser.parse_args()
+
+clspv = args.clspv
+
+types = [
+    "char",
+    "uchar",
+    "short",
+    "ushort",
+    "int",
+    "uint",
+    "long",
+    "ulong",
+    "float",
+    "double",
+]
+int_types = ["char", "uchar", "short", "ushort", "int", "uint", "long", "ulong"]
+unsigned_types = ["uchar", "ushort", "uint", "ulong"]
+float_types = ["float", "double"]
+int64_types = ["long", "ulong"]
+float64_types = ["double"]
+vector_sizes = ["", "2", "3", "4", "8", "16"]
+half_sizes = [("2", ""), ("4", "2"), ("8", "4"), ("16", "8")]
+
+saturation = ["", "_sat"]
+rounding_modes = ["_rtz", "_rte", "_rtp", "_rtn"]
+float_prefix = {"float": "FLT_", "double": "DBL_"}
+float_suffix = {"float": "f", "double": ""}
+
+bool_type = {
+    "char": "char",
+    "uchar": "char",
+    "short": "short",
+    "ushort": "short",
+    "int": "int",
+    "uint": "int",
+    "long": "long",
+    "ulong": "long",
+    "float": "int",
+    "double": "long",
+}
+
+unsigned_type = {
+    "char": "uchar",
+    "uchar": "uchar",
+    "short": "ushort",
+    "ushort": "ushort",
+    "int": "uint",
+    "uint": "uint",
+    "long": "ulong",
+    "ulong": "ulong",
+}
+
+sizeof_type = {
+    "char": 1,
+    "uchar": 1,
+    "short": 2,
+    "ushort": 2,
+    "int": 4,
+    "uint": 4,
+    "long": 8,
+    "ulong": 8,
+    "float": 4,
+    "double": 8,
+}
+
+limit_max = {
+    "char": "CHAR_MAX",
+    "uchar": "UCHAR_MAX",
+    "short": "SHRT_MAX",
+    "ushort": "USHRT_MAX",
+    "int": "INT_MAX",
+    "uint": "UINT_MAX",
+    "long": "LONG_MAX",
+    "ulong": "ULONG_MAX",
+}
+
+limit_min = {
+    "char": "CHAR_MIN",
+    "uchar": "0",
+    "short": "SHRT_MIN",
+    "ushort": "0",
+    "int": "INT_MIN",
+    "uint": "0",
+    "long": "LONG_MIN",
+    "ulong": "0",
+}
+
+
+def conditional_guard(src, dst):
+    int64_count = 0
+    float64_count = 0
+    if src in int64_types:
+        int64_count = int64_count + 1
+    elif src in float64_types:
+        float64_count = float64_count + 1
+    if dst in int64_types:
+        int64_count = int64_count + 1
+    elif dst in float64_types:
+        float64_count = float64_count + 1
+    if float64_count > 0:
+        # In embedded profile, if cl_khr_fp64 is supported cles_khr_int64 has to be
+        print("#ifdef cl_khr_fp64")
+        return True
+    elif int64_count > 0:
+        print("#if defined cles_khr_int64 || !defined(__EMBEDDED_PROFILE__)")
+        return True
+    return False
 
 print(
     """/* !!!! AUTOGENERATED FILE generated by convert_type.py !!!!!
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index b436e96f178b7..3dec6faea13a0 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -176,6 +176,7 @@ set(BENCHMARK_TESTS
     algorithms/count.bench.cpp
     algorithms/equal.bench.cpp
     algorithms/find.bench.cpp
+    algorithms/fill.bench.cpp
     algorithms/for_each.bench.cpp
     algorithms/lower_bound.bench.cpp
     algorithms/make_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/fill.bench.cpp b/libcxx/benchmarks/algorithms/fill.bench.cpp
new file mode 100644
index 0000000000000..40f37425c394c
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/fill.bench.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <vector>
+
+static void bm_fill_n(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::fill_n(vec1.begin(), vec1.size(), false));
+  }
+}
+BENCHMARK(bm_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_ranges_fill_n(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::ranges::fill_n(vec1.begin(), vec1.size(), false));
+  }
+}
+BENCHMARK(bm_ranges_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_fill(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    std::fill(vec1.begin(), vec1.end(), false);
+  }
+}
+BENCHMARK(bm_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_ranges_fill(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::ranges::fill(vec1, false));
+  }
+}
+BENCHMARK(bm_ranges_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/docs/Modules.rst b/libcxx/docs/Modules.rst
index ee2b81d3b9e7c..5b027ed1bd072 100644
--- a/libcxx/docs/Modules.rst
+++ b/libcxx/docs/Modules.rst
@@ -178,34 +178,13 @@ This is a small sample program that uses the module ``std``. It consists of a
   )
   FetchContent_MakeAvailable(std)
 
-  #
-  # Adjust project compiler flags
-  #
-
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fprebuilt-module-path=${std_BINARY_DIR}/CMakeFiles/std.dir/>)
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fprebuilt-module-path=${std_BINARY_DIR}/CMakeFiles/std.compat.dir/>)
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-nostdinc++>)
-  # The include path needs to be set to be able to use macros from headers.
-  # For example from, the headers <cassert> and <version>.
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-isystem>)
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${LIBCXX_BUILD}/include/c++/v1>)
-
-  #
-  # Adjust project linker flags
-  #
-
-  add_link_options($<$<COMPILE_LANGUAGE:CXX>:-nostdlib++>)
-  add_link_options($<$<COMPILE_LANGUAGE:CXX>:-L${LIBCXX_BUILD}/lib>)
-  add_link_options($<$<COMPILE_LANGUAGE:CXX>:-Wl,-rpath,${LIBCXX_BUILD}/lib>)
-  # Linking against the standard c++ library is required for CMake to get the proper dependencies.
-  link_libraries(std c++)
-  link_libraries(std.compat c++)
-
   #
   # Add the project
   #
 
   add_executable(main)
+  add_dependencies(main std.compat)
+  target_link_libraries(main std.compat)
   target_sources(main
     PRIVATE
       main.cpp
@@ -218,13 +197,9 @@ Building this project is done with the following steps, assuming the files
 
   $ mkdir build
   $ cmake -G Ninja -S . -B build -DCMAKE_CXX_COMPILER=<path-to-compiler> -DLIBCXX_BUILD=<build>
-  $ ninja -j1 std -C build
   $ ninja -C build
   $ build/main
 
-.. note:: The ``std`` dependencies of ``std.compat`` is not always resolved when
-          building the ``std`` target using multiple jobs.
-
 .. warning:: ``<path-to-compiler>`` should point point to the real binary and
              not to a symlink.
 
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 2b62a36ca8e5c..c70ae477fafc1 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -49,6 +49,8 @@ Improvements and New Features
 -----------------------------
 
 - The performance of growing ``std::vector`` has been improved for trivially relocatable types.
+- The performance of ``ranges::fill`` and ``ranges::fill_n`` has been improved for ``vector<bool>::iterator``\s,
+  resulting in a performance increase of up to 1400x.
 
 Deprecations and Removals
 -------------------------
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index e00345533b865..a103192350069 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -46,7 +46,7 @@
 "`3473 <https://wg21.link/LWG3473>`__","Normative encouragement in non-normative note","November 2020","|Complete|","15.0","|format|"
 "`3474 <https://wg21.link/LWG3474>`__","Nesting ``join_views`` is broken because of CTAD","November 2020","|Complete|","15.0","|ranges|"
 "`3476 <https://wg21.link/LWG3476>`__","``thread`` and ``jthread`` constructors require that the parameters be move-constructible but never move construct the parameters","November 2020","",""
-"`3477 <https://wg21.link/LWG3477>`__","Simplify constraints for ``semiregular-box``","November 2020","","","|ranges|"
+"`3477 <https://wg21.link/LWG3477>`__","Simplify constraints for ``semiregular-box``","November 2020","|Complete|","13.0","|ranges|"
 "`3482 <https://wg21.link/LWG3482>`__","``drop_view``'s const begin should additionally require ``sized_range``","November 2020","|Complete|","14.0","|ranges|"
 "`3483 <https://wg21.link/LWG3483>`__","``transform_view::iterator``'s difference is overconstrained","November 2020","|Complete|","14.0","|ranges|"
 "","","","","",""
@@ -181,7 +181,7 @@
 "`3711 <https://wg21.link/LWG3711>`__","Missing preconditions for slide_view constructor","July 2022","","","|ranges|"
 "`3712 <https://wg21.link/LWG3712>`__","``chunk_view`` and ``slide_view`` should not be ``default_initializable``","July 2022","","","|ranges|"
 "`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","",""
-"`3715 <https://wg21.link/LWG3715>`__","``view_interface::empty`` is overconstrained","July 2022","","","|ranges|"
+"`3715 <https://wg21.link/LWG3715>`__","``view_interface::empty`` is overconstrained","July 2022","|Complete|","19.0","|ranges|"
 "`3719 <https://wg21.link/LWG3719>`__","Directory iterators should be usable with default sentinel","July 2022","|Complete|","17.0","|ranges|"
 "`3721 <https://wg21.link/LWG3721>`__","Allow an ``arg-id`` with a value of zero for ``width`` in ``std-format-spec``","July 2022","|Complete|","16.0","|format|"
 "`3724 <https://wg21.link/LWG3724>`__","``decay-copy`` should be constrained","July 2022","|Complete|","14.0"
@@ -283,7 +283,7 @@
 "`3655 <https://wg21.link/LWG3655>`__","The ``INVOKE`` operation and union types","February 2023","|Complete|","18.0",""
 "`3723 <https://wg21.link/LWG3723>`__","``priority_queue::push_range`` needs to ``append_range``","February 2023","","","|ranges|"
 "`3734 <https://wg21.link/LWG3734>`__","Inconsistency in ``inout_ptr`` and ``out_ptr`` for empty case","February 2023","","",""
-"`3772 <https://wg21.link/LWG3772>`__","``repeat_view``'s ``piecewise`` constructor is missing Postconditions","February 2023","","","|ranges|"
+"`3772 <https://wg21.link/LWG3772>`__","``repeat_view``'s ``piecewise`` constructor is missing Postconditions","February 2023","|Complete|","17.0","|ranges|"
 "`3786 <https://wg21.link/LWG3786>`__","Flat maps' deduction guide needs to default ``Allocator`` to be useful","February 2023","","",""
 "`3803 <https://wg21.link/LWG3803>`__","``flat_foo`` constructors taking ``KeyContainer`` lack ``KeyCompare`` parameter","February 2023","","",""
 "`3810 <https://wg21.link/LWG3810>`__","CTAD for ``std::basic_format_args``","February 2023","|Complete|","17.0","|format|"
diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv
index 56e1468b4ca1a..80547c5c1f3f5 100644
--- a/libcxx/docs/Status/Cxx23Papers.csv
+++ b/libcxx/docs/Status/Cxx23Papers.csv
@@ -100,7 +100,7 @@
 "`P2396R1 <https://wg21.link/P2396R1>`__","LWG", "Concurrency TS 2 fixes ", "November 2022","","","|concurrency TS|"
 "`P2505R5 <https://wg21.link/P2505R5>`__","LWG", "Monadic Functions for ``std::expected``", "November 2022","|Complete|","17.0",""
 "`P2539R4 <https://wg21.link/P2539R4>`__","LWG", "Should the output of ``std::print`` to a terminal be synchronized with the underlying stream?", "November 2022","|Complete|","18.0","|format|"
-"`P2602R2 <https://wg21.link/P2602R2>`__","LWG", "Poison Pills are Too Toxic", "November 2022","","","|ranges|"
+"`P2602R2 <https://wg21.link/P2602R2>`__","LWG", "Poison Pills are Too Toxic", "November 2022","|Complete|","19.0","|ranges|"
 "`P2708R1 <https://wg21.link/P2708R1>`__","LWG", "No Further Fundamentals TSes", "November 2022","|Nothing to do|","",""
 "","","","","","",""
 "`P0290R4 <https://wg21.link/P0290R4>`__","LWG", "``apply()`` for ``synchronized_value<T>``","February 2023","","","|concurrency TS|"
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 63adc03fae298..6ed8d21d98a15 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -755,10 +755,7 @@ set(files
   __type_traits/is_constant_evaluated.h
   __type_traits/is_constructible.h
   __type_traits/is_convertible.h
-  __type_traits/is_copy_assignable.h
-  __type_traits/is_copy_constructible.h
   __type_traits/is_core_convertible.h
-  __type_traits/is_default_constructible.h
   __type_traits/is_destructible.h
   __type_traits/is_empty.h
   __type_traits/is_enum.h
@@ -774,17 +771,10 @@ set(files
   __type_traits/is_member_function_pointer.h
   __type_traits/is_member_object_pointer.h
   __type_traits/is_member_pointer.h
-  __type_traits/is_move_assignable.h
-  __type_traits/is_move_constructible.h
   __type_traits/is_nothrow_assignable.h
   __type_traits/is_nothrow_constructible.h
   __type_traits/is_nothrow_convertible.h
-  __type_traits/is_nothrow_copy_assignable.h
-  __type_traits/is_nothrow_copy_constructible.h
-  __type_traits/is_nothrow_default_constructible.h
   __type_traits/is_nothrow_destructible.h
-  __type_traits/is_nothrow_move_assignable.h
-  __type_traits/is_nothrow_move_constructible.h
   __type_traits/is_null_pointer.h
   __type_traits/is_object.h
   __type_traits/is_pod.h
@@ -805,14 +795,9 @@ set(files
   __type_traits/is_trivial.h
   __type_traits/is_trivially_assignable.h
   __type_traits/is_trivially_constructible.h
-  __type_traits/is_trivially_copy_assignable.h
-  __type_traits/is_trivially_copy_constructible.h
   __type_traits/is_trivially_copyable.h
-  __type_traits/is_trivially_default_constructible.h
   __type_traits/is_trivially_destructible.h
   __type_traits/is_trivially_lexicographically_comparable.h
-  __type_traits/is_trivially_move_assignable.h
-  __type_traits/is_trivially_move_constructible.h
   __type_traits/is_trivially_relocatable.h
   __type_traits/is_unbounded_array.h
   __type_traits/is_union.h
diff --git a/libcxx/include/__algorithm/copy_backward.h b/libcxx/include/__algorithm/copy_backward.h
index 3ec88d8bd5cc3..591dd21e2b032 100644
--- a/libcxx/include/__algorithm/copy_backward.h
+++ b/libcxx/include/__algorithm/copy_backward.h
@@ -15,7 +15,7 @@
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/common_type.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/copy_move_common.h b/libcxx/include/__algorithm/copy_move_common.h
index 0fc7a5e3cee70..845967b05038d 100644
--- a/libcxx/include/__algorithm/copy_move_common.h
+++ b/libcxx/include/__algorithm/copy_move_common.h
@@ -19,7 +19,7 @@
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_always_bitcastable.h>
 #include <__type_traits/is_constant_evaluated.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_trivially_assignable.h>
 #include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/is_volatile.h>
diff --git a/libcxx/include/__algorithm/equal.h b/libcxx/include/__algorithm/equal.h
index 3c0e3060e39a9..c76a16b47f5da 100644
--- a/libcxx/include/__algorithm/equal.h
+++ b/libcxx/include/__algorithm/equal.h
@@ -69,20 +69,6 @@ equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first
 }
 
 #if _LIBCPP_STD_VER >= 14
-template <class _BinaryPredicate, class _InputIterator1, class _InputIterator2>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-__equal(_InputIterator1 __first1,
-        _InputIterator1 __last1,
-        _InputIterator2 __first2,
-        _InputIterator2 __last2,
-        _BinaryPredicate __pred,
-        input_iterator_tag,
-        input_iterator_tag) {
-  for (; __first1 != __last1 && __first2 != __last2; ++__first1, (void)++__first2)
-    if (!__pred(*__first1, *__first2))
-      return false;
-  return __first1 == __last1 && __first2 == __last2;
-}
 
 template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Pred, class _Proj1, class _Proj2>
 _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __equal_impl(
@@ -110,17 +96,18 @@ __equal_impl(_Tp* __first1, _Tp* __last1, _Up* __first2, _Up*, _Pred&, _Proj1&,
   return std::__constexpr_memcmp_equal(__first1, __first2, __element_count(__last1 - __first1));
 }
 
-template <class _BinaryPredicate, class _RandomAccessIterator1, class _RandomAccessIterator2>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-__equal(_RandomAccessIterator1 __first1,
-        _RandomAccessIterator1 __last1,
-        _RandomAccessIterator2 __first2,
-        _RandomAccessIterator2 __last2,
-        _BinaryPredicate __pred,
-        random_access_iterator_tag,
-        random_access_iterator_tag) {
-  if (std::distance(__first1, __last1) != std::distance(__first2, __last2))
-    return false;
+template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+equal(_InputIterator1 __first1,
+      _InputIterator1 __last1,
+      _InputIterator2 __first2,
+      _InputIterator2 __last2,
+      _BinaryPredicate __pred) {
+  if constexpr (__has_random_access_iterator_category<_InputIterator1>::value &&
+                __has_random_access_iterator_category<_InputIterator2>::value) {
+    if (std::distance(__first1, __last1) != std::distance(__first2, __last2))
+      return false;
+  }
   __identity __proj;
   return std::__equal_impl(
       std::__unwrap_iter(__first1),
@@ -132,36 +119,13 @@ __equal(_RandomAccessIterator1 __first1,
       __proj);
 }
 
-template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-equal(_InputIterator1 __first1,
-      _InputIterator1 __last1,
-      _InputIterator2 __first2,
-      _InputIterator2 __last2,
-      _BinaryPredicate __pred) {
-  return std::__equal<_BinaryPredicate&>(
-      __first1,
-      __last1,
-      __first2,
-      __last2,
-      __pred,
-      typename iterator_traits<_InputIterator1>::iterator_category(),
-      typename iterator_traits<_InputIterator2>::iterator_category());
-}
-
 template <class _InputIterator1, class _InputIterator2>
 _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2) {
-  return std::__equal(
-      __first1,
-      __last1,
-      __first2,
-      __last2,
-      __equal_to(),
-      typename iterator_traits<_InputIterator1>::iterator_category(),
-      typename iterator_traits<_InputIterator2>::iterator_category());
+  return std::equal(__first1, __last1, __first2, __last2, __equal_to());
 }
-#endif
+
+#endif // _LIBCPP_STD_VER >= 14
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__algorithm/equal_range.h b/libcxx/include/__algorithm/equal_range.h
index a94290431971c..2b086abf1794f 100644
--- a/libcxx/include/__algorithm/equal_range.h
+++ b/libcxx/include/__algorithm/equal_range.h
@@ -23,7 +23,7 @@
 #include <__iterator/iterator_traits.h>
 #include <__iterator/next.h>
 #include <__type_traits/is_callable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h
index 36f3349d9e7a3..f29633f88087f 100644
--- a/libcxx/include/__algorithm/fill_n.h
+++ b/libcxx/include/__algorithm/fill_n.h
@@ -9,18 +9,74 @@
 #ifndef _LIBCPP___ALGORITHM_FILL_N_H
 #define _LIBCPP___ALGORITHM_FILL_N_H
 
+#include <__algorithm/min.h>
 #include <__config>
+#include <__fwd/bit_reference.h>
 #include <__iterator/iterator_traits.h>
+#include <__memory/pointer_traits.h>
 #include <__utility/convert_to_integral.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 // fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
 
+template <class _OutputIterator, class _Size, class _Tp>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value);
+
+template <bool _FillVal, class _Cp>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
+__fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
+  using _It            = __bit_iterator<_Cp, false>;
+  using __storage_type = typename _It::__storage_type;
+
+  const int __bits_per_word = _It::__bits_per_word;
+  // do first partial word
+  if (__first.__ctz_ != 0) {
+    __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
+    __storage_type __dn    = std::min(__clz_f, __n);
+    __storage_type __m     = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+    if (_FillVal)
+      *__first.__seg_ |= __m;
+    else
+      *__first.__seg_ &= ~__m;
+    __n -= __dn;
+    ++__first.__seg_;
+  }
+  // do middle whole words
+  __storage_type __nw = __n / __bits_per_word;
+  std::__fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
+  __n -= __nw * __bits_per_word;
+  // do last partial word
+  if (__n > 0) {
+    __first.__seg_ += __nw;
+    __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+    if (_FillVal)
+      *__first.__seg_ |= __m;
+    else
+      *__first.__seg_ &= ~__m;
+  }
+}
+
+template <class _Cp, class _Size>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
+__fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) {
+  if (__n > 0) {
+    if (__value)
+      std::__fill_n_bool<true>(__first, __n);
+    else
+      std::__fill_n_bool<false>(__first, __n);
+  }
+  return __first + __n;
+}
+
 template <class _OutputIterator, class _Size, class _Tp>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
 __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
@@ -37,4 +93,6 @@ fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
 
 _LIBCPP_END_NAMESPACE_STD
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_FILL_N_H
diff --git a/libcxx/include/__algorithm/inplace_merge.h b/libcxx/include/__algorithm/inplace_merge.h
index eb3c0bdbc2db7..a6bcc66a2fa47 100644
--- a/libcxx/include/__algorithm/inplace_merge.h
+++ b/libcxx/include/__algorithm/inplace_merge.h
@@ -114,8 +114,8 @@ _LIBCPP_HIDE_FROM_ABI void __buffered_inplace_merge(
     for (_BidirectionalIterator __i = __middle; __i != __last;
          __d.template __incr<value_type>(), (void)++__i, (void)++__p)
       ::new ((void*)__p) value_type(_IterOps<_AlgPolicy>::__iter_move(__i));
-    typedef __unconstrained_reverse_iterator<_BidirectionalIterator> _RBi;
-    typedef __unconstrained_reverse_iterator<value_type*> _Rv;
+    typedef reverse_iterator<_BidirectionalIterator> _RBi;
+    typedef reverse_iterator<value_type*> _Rv;
     typedef __invert<_Compare> _Inverted;
     std::__half_inplace_merge<_AlgPolicy>(
         _Rv(__p), _Rv(__buff), _RBi(__middle), _RBi(__first), _RBi(__last), _Inverted(__comp));
diff --git a/libcxx/include/__algorithm/lexicographical_compare_three_way.h b/libcxx/include/__algorithm/lexicographical_compare_three_way.h
index 32de97d07a131..50ebdc647a97a 100644
--- a/libcxx/include/__algorithm/lexicographical_compare_three_way.h
+++ b/libcxx/include/__algorithm/lexicographical_compare_three_way.h
@@ -17,7 +17,7 @@
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__type_traits/common_type.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/move.h b/libcxx/include/__algorithm/move.h
index dba6d487fff77..bf574b5274093 100644
--- a/libcxx/include/__algorithm/move.h
+++ b/libcxx/include/__algorithm/move.h
@@ -16,7 +16,7 @@
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/common_type.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/move_backward.h b/libcxx/include/__algorithm/move_backward.h
index aeedf4241dce9..6bb7c91d66c7c 100644
--- a/libcxx/include/__algorithm/move_backward.h
+++ b/libcxx/include/__algorithm/move_backward.h
@@ -15,7 +15,7 @@
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/common_type.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/partial_sort.h b/libcxx/include/__algorithm/partial_sort.h
index 85a8fdc77aa22..7f8d0c49147e3 100644
--- a/libcxx/include/__algorithm/partial_sort.h
+++ b/libcxx/include/__algorithm/partial_sort.h
@@ -18,8 +18,8 @@
 #include <__config>
 #include <__debug_utils/randomize_range.h>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/pop_heap.h b/libcxx/include/__algorithm/pop_heap.h
index 798a1d09934bc..6d23830097ff9 100644
--- a/libcxx/include/__algorithm/pop_heap.h
+++ b/libcxx/include/__algorithm/pop_heap.h
@@ -17,8 +17,8 @@
 #include <__assert>
 #include <__config>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/push_heap.h b/libcxx/include/__algorithm/push_heap.h
index 7d8720e3a93d4..ec0b445f2b70f 100644
--- a/libcxx/include/__algorithm/push_heap.h
+++ b/libcxx/include/__algorithm/push_heap.h
@@ -14,8 +14,8 @@
 #include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/rotate.h b/libcxx/include/__algorithm/rotate.h
index 9a4d07883e320..df4ca95aac95b 100644
--- a/libcxx/include/__algorithm/rotate.h
+++ b/libcxx/include/__algorithm/rotate.h
@@ -15,7 +15,7 @@
 #include <__algorithm/swap_ranges.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_trivially_move_assignable.h>
+#include <__type_traits/is_trivially_assignable.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/sort_heap.h b/libcxx/include/__algorithm/sort_heap.h
index 060fc33c3c6e9..f20b110c7fd12 100644
--- a/libcxx/include/__algorithm/sort_heap.h
+++ b/libcxx/include/__algorithm/sort_heap.h
@@ -16,8 +16,8 @@
 #include <__config>
 #include <__debug_utils/strict_weak_ordering_check.h>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/stable_sort.h b/libcxx/include/__algorithm/stable_sort.h
index 9be192bd65a6e..726e7e16b3564 100644
--- a/libcxx/include/__algorithm/stable_sort.h
+++ b/libcxx/include/__algorithm/stable_sort.h
@@ -20,7 +20,7 @@
 #include <__memory/destruct_n.h>
 #include <__memory/temporary_buffer.h>
 #include <__memory/unique_ptr.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
+#include <__type_traits/is_trivially_assignable.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 #include <new>
diff --git a/libcxx/include/__algorithm/unwrap_iter.h b/libcxx/include/__algorithm/unwrap_iter.h
index 50d815c970884..8cc0d22d4fc21 100644
--- a/libcxx/include/__algorithm/unwrap_iter.h
+++ b/libcxx/include/__algorithm/unwrap_iter.h
@@ -13,7 +13,7 @@
 #include <__iterator/iterator_traits.h>
 #include <__memory/pointer_traits.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/declval.h>
 #include <__utility/move.h>
 
diff --git a/libcxx/include/__algorithm/upper_bound.h b/libcxx/include/__algorithm/upper_bound.h
index f499f7a80aa6d..9c7d8fbcde07b 100644
--- a/libcxx/include/__algorithm/upper_bound.h
+++ b/libcxx/include/__algorithm/upper_bound.h
@@ -18,7 +18,7 @@
 #include <__iterator/advance.h>
 #include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__atomic/atomic_base.h b/libcxx/include/__atomic/atomic_base.h
index 6ca01a7f1bf9b..e9badccc25a62 100644
--- a/libcxx/include/__atomic/atomic_base.h
+++ b/libcxx/include/__atomic/atomic_base.h
@@ -18,7 +18,7 @@
 #include <__config>
 #include <__memory/addressof.h>
 #include <__type_traits/is_integral.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_same.h>
 #include <version>
 
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 3a5339b72ddc3..9579b9eaf70bb 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -171,61 +171,6 @@ private:
   __bit_const_reference& operator=(const __bit_const_reference&) = delete;
 };
 
-// fill_n
-
-template <bool _FillVal, class _Cp>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
-__fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
-  using _It            = __bit_iterator<_Cp, false>;
-  using __storage_type = typename _It::__storage_type;
-
-  const int __bits_per_word = _It::__bits_per_word;
-  // do first partial word
-  if (__first.__ctz_ != 0) {
-    __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
-    __storage_type __dn    = std::min(__clz_f, __n);
-    __storage_type __m     = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
-    if (_FillVal)
-      *__first.__seg_ |= __m;
-    else
-      *__first.__seg_ &= ~__m;
-    __n -= __dn;
-    ++__first.__seg_;
-  }
-  // do middle whole words
-  __storage_type __nw = __n / __bits_per_word;
-  std::fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
-  __n -= __nw * __bits_per_word;
-  // do last partial word
-  if (__n > 0) {
-    __first.__seg_ += __nw;
-    __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
-    if (_FillVal)
-      *__first.__seg_ |= __m;
-    else
-      *__first.__seg_ &= ~__m;
-  }
-}
-
-template <class _Cp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value) {
-  if (__n > 0) {
-    if (__value)
-      std::__fill_n<true>(__first, __n);
-    else
-      std::__fill_n<false>(__first, __n);
-  }
-}
-
-// fill
-
-template <class _Cp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value) {
-  std::fill_n(__first, static_cast<typename _Cp::size_type>(__last - __first), __value);
-}
-
 // copy
 
 template <class _Cp, bool _IsConst>
@@ -1007,8 +952,10 @@ private:
   friend class __bit_iterator<_Cp, true>;
   template <class _Dp>
   friend struct __bit_array;
+
   template <bool _FillVal, class _Dp>
-  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void __fill_n(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void
+  __fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
 
   template <class _Dp, bool _IC>
   _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_aligned(
diff --git a/libcxx/include/__compare/partial_order.h b/libcxx/include/__compare/partial_order.h
index f3ed4900fbff2..1d2fae63e5f24 100644
--- a/libcxx/include/__compare/partial_order.h
+++ b/libcxx/include/__compare/partial_order.h
@@ -28,6 +28,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 // [cmp.alg]
 namespace __partial_order {
+void partial_order() = delete;
+
 struct __fn {
   // NOLINTBEGIN(libcpp-robust-against-adl) partial_order should use ADL, but only here
   template <class _Tp, class _Up>
diff --git a/libcxx/include/__compare/strong_order.h b/libcxx/include/__compare/strong_order.h
index 3dc819e642515..8c363b5638222 100644
--- a/libcxx/include/__compare/strong_order.h
+++ b/libcxx/include/__compare/strong_order.h
@@ -37,6 +37,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 // [cmp.alg]
 namespace __strong_order {
+void strong_order() = delete;
+
 struct __fn {
   // NOLINTBEGIN(libcpp-robust-against-adl) strong_order should use ADL, but only here
   template <class _Tp, class _Up>
diff --git a/libcxx/include/__compare/weak_order.h b/libcxx/include/__compare/weak_order.h
index b82a708c29a14..1a3e85feb233b 100644
--- a/libcxx/include/__compare/weak_order.h
+++ b/libcxx/include/__compare/weak_order.h
@@ -30,6 +30,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 // [cmp.alg]
 namespace __weak_order {
+void weak_order() = delete;
+
 struct __fn {
   // NOLINTBEGIN(libcpp-robust-against-adl) weak_order should use ADL, but only here
   template <class _Tp, class _Up>
diff --git a/libcxx/include/__concepts/class_or_enum.h b/libcxx/include/__concepts/class_or_enum.h
index c1b4a8c258f3a..2739e31e14ba6 100644
--- a/libcxx/include/__concepts/class_or_enum.h
+++ b/libcxx/include/__concepts/class_or_enum.h
@@ -28,11 +28,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp>
 concept __class_or_enum = is_class_v<_Tp> || is_union_v<_Tp> || is_enum_v<_Tp>;
 
-// Work around Clang bug https://llvm.org/PR52970
-// TODO: remove this workaround once libc++ no longer has to support Clang 13 (it was fixed in Clang 14).
-template <class _Tp>
-concept __workaround_52970 = is_class_v<__remove_cvref_t<_Tp>> || is_union_v<__remove_cvref_t<_Tp>>;
-
 #endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__concepts/swappable.h b/libcxx/include/__concepts/swappable.h
index 1337dc49d75b1..d339488a087a5 100644
--- a/libcxx/include/__concepts/swappable.h
+++ b/libcxx/include/__concepts/swappable.h
@@ -15,8 +15,8 @@
 #include <__concepts/constructible.h>
 #include <__config>
 #include <__type_traits/extent.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/remove_cvref.h>
 #include <__utility/exchange.h>
 #include <__utility/forward.h>
diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h
index 417db54e6eaac..1bf2df939258a 100644
--- a/libcxx/include/__exception/nested_exception.h
+++ b/libcxx/include/__exception/nested_exception.h
@@ -15,8 +15,8 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/is_base_of.h>
 #include <__type_traits/is_class.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_constructible.h>
 #include <__type_traits/is_final.h>
 #include <__type_traits/is_polymorphic.h>
 #include <__utility/forward.h>
diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h
index 443d9257dc598..d7adaac7567b2 100644
--- a/libcxx/include/__expected/expected.h
+++ b/libcxx/include/__expected/expected.h
@@ -23,24 +23,14 @@
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_function.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_assignable.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/lazy.h>
 #include <__type_traits/negation.h>
diff --git a/libcxx/include/__format/concepts.h b/libcxx/include/__format/concepts.h
index d7b5a9d16df70..13380e9b91aff 100644
--- a/libcxx/include/__format/concepts.h
+++ b/libcxx/include/__format/concepts.h
@@ -15,10 +15,12 @@
 #include <__config>
 #include <__format/format_parse_context.h>
 #include <__fwd/format.h>
+#include <__fwd/tuple.h>
+#include <__tuple/tuple_size.h>
 #include <__type_traits/is_specialization.h>
 #include <__type_traits/remove_const.h>
+#include <__type_traits/remove_reference.h>
 #include <__utility/pair.h>
-#include <tuple>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h
index 066cd369eb891..23a599e995759 100644
--- a/libcxx/include/__format/format_arg_store.h
+++ b/libcxx/include/__format/format_arg_store.h
@@ -151,7 +151,7 @@ consteval __arg_t __determine_arg_t() {
 // The overload for not formattable types allows triggering the static
 // assertion below.
 template <class _Context, class _Tp>
-  requires(!__formattable<_Tp, typename _Context::char_type>)
+  requires(!__formattable_with<_Tp, _Context>)
 consteval __arg_t __determine_arg_t() {
   return __arg_t::__none;
 }
@@ -165,7 +165,6 @@ _LIBCPP_HIDE_FROM_ABI basic_format_arg<_Context> __create_format_arg(_Tp& __valu
   using _Dp               = remove_const_t<_Tp>;
   constexpr __arg_t __arg = __determine_arg_t<_Context, _Dp>();
   static_assert(__arg != __arg_t::__none, "the supplied type is not formattable");
-
   static_assert(__formattable_with<_Tp, _Context>);
 
   // Not all types can be used to directly initialize the
diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h
index d131e942aca60..bf603c5c62d9d 100644
--- a/libcxx/include/__format/format_context.h
+++ b/libcxx/include/__format/format_context.h
@@ -27,7 +27,7 @@
 #include <cstddef>
 
 #ifndef _LIBCPP_HAS_NO_LOCALIZATION
-#  include <locale>
+#  include <__locale>
 #  include <optional>
 #endif
 
diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h
index 3ee53539f4ee6..c7810140105a0 100644
--- a/libcxx/include/__format/format_functions.h
+++ b/libcxx/include/__format/format_functions.h
@@ -41,7 +41,7 @@
 #include <string_view>
 
 #ifndef _LIBCPP_HAS_NO_LOCALIZATION
-#  include <locale>
+#  include <__locale>
 #endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h
index f01d323efff5f..1d94cc349c0dd 100644
--- a/libcxx/include/__format/formatter_floating_point.h
+++ b/libcxx/include/__format/formatter_floating_point.h
@@ -39,7 +39,7 @@
 #include <cstddef>
 
 #ifndef _LIBCPP_HAS_NO_LOCALIZATION
-#  include <locale>
+#  include <__locale>
 #endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__functional/bind_front.h b/libcxx/include/__functional/bind_front.h
index 30dda533615b2..11d0bac3f839c 100644
--- a/libcxx/include/__functional/bind_front.h
+++ b/libcxx/include/__functional/bind_front.h
@@ -17,7 +17,6 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__utility/forward.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index 416c26a0c73f2..1faa9e92ebd63 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -28,7 +28,7 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/is_core_convertible.h>
 #include <__type_traits/is_scalar.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/strip_signature.h>
@@ -768,7 +768,7 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
   {
   }
 
-  virtual __base<_Rp(_ArgTypes...)>* __clone() const {
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual __base<_Rp(_ArgTypes...)>* __clone() const {
     _LIBCPP_ASSERT_INTERNAL(
         false,
         "Block pointers are just pointers, so they should always fit into "
@@ -777,9 +777,11 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
     return nullptr;
   }
 
-  virtual void __clone(__base<_Rp(_ArgTypes...)>* __p) const { ::new ((void*)__p) __func(__f_); }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void __clone(__base<_Rp(_ArgTypes...)>* __p) const {
+    ::new ((void*)__p) __func(__f_);
+  }
 
-  virtual void destroy() _NOEXCEPT {
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void destroy() _NOEXCEPT {
 #    ifndef _LIBCPP_HAS_OBJC_ARC
     if (__f_)
       _Block_release(__f_);
@@ -787,7 +789,7 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
     __f_ = 0;
   }
 
-  virtual void destroy_deallocate() _NOEXCEPT {
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void destroy_deallocate() _NOEXCEPT {
     _LIBCPP_ASSERT_INTERNAL(
         false,
         "Block pointers are just pointers, so they should always fit into "
@@ -795,16 +797,20 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
         "never be invoked.");
   }
 
-  virtual _Rp operator()(_ArgTypes&&... __arg) { return std::__invoke(__f_, std::forward<_ArgTypes>(__arg)...); }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual _Rp operator()(_ArgTypes&&... __arg) {
+    return std::__invoke(__f_, std::forward<_ArgTypes>(__arg)...);
+  }
 
 #    ifndef _LIBCPP_HAS_NO_RTTI
-  virtual const void* target(type_info const& __ti) const _NOEXCEPT {
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual const void* target(type_info const& __ti) const _NOEXCEPT {
     if (__ti == typeid(__func::__block_type))
       return &__f_;
     return (const void*)nullptr;
   }
 
-  virtual const std::type_info& target_type() const _NOEXCEPT { return typeid(__func::__block_type); }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual const std::type_info& target_type() const _NOEXCEPT {
+    return typeid(__func::__block_type);
+  }
 #    endif // _LIBCPP_HAS_NO_RTTI
 };
 
diff --git a/libcxx/include/__functional/hash.h b/libcxx/include/__functional/hash.h
index a466c837038f8..a9e450edd39f5 100644
--- a/libcxx/include/__functional/hash.h
+++ b/libcxx/include/__functional/hash.h
@@ -10,23 +10,18 @@
 #define _LIBCPP___FUNCTIONAL_HASH_H
 
 #include <__config>
-#include <__functional/invoke.h>
 #include <__functional/unary_function.h>
 #include <__fwd/functional.h>
-#include <__tuple/sfinae_helpers.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
+#include <__type_traits/conjunction.h>
+#include <__type_traits/invoke.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_enum.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/underlying_type.h>
-#include <__utility/forward.h>
-#include <__utility/move.h>
 #include <__utility/pair.h>
 #include <__utility/swap.h>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
-#include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__functional/not_fn.h b/libcxx/include/__functional/not_fn.h
index 23a491c135d79..4b3ce5524a743 100644
--- a/libcxx/include/__functional/not_fn.h
+++ b/libcxx/include/__functional/not_fn.h
@@ -16,7 +16,6 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__utility/forward.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index e6691e78a267f..a705117d0173f 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -28,12 +28,9 @@
 #include <__type_traits/can_extract_key.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/is_const.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_swappable.h>
diff --git a/libcxx/include/__iterator/cpp17_iterator_concepts.h b/libcxx/include/__iterator/cpp17_iterator_concepts.h
index d1ad2b4e28480..cdb561e68452a 100644
--- a/libcxx/include/__iterator/cpp17_iterator_concepts.h
+++ b/libcxx/include/__iterator/cpp17_iterator_concepts.h
@@ -14,10 +14,8 @@
 #include <__concepts/same_as.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_signed.h>
 #include <__type_traits/is_void.h>
 #include <__utility/as_const.h>
diff --git a/libcxx/include/__iterator/iter_move.h b/libcxx/include/__iterator/iter_move.h
index 202b94cccc5ac..ba8aed3c0ffbb 100644
--- a/libcxx/include/__iterator/iter_move.h
+++ b/libcxx/include/__iterator/iter_move.h
@@ -35,7 +35,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace ranges {
 namespace __iter_move {
 
-void iter_move();
+void iter_move() = delete;
 
 template <class _Tp>
 concept __unqualified_iter_move = __class_or_enum<remove_cvref_t<_Tp>> && requires(_Tp&& __t) {
diff --git a/libcxx/include/__iterator/reverse_iterator.h b/libcxx/include/__iterator/reverse_iterator.h
index 5900b1c5ac154..2ae1461934853 100644
--- a/libcxx/include/__iterator/reverse_iterator.h
+++ b/libcxx/include/__iterator/reverse_iterator.h
@@ -34,7 +34,7 @@
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__utility/declval.h>
@@ -316,172 +316,6 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator<_Ite
 }
 #endif
 
-#if _LIBCPP_STD_VER <= 17
-template <class _Iter>
-using __unconstrained_reverse_iterator = reverse_iterator<_Iter>;
-#else
-
-// __unconstrained_reverse_iterator allows us to use reverse iterators in the implementation of algorithms by working
-// around a language issue in C++20.
-// In C++20, when a reverse iterator wraps certain C++20-hostile iterators, calling comparison operators on it will
-// result in a compilation error. However, calling comparison operators on the pristine hostile iterator is not
-// an error. Thus, we cannot use reverse_iterators in the implementation of an algorithm that accepts a
-// C++20-hostile iterator. This class is an internal workaround -- it is a copy of reverse_iterator with
-// tweaks to make it support hostile iterators.
-//
-// A C++20-hostile iterator is one that defines a comparison operator where one of the arguments is an exact match
-// and the other requires an implicit conversion, for example:
-//   friend bool operator==(const BaseIter&, const DerivedIter&);
-//
-// C++20 rules for rewriting equality operators create another overload of this function with parameters reversed:
-//   friend bool operator==(const DerivedIter&, const BaseIter&);
-//
-// This creates an ambiguity in overload resolution.
-//
-// Clang treats this ambiguity differently in different contexts. When operator== is actually called in the function
-// body, the code is accepted with a warning. When a concept requires operator== to be a valid expression, however,
-// it evaluates to false. Thus, the implementation of reverse_iterator::operator== can actually call operator== on its
-// base iterators, but the constraints on reverse_iterator::operator== prevent it from being considered during overload
-// resolution. This class simply removes the problematic constraints from comparison functions.
-template <class _Iter>
-class __unconstrained_reverse_iterator {
-  _Iter __iter_;
-
-public:
-  static_assert(__has_bidirectional_iterator_category<_Iter>::value || bidirectional_iterator<_Iter>);
-
-  using iterator_type = _Iter;
-  using iterator_category =
-      _If<__has_random_access_iterator_category<_Iter>::value,
-          random_access_iterator_tag,
-          __iterator_category_type<_Iter>>;
-  using pointer         = __iterator_pointer_type<_Iter>;
-  using value_type      = iter_value_t<_Iter>;
-  using difference_type = iter_difference_t<_Iter>;
-  using reference       = iter_reference_t<_Iter>;
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator()                                        = default;
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator(const __unconstrained_reverse_iterator&) = default;
-  _LIBCPP_HIDE_FROM_ABI constexpr explicit __unconstrained_reverse_iterator(_Iter __iter) : __iter_(__iter) {}
-
-  _LIBCPP_HIDE_FROM_ABI constexpr _Iter base() const { return __iter_; }
-  _LIBCPP_HIDE_FROM_ABI constexpr reference operator*() const {
-    auto __tmp = __iter_;
-    return *--__tmp;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr pointer operator->() const {
-    if constexpr (is_pointer_v<_Iter>) {
-      return std::prev(__iter_);
-    } else {
-      return std::prev(__iter_).operator->();
-    }
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr iter_rvalue_reference_t<_Iter>
-  iter_move(const __unconstrained_reverse_iterator& __i) noexcept(
-      is_nothrow_copy_constructible_v<_Iter>&& noexcept(ranges::iter_move(--std::declval<_Iter&>()))) {
-    auto __tmp = __i.base();
-    return ranges::iter_move(--__tmp);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator& operator++() {
-    --__iter_;
-    return *this;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator operator++(int) {
-    auto __tmp = *this;
-    --__iter_;
-    return __tmp;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator& operator--() {
-    ++__iter_;
-    return *this;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator operator--(int) {
-    auto __tmp = *this;
-    ++__iter_;
-    return __tmp;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator& operator+=(difference_type __n) {
-    __iter_ -= __n;
-    return *this;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator& operator-=(difference_type __n) {
-    __iter_ += __n;
-    return *this;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator operator+(difference_type __n) const {
-    return __unconstrained_reverse_iterator(__iter_ - __n);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator operator-(difference_type __n) const {
-    return __unconstrained_reverse_iterator(__iter_ + __n);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr difference_type operator-(const __unconstrained_reverse_iterator& __other) const {
-    return __other.__iter_ - __iter_;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr auto operator[](difference_type __n) const { return *(*this + __n); }
-
-  // Deliberately unconstrained unlike the comparison functions in `reverse_iterator` -- see the class comment for the
-  // rationale.
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator==(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() == __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator!=(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() != __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator<(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() > __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator>(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() < __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator<=(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() >= __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator>=(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() <= __rhs.base();
-  }
-};
-
-#endif // _LIBCPP_STD_VER <= 17
-
-template <template <class> class _RevIter1, template <class> class _RevIter2, class _Iter>
-struct __unwrap_reverse_iter_impl {
-  using _UnwrappedIter  = decltype(__unwrap_iter_impl<_Iter>::__unwrap(std::declval<_Iter>()));
-  using _ReverseWrapper = _RevIter1<_RevIter2<_Iter> >;
-
-  static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ReverseWrapper
-  __rewrap(_ReverseWrapper __orig_iter, _UnwrappedIter __unwrapped_iter) {
-    return _ReverseWrapper(
-        _RevIter2<_Iter>(__unwrap_iter_impl<_Iter>::__rewrap(__orig_iter.base().base(), __unwrapped_iter)));
-  }
-
-  static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _UnwrappedIter __unwrap(_ReverseWrapper __i) _NOEXCEPT {
-    return __unwrap_iter_impl<_Iter>::__unwrap(__i.base().base());
-  }
-};
-
 #if _LIBCPP_STD_VER >= 20
 template <ranges::bidirectional_range _Range>
 _LIBCPP_HIDE_FROM_ABI constexpr ranges::subrange<reverse_iterator<ranges::iterator_t<_Range>>,
@@ -493,24 +327,20 @@ __reverse_range(_Range&& __range) {
 #endif
 
 template <class _Iter, bool __b>
-struct __unwrap_iter_impl<reverse_iterator<reverse_iterator<_Iter> >, __b>
-    : __unwrap_reverse_iter_impl<reverse_iterator, reverse_iterator, _Iter> {};
-
-#if _LIBCPP_STD_VER >= 20
-
-template <class _Iter, bool __b>
-struct __unwrap_iter_impl<reverse_iterator<__unconstrained_reverse_iterator<_Iter>>, __b>
-    : __unwrap_reverse_iter_impl<reverse_iterator, __unconstrained_reverse_iterator, _Iter> {};
-
-template <class _Iter, bool __b>
-struct __unwrap_iter_impl<__unconstrained_reverse_iterator<reverse_iterator<_Iter>>, __b>
-    : __unwrap_reverse_iter_impl<__unconstrained_reverse_iterator, reverse_iterator, _Iter> {};
+struct __unwrap_iter_impl<reverse_iterator<reverse_iterator<_Iter> >, __b> {
+  using _UnwrappedIter  = decltype(__unwrap_iter_impl<_Iter>::__unwrap(std::declval<_Iter>()));
+  using _ReverseWrapper = reverse_iterator<reverse_iterator<_Iter> >;
 
-template <class _Iter, bool __b>
-struct __unwrap_iter_impl<__unconstrained_reverse_iterator<__unconstrained_reverse_iterator<_Iter>>, __b>
-    : __unwrap_reverse_iter_impl<__unconstrained_reverse_iterator, __unconstrained_reverse_iterator, _Iter> {};
+  static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ReverseWrapper
+  __rewrap(_ReverseWrapper __orig_iter, _UnwrappedIter __unwrapped_iter) {
+    return _ReverseWrapper(
+        reverse_iterator<_Iter>(__unwrap_iter_impl<_Iter>::__rewrap(__orig_iter.base().base(), __unwrapped_iter)));
+  }
 
-#endif // _LIBCPP_STD_VER >= 20
+  static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _UnwrappedIter __unwrap(_ReverseWrapper __i) _NOEXCEPT {
+    return __unwrap_iter_impl<_Iter>::__unwrap(__i.base().base());
+  }
+};
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__mdspan/mdspan.h b/libcxx/include/__mdspan/mdspan.h
index 684828eb90ec7..d9a0108d6d350 100644
--- a/libcxx/include/__mdspan/mdspan.h
+++ b/libcxx/include/__mdspan/mdspan.h
@@ -27,7 +27,6 @@
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
diff --git a/libcxx/include/__memory/allocator_traits.h b/libcxx/include/__memory/allocator_traits.h
index e79512b2ed146..efdaa5e4ed8a5 100644
--- a/libcxx/include/__memory/allocator_traits.h
+++ b/libcxx/include/__memory/allocator_traits.h
@@ -14,9 +14,8 @@
 #include <__memory/construct_at.h>
 #include <__memory/pointer_traits.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_empty.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/make_unsigned.h>
 #include <__type_traits/remove_reference.h>
 #include <__type_traits/void_t.h>
diff --git a/libcxx/include/__memory/compressed_pair.h b/libcxx/include/__memory/compressed_pair.h
index 373e131a7877d..328849d7cc12d 100644
--- a/libcxx/include/__memory/compressed_pair.h
+++ b/libcxx/include/__memory/compressed_pair.h
@@ -16,7 +16,7 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/dependent_type.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/is_default_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_empty.h>
 #include <__type_traits/is_final.h>
 #include <__type_traits/is_same.h>
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index d90e1449678e5..794a794d8fd85 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -37,8 +37,8 @@
 #include <__type_traits/disjunction.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_bounded_array.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/nat.h>
diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h
index 52cce1cf7c7c6..7475ef5cf85de 100644
--- a/libcxx/include/__memory/uninitialized_algorithms.h
+++ b/libcxx/include/__memory/uninitialized_algorithms.h
@@ -25,10 +25,8 @@
 #include <__type_traits/extent.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_constant_evaluated.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
-#include <__type_traits/is_trivially_move_assignable.h>
-#include <__type_traits/is_trivially_move_constructible.h>
+#include <__type_traits/is_trivially_assignable.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_relocatable.h>
 #include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/negation.h>
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h
index a505dab8dd749..46d9405e31596 100644
--- a/libcxx/include/__memory/unique_ptr.h
+++ b/libcxx/include/__memory/unique_ptr.h
@@ -28,7 +28,6 @@
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_function.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_reference.h>
diff --git a/libcxx/include/__mutex/mutex.h b/libcxx/include/__mutex/mutex.h
index ddc85cf5a00d5..1ed01547126f4 100644
--- a/libcxx/include/__mutex/mutex.h
+++ b/libcxx/include/__mutex/mutex.h
@@ -11,7 +11,7 @@
 
 #include <__config>
 #include <__thread/support.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__ranges/access.h b/libcxx/include/__ranges/access.h
index 263fdd637fd96..3db4f11b40f2d 100644
--- a/libcxx/include/__ranges/access.h
+++ b/libcxx/include/__ranges/access.h
@@ -41,12 +41,11 @@ concept __can_borrow = is_lvalue_reference_v<_Tp> || enable_borrowed_range<remov
 namespace ranges {
 namespace __begin {
 template <class _Tp>
-concept __member_begin = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_begin = __can_borrow<_Tp> && requires(_Tp&& __t) {
   { _LIBCPP_AUTO_CAST(__t.begin()) } -> input_or_output_iterator;
 };
 
-void begin(auto&)       = delete;
-void begin(const auto&) = delete;
+void begin() = delete;
 
 template <class _Tp>
 concept __unqualified_begin =
@@ -104,13 +103,12 @@ using iterator_t = decltype(ranges::begin(std::declval<_Tp&>()));
 namespace ranges {
 namespace __end {
 template <class _Tp>
-concept __member_end = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_end = __can_borrow<_Tp> && requires(_Tp&& __t) {
   typename iterator_t<_Tp>;
   { _LIBCPP_AUTO_CAST(__t.end()) } -> sentinel_for<iterator_t<_Tp>>;
 };
 
-void end(auto&)       = delete;
-void end(const auto&) = delete;
+void end() = delete;
 
 template <class _Tp>
 concept __unqualified_end =
diff --git a/libcxx/include/__ranges/data.h b/libcxx/include/__ranges/data.h
index 18002bb52cc8c..131f6cdad8f21 100644
--- a/libcxx/include/__ranges/data.h
+++ b/libcxx/include/__ranges/data.h
@@ -40,7 +40,7 @@ template <class _Tp>
 concept __ptr_to_object = is_pointer_v<_Tp> && is_object_v<remove_pointer_t<_Tp>>;
 
 template <class _Tp>
-concept __member_data = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_data = __can_borrow<_Tp> && requires(_Tp&& __t) {
   { _LIBCPP_AUTO_CAST(__t.data()) } -> __ptr_to_object;
 };
 
diff --git a/libcxx/include/__ranges/empty.h b/libcxx/include/__ranges/empty.h
index acd55dae224ce..5c1004042aba5 100644
--- a/libcxx/include/__ranges/empty.h
+++ b/libcxx/include/__ranges/empty.h
@@ -29,7 +29,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace ranges {
 namespace __empty {
 template <class _Tp>
-concept __member_empty = __workaround_52970<_Tp> && requires(_Tp&& __t) { bool(__t.empty()); };
+concept __member_empty = requires(_Tp&& __t) { bool(__t.empty()); };
 
 template <class _Tp>
 concept __can_invoke_size = !__member_empty<_Tp> && requires(_Tp&& __t) { ranges::size(__t); };
diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h
index c8314dd848b44..9e6f724241ccf 100644
--- a/libcxx/include/__ranges/iota_view.h
+++ b/libcxx/include/__ranges/iota_view.h
@@ -31,7 +31,7 @@
 #include <__ranges/movable_box.h>
 #include <__ranges/view_interface.h>
 #include <__type_traits/conditional.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/make_unsigned.h>
 #include <__type_traits/type_identity.h>
 #include <__utility/forward.h>
diff --git a/libcxx/include/__ranges/movable_box.h b/libcxx/include/__ranges/movable_box.h
index 9b38877494eaa..5a456cc3a1b66 100644
--- a/libcxx/include/__ranges/movable_box.h
+++ b/libcxx/include/__ranges/movable_box.h
@@ -17,8 +17,6 @@
 #include <__memory/addressof.h>
 #include <__memory/construct_at.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
 #include <__utility/move.h>
 #include <optional>
 
diff --git a/libcxx/include/__ranges/rbegin.h b/libcxx/include/__ranges/rbegin.h
index 7111201ae7d6b..12e739e1a2b85 100644
--- a/libcxx/include/__ranges/rbegin.h
+++ b/libcxx/include/__ranges/rbegin.h
@@ -36,12 +36,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace ranges {
 namespace __rbegin {
 template <class _Tp>
-concept __member_rbegin = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_rbegin = __can_borrow<_Tp> && requires(_Tp&& __t) {
   { _LIBCPP_AUTO_CAST(__t.rbegin()) } -> input_or_output_iterator;
 };
 
-void rbegin(auto&)       = delete;
-void rbegin(const auto&) = delete;
+void rbegin() = delete;
 
 template <class _Tp>
 concept __unqualified_rbegin =
diff --git a/libcxx/include/__ranges/rend.h b/libcxx/include/__ranges/rend.h
index 58d98aafd264b..5edbc4e3160c5 100644
--- a/libcxx/include/__ranges/rend.h
+++ b/libcxx/include/__ranges/rend.h
@@ -37,13 +37,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace ranges {
 namespace __rend {
 template <class _Tp>
-concept __member_rend = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_rend = __can_borrow<_Tp> && requires(_Tp&& __t) {
   ranges::rbegin(__t);
   { _LIBCPP_AUTO_CAST(__t.rend()) } -> sentinel_for<decltype(ranges::rbegin(__t))>;
 };
 
-void rend(auto&)       = delete;
-void rend(const auto&) = delete;
+void rend() = delete;
 
 template <class _Tp>
 concept __unqualified_rend =
diff --git a/libcxx/include/__ranges/size.h b/libcxx/include/__ranges/size.h
index 14e21aae6bf1d..40b0c6b6aad7a 100644
--- a/libcxx/include/__ranges/size.h
+++ b/libcxx/include/__ranges/size.h
@@ -41,14 +41,13 @@ inline constexpr bool disable_sized_range = false;
 
 namespace ranges {
 namespace __size {
-void size(auto&)       = delete;
-void size(const auto&) = delete;
+void size() = delete;
 
 template <class _Tp>
 concept __size_enabled = !disable_sized_range<remove_cvref_t<_Tp>>;
 
 template <class _Tp>
-concept __member_size = __size_enabled<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_size = __size_enabled<_Tp> && requires(_Tp&& __t) {
   { _LIBCPP_AUTO_CAST(__t.size()) } -> __integer_like;
 };
 
diff --git a/libcxx/include/__ranges/view_interface.h b/libcxx/include/__ranges/view_interface.h
index 84dd1c316de37..3bcfbaf3a2f9e 100644
--- a/libcxx/include/__ranges/view_interface.h
+++ b/libcxx/include/__ranges/view_interface.h
@@ -21,6 +21,7 @@
 #include <__ranges/access.h>
 #include <__ranges/concepts.h>
 #include <__ranges/empty.h>
+#include <__ranges/size.h>
 #include <__type_traits/is_class.h>
 #include <__type_traits/make_unsigned.h>
 #include <__type_traits/remove_cv.h>
@@ -51,16 +52,24 @@ class view_interface {
 public:
   template <class _D2 = _Derived>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool empty()
-    requires forward_range<_D2>
+    requires sized_range<_D2> || forward_range<_D2>
   {
-    return ranges::begin(__derived()) == ranges::end(__derived());
+    if constexpr (sized_range<_D2>) {
+      return ranges::size(__derived()) == 0;
+    } else {
+      return ranges::begin(__derived()) == ranges::end(__derived());
+    }
   }
 
   template <class _D2 = _Derived>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool empty() const
-    requires forward_range<const _D2>
+    requires sized_range<const _D2> || forward_range<const _D2>
   {
-    return ranges::begin(__derived()) == ranges::end(__derived());
+    if constexpr (sized_range<const _D2>) {
+      return ranges::size(__derived()) == 0;
+    } else {
+      return ranges::begin(__derived()) == ranges::end(__derived());
+    }
   }
 
   template <class _D2 = _Derived>
diff --git a/libcxx/include/__ranges/zip_view.h b/libcxx/include/__ranges/zip_view.h
index 4898c0afc87a6..ce00a4e53a489 100644
--- a/libcxx/include/__ranges/zip_view.h
+++ b/libcxx/include/__ranges/zip_view.h
@@ -30,7 +30,7 @@
 #include <__ranges/enable_borrowed_range.h>
 #include <__ranges/size.h>
 #include <__ranges/view_interface.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/make_unsigned.h>
 #include <__utility/declval.h>
 #include <__utility/forward.h>
diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer
index aaf955685d2d3..c68349e0979c9 100644
--- a/libcxx/include/__split_buffer
+++ b/libcxx/include/__split_buffer
@@ -26,9 +26,8 @@
 #include <__type_traits/add_lvalue_reference.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_swappable.h>
 #include <__type_traits/is_trivially_destructible.h>
 #include <__type_traits/remove_reference.h>
diff --git a/libcxx/include/__tree b/libcxx/include/__tree
index 0d727ad5c9a72..5a0d8f42a69ce 100644
--- a/libcxx/include/__tree
+++ b/libcxx/include/__tree
@@ -26,11 +26,9 @@
 #include <__type_traits/can_extract_key.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/is_const.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
diff --git a/libcxx/include/__tuple/sfinae_helpers.h b/libcxx/include/__tuple/sfinae_helpers.h
index 90e9b1e580047..35a57ff776592 100644
--- a/libcxx/include/__tuple/sfinae_helpers.h
+++ b/libcxx/include/__tuple/sfinae_helpers.h
@@ -16,6 +16,7 @@
 #include <__tuple/tuple_like_ext.h>
 #include <__tuple/tuple_size.h>
 #include <__tuple/tuple_types.h>
+#include <__type_traits/conjunction.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_constructible.h>
@@ -32,12 +33,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #ifndef _LIBCPP_CXX03_LANG
 
-template <bool... _Preds>
-struct __all_dummy;
-
-template <bool... _Pred>
-struct __all : _IsSame<__all_dummy<_Pred...>, __all_dummy<((void)_Pred, true)...>> {};
-
 struct __tuple_sfinae_base {
   template <template <class, class...> class _Trait, class... _LArgs, class... _RArgs>
   static auto __do_test(__tuple_types<_LArgs...>, __tuple_types<_RArgs...>)
diff --git a/libcxx/include/__tuple/tuple_size.h b/libcxx/include/__tuple/tuple_size.h
index b8320106fb269..668be13a5c841 100644
--- a/libcxx/include/__tuple/tuple_size.h
+++ b/libcxx/include/__tuple/tuple_size.h
@@ -43,7 +43,7 @@ struct _LIBCPP_TEMPLATE_VIS tuple_size<__enable_if_tuple_size_imp< volatile _Tp,
 
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS
-    tuple_size<__enable_if_tuple_size_imp< const volatile _Tp, integral_constant<size_t, sizeof(tuple_size<_Tp>)>>>
+    tuple_size<__enable_if_tuple_size_imp<const volatile _Tp, integral_constant<size_t, sizeof(tuple_size<_Tp>)>>>
     : public integral_constant<size_t, tuple_size<_Tp>::value> {};
 
 #else
@@ -63,6 +63,11 @@ struct _LIBCPP_TEMPLATE_VIS tuple_size<tuple<_Tp...> > : public integral_constan
 template <class... _Tp>
 struct _LIBCPP_TEMPLATE_VIS tuple_size<__tuple_types<_Tp...> > : public integral_constant<size_t, sizeof...(_Tp)> {};
 
+#  if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr size_t tuple_size_v = tuple_size<_Tp>::value;
+#  endif
+
 #endif // _LIBCPP_CXX03_LANG
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__type_traits/conjunction.h b/libcxx/include/__type_traits/conjunction.h
index 4bfa5a27300d1..c2995591bbc28 100644
--- a/libcxx/include/__type_traits/conjunction.h
+++ b/libcxx/include/__type_traits/conjunction.h
@@ -13,6 +13,7 @@
 #include <__type_traits/conditional.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
+#include <__type_traits/is_same.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -37,6 +38,12 @@ false_type __and_helper(...);
 template <class... _Pred>
 using _And _LIBCPP_NODEBUG = decltype(std::__and_helper<_Pred...>(0));
 
+template <bool... _Preds>
+struct __all_dummy;
+
+template <bool... _Pred>
+struct __all : _IsSame<__all_dummy<_Pred...>, __all_dummy<((void)_Pred, true)...> > {};
+
 #if _LIBCPP_STD_VER >= 17
 
 template <class...>
diff --git a/libcxx/include/__type_traits/is_assignable.h b/libcxx/include/__type_traits/is_assignable.h
index 11134b1c1abaf..1398d42e75467 100644
--- a/libcxx/include/__type_traits/is_assignable.h
+++ b/libcxx/include/__type_traits/is_assignable.h
@@ -10,6 +10,9 @@
 #define _LIBCPP___TYPE_TRAITS_IS_ASSIGNABLE_H
 
 #include <__config>
+#include <__type_traits/add_const.h>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,6 +29,26 @@ template <class _Tp, class _Arg>
 inline constexpr bool is_assignable_v = __is_assignable(_Tp, _Arg);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_copy_assignable
+    : public integral_constant<
+          bool,
+          __is_assignable(__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_copy_assignable_v = is_copy_assignable<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_move_assignable
+    : public integral_constant<bool, __is_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_move_assignable_v = is_move_assignable<_Tp>::value;
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_constructible.h b/libcxx/include/__type_traits/is_constructible.h
index 4e62eb061fd59..e2753156709e2 100644
--- a/libcxx/include/__type_traits/is_constructible.h
+++ b/libcxx/include/__type_traits/is_constructible.h
@@ -10,6 +10,9 @@
 #define _LIBCPP___TYPE_IS_CONSTRUCTIBLE_H
 
 #include <__config>
+#include <__type_traits/add_const.h>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,6 +29,33 @@ template <class _Tp, class... _Args>
 inline constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_copy_constructible
+    : public integral_constant<bool, __is_constructible(_Tp, __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {
+};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_copy_constructible_v = is_copy_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_move_constructible
+    : public integral_constant<bool, __is_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_move_constructible_v = is_move_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_default_constructible : public integral_constant<bool, __is_constructible(_Tp)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_default_constructible_v = __is_constructible(_Tp);
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_IS_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_copy_assignable.h b/libcxx/include/__type_traits/is_copy_assignable.h
deleted file mode 100644
index e607ace540437..0000000000000
--- a/libcxx/include/__type_traits/is_copy_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_COPY_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_COPY_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_copy_assignable
-    : public integral_constant<
-          bool,
-          __is_assignable(__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_copy_assignable_v = is_copy_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_COPY_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_copy_constructible.h b/libcxx/include/__type_traits/is_copy_constructible.h
deleted file mode 100644
index 402f2b89875bb..0000000000000
--- a/libcxx/include/__type_traits/is_copy_constructible.h
+++ /dev/null
@@ -1,35 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_COPY_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_COPY_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_copy_constructible
-    : public integral_constant<bool, __is_constructible(_Tp, __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {
-};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_copy_constructible_v = is_copy_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_COPY_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_default_constructible.h b/libcxx/include/__type_traits/is_default_constructible.h
deleted file mode 100644
index e73e9b98f5dca..0000000000000
--- a/libcxx/include/__type_traits/is_default_constructible.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_DEFAULT_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_DEFAULT_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_default_constructible : public integral_constant<bool, __is_constructible(_Tp)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_default_constructible_v = __is_constructible(_Tp);
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_DEFAULT_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_implicitly_default_constructible.h b/libcxx/include/__type_traits/is_implicitly_default_constructible.h
index 576166e52698a..d5dadd7b870dd 100644
--- a/libcxx/include/__type_traits/is_implicitly_default_constructible.h
+++ b/libcxx/include/__type_traits/is_implicitly_default_constructible.h
@@ -11,7 +11,7 @@
 
 #include <__config>
 #include <__type_traits/integral_constant.h>
-#include <__type_traits/is_default_constructible.h>
+#include <__type_traits/is_constructible.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__type_traits/is_move_assignable.h b/libcxx/include/__type_traits/is_move_assignable.h
deleted file mode 100644
index 867bc00d824ae..0000000000000
--- a/libcxx/include/__type_traits/is_move_assignable.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_MOVE_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_MOVE_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_move_assignable
-    : public integral_constant<bool, __is_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_move_assignable_v = is_move_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_MOVE_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_move_constructible.h b/libcxx/include/__type_traits/is_move_constructible.h
deleted file mode 100644
index 40ec4a070816b..0000000000000
--- a/libcxx/include/__type_traits/is_move_constructible.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_MOVE_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_MOVE_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_move_constructible
-    : public integral_constant<bool, __is_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_move_constructible_v = is_move_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_MOVE_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_assignable.h b/libcxx/include/__type_traits/is_nothrow_assignable.h
index cc57493313859..adbf0cde93481 100644
--- a/libcxx/include/__type_traits/is_nothrow_assignable.h
+++ b/libcxx/include/__type_traits/is_nothrow_assignable.h
@@ -10,6 +10,9 @@
 #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_ASSIGNABLE_H
 
 #include <__config>
+#include <__type_traits/add_const.h>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -27,6 +30,28 @@ template <class _Tp, class _Arg>
 inline constexpr bool is_nothrow_assignable_v = __is_nothrow_assignable(_Tp, _Arg);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_assignable
+    : public integral_constant<bool,
+                               __is_nothrow_assignable(__add_lvalue_reference_t<_Tp>,
+                                                       __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_copy_assignable_v = is_nothrow_copy_assignable<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_assignable
+    : public integral_constant<bool,
+                               __is_nothrow_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {
+};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_move_assignable_v = is_nothrow_move_assignable<_Tp>::value;
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_constructible.h b/libcxx/include/__type_traits/is_nothrow_constructible.h
index f56816b943c4c..fdf23c4dabfea 100644
--- a/libcxx/include/__type_traits/is_nothrow_constructible.h
+++ b/libcxx/include/__type_traits/is_nothrow_constructible.h
@@ -10,6 +10,9 @@
 #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_CONSTRUCTIBLE_H
 
 #include <__config>
+#include <__type_traits/add_const.h>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_reference.h>
@@ -66,6 +69,57 @@ template <class _Tp, class... _Args>
 inline constexpr bool is_nothrow_constructible_v = is_nothrow_constructible<_Tp, _Args...>::value;
 #endif
 
+// TODO: remove this implementation once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 is fixed
+#ifdef _LIBCPP_COMPILER_GCC
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_constructible
+    : public is_nothrow_constructible<_Tp, __add_lvalue_reference_t<typename add_const<_Tp>::type> > {};
+
+#else // _LIBCPP_COMPILER_GCC
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_constructible
+    : public integral_constant<
+          bool,
+          __is_nothrow_constructible(_Tp, typename add_lvalue_reference<typename add_const<_Tp>::type>::type)> {};
+
+#endif // _LIBCPP_COMPILER_GCC
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_copy_constructible_v = is_nothrow_copy_constructible<_Tp>::value;
+#endif
+
+// TODO: remove this implementation once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 is fixed
+#ifndef _LIBCPP_COMPILER_GCC
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_constructible
+    : public integral_constant<bool, __is_nothrow_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
+
+#else // _LIBCPP_COMPILER_GCC
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_constructible
+    : public is_nothrow_constructible<_Tp, __add_rvalue_reference_t<_Tp> > {};
+
+#endif // _LIBCPP_COMPILER_GCC
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_move_constructible_v = is_nothrow_move_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_default_constructible
+    : public integral_constant<bool, __is_nothrow_constructible(_Tp)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_default_constructible_v = __is_nothrow_constructible(_Tp);
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_copy_assignable.h b/libcxx/include/__type_traits/is_nothrow_copy_assignable.h
deleted file mode 100644
index a97e962b308c8..0000000000000
--- a/libcxx/include/__type_traits/is_nothrow_copy_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_assignable
-    : public integral_constant<bool,
-                               __is_nothrow_assignable(__add_lvalue_reference_t<_Tp>,
-                                                       __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_copy_assignable_v = is_nothrow_copy_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_copy_constructible.h b/libcxx/include/__type_traits/is_nothrow_copy_constructible.h
deleted file mode 100644
index dd7f1d5ef6276..0000000000000
--- a/libcxx/include/__type_traits/is_nothrow_copy_constructible.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-#include <__type_traits/is_nothrow_constructible.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-// TODO: remove this implementation once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 is fixed
-#ifdef _LIBCPP_COMPILER_GCC
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_constructible
-    : public is_nothrow_constructible<_Tp, __add_lvalue_reference_t<typename add_const<_Tp>::type> > {};
-
-#else // _LIBCPP_COMPILER_GCC
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_constructible
-    : public integral_constant<
-          bool,
-          __is_nothrow_constructible(_Tp, typename add_lvalue_reference<typename add_const<_Tp>::type>::type)> {};
-
-#endif // _LIBCPP_COMPILER_GCC
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_copy_constructible_v = is_nothrow_copy_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_default_constructible.h b/libcxx/include/__type_traits/is_nothrow_default_constructible.h
deleted file mode 100644
index 58f31f21b0267..0000000000000
--- a/libcxx/include/__type_traits/is_nothrow_default_constructible.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DEFAULT_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DEFAULT_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_default_constructible
-    : public integral_constant<bool, __is_nothrow_constructible(_Tp)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_default_constructible_v = __is_nothrow_constructible(_Tp);
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DEFAULT_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_move_assignable.h b/libcxx/include/__type_traits/is_nothrow_move_assignable.h
deleted file mode 100644
index aa87e369cd5d0..0000000000000
--- a/libcxx/include/__type_traits/is_nothrow_move_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_assignable
-    : public integral_constant<bool,
-                               __is_nothrow_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {
-};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_move_assignable_v = is_nothrow_move_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_move_constructible.h b/libcxx/include/__type_traits/is_nothrow_move_constructible.h
deleted file mode 100644
index dab5a019b33bf..0000000000000
--- a/libcxx/include/__type_traits/is_nothrow_move_constructible.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-#include <__type_traits/is_nothrow_constructible.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-// TODO: remove this implementation once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 is fixed
-#ifndef _LIBCPP_COMPILER_GCC
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_constructible
-    : public integral_constant<bool, __is_nothrow_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
-
-#else // _LIBCPP_COMPILER_GCC
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_constructible
-    : public is_nothrow_constructible<_Tp, __add_rvalue_reference_t<_Tp> > {};
-
-#endif // _LIBCPP_COMPILER_GCC
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_move_constructible_v = is_nothrow_move_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_scoped_enum.h b/libcxx/include/__type_traits/is_scoped_enum.h
index 43b3a6b66b1f5..1db88e13356e8 100644
--- a/libcxx/include/__type_traits/is_scoped_enum.h
+++ b/libcxx/include/__type_traits/is_scoped_enum.h
@@ -22,6 +22,18 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 23
+
+// TODO: GCC and Clang both have this builtin. Remove the false case once we've updated to GCC 14.
+#  if __has_builtin(__is_scoped_enum)
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_scoped_enum : bool_constant<__is_scoped_enum(_Tp)> {};
+
+template <class _Tp>
+inline constexpr bool is_scoped_enum_v = __is_scoped_enum(_Tp);
+
+#  else
+
 template <class _Tp, bool = is_enum_v<_Tp> >
 struct __is_scoped_enum_helper : false_type {};
 
@@ -33,7 +45,10 @@ struct _LIBCPP_TEMPLATE_VIS is_scoped_enum : public __is_scoped_enum_helper<_Tp>
 
 template <class _Tp>
 inline constexpr bool is_scoped_enum_v = is_scoped_enum<_Tp>::value;
-#endif
+
+#  endif // __has_builtin(__is_scoped_enum)
+
+#endif // _LIBCPP_STD_VER >= 23
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_swappable.h b/libcxx/include/__type_traits/is_swappable.h
index 06e092692df3d..06b59e5c25d04 100644
--- a/libcxx/include/__type_traits/is_swappable.h
+++ b/libcxx/include/__type_traits/is_swappable.h
@@ -13,10 +13,10 @@
 #include <__type_traits/add_lvalue_reference.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_referenceable.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_void.h>
diff --git a/libcxx/include/__type_traits/is_trivially_assignable.h b/libcxx/include/__type_traits/is_trivially_assignable.h
index 19169d13d6d23..c0b8deed93f62 100644
--- a/libcxx/include/__type_traits/is_trivially_assignable.h
+++ b/libcxx/include/__type_traits/is_trivially_assignable.h
@@ -10,6 +10,9 @@
 #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_ASSIGNABLE_H
 
 #include <__config>
+#include <__type_traits/add_const.h>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,6 +29,28 @@ template <class _Tp, class _Arg>
 inline constexpr bool is_trivially_assignable_v = __is_trivially_assignable(_Tp, _Arg);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_assignable
+    : public integral_constant<bool,
+                               __is_trivially_assignable(__add_lvalue_reference_t<_Tp>,
+                                                         __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_copy_assignable_v = is_trivially_copy_assignable<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_move_assignable
+    : public integral_constant<
+          bool,
+          __is_trivially_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_move_assignable_v = is_trivially_move_assignable<_Tp>::value;
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_constructible.h b/libcxx/include/__type_traits/is_trivially_constructible.h
index 4faaf9323cd5b..3a77e9fe164da 100644
--- a/libcxx/include/__type_traits/is_trivially_constructible.h
+++ b/libcxx/include/__type_traits/is_trivially_constructible.h
@@ -10,6 +10,8 @@
 #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_CONSTRUCTIBLE_H
 
 #include <__config>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -27,6 +29,33 @@ template <class _Tp, class... _Args>
 inline constexpr bool is_trivially_constructible_v = __is_trivially_constructible(_Tp, _Args...);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_constructible
+    : public integral_constant<bool, __is_trivially_constructible(_Tp, __add_lvalue_reference_t<const _Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_copy_constructible_v = is_trivially_copy_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_move_constructible
+    : public integral_constant<bool, __is_trivially_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_move_constructible_v = is_trivially_move_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_default_constructible
+    : public integral_constant<bool, __is_trivially_constructible(_Tp)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_default_constructible_v = __is_trivially_constructible(_Tp);
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_copy_assignable.h b/libcxx/include/__type_traits/is_trivially_copy_assignable.h
deleted file mode 100644
index b6b3c119746f5..0000000000000
--- a/libcxx/include/__type_traits/is_trivially_copy_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_assignable
-    : public integral_constant<bool,
-                               __is_trivially_assignable(__add_lvalue_reference_t<_Tp>,
-                                                         __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_copy_assignable_v = is_trivially_copy_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_copy_constructible.h b/libcxx/include/__type_traits/is_trivially_copy_constructible.h
deleted file mode 100644
index 8e71fd1fbf8b3..0000000000000
--- a/libcxx/include/__type_traits/is_trivially_copy_constructible.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_constructible
-    : public integral_constant<bool, __is_trivially_constructible(_Tp, __add_lvalue_reference_t<const _Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_copy_constructible_v = is_trivially_copy_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_default_constructible.h b/libcxx/include/__type_traits/is_trivially_default_constructible.h
deleted file mode 100644
index c3b6152a9fcad..0000000000000
--- a/libcxx/include/__type_traits/is_trivially_default_constructible.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_DEFAULT_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_DEFAULT_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_default_constructible
-    : public integral_constant<bool, __is_trivially_constructible(_Tp)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_default_constructible_v = __is_trivially_constructible(_Tp);
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_DEFAULT_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_move_assignable.h b/libcxx/include/__type_traits/is_trivially_move_assignable.h
deleted file mode 100644
index daf890b26c743..0000000000000
--- a/libcxx/include/__type_traits/is_trivially_move_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_move_assignable
-    : public integral_constant<
-          bool,
-          __is_trivially_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_move_assignable_v = is_trivially_move_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_move_constructible.h b/libcxx/include/__type_traits/is_trivially_move_constructible.h
deleted file mode 100644
index 71e6f898fb3f3..0000000000000
--- a/libcxx/include/__type_traits/is_trivially_move_constructible.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_move_constructible
-    : public integral_constant<bool, __is_trivially_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_move_constructible_v = is_trivially_move_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/noexcept_move_assign_container.h b/libcxx/include/__type_traits/noexcept_move_assign_container.h
index 1d6c07e8cf96d..baaf36d9980e9 100644
--- a/libcxx/include/__type_traits/noexcept_move_assign_container.h
+++ b/libcxx/include/__type_traits/noexcept_move_assign_container.h
@@ -12,7 +12,7 @@
 #include <__config>
 #include <__memory/allocator_traits.h>
 #include <__type_traits/integral_constant.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
+#include <__type_traits/is_nothrow_assignable.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__type_traits/remove_reference.h b/libcxx/include/__type_traits/remove_reference.h
index fd66417bd84fe..ba67891758adc 100644
--- a/libcxx/include/__type_traits/remove_reference.h
+++ b/libcxx/include/__type_traits/remove_reference.h
@@ -10,7 +10,6 @@
 #define _LIBCPP___TYPE_TRAITS_REMOVE_REFERENCE_H
 
 #include <__config>
-#include <cstddef>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -26,15 +25,16 @@ struct remove_reference {
 
 template <class _Tp>
 using __libcpp_remove_reference_t = __remove_reference_t(_Tp);
-#else
-// clang-format off
-template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference        {typedef _LIBCPP_NODEBUG _Tp type;};
-template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&>  {typedef _LIBCPP_NODEBUG _Tp type;};
-template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&&> {typedef _LIBCPP_NODEBUG _Tp type;};
-// clang-format on
+#elif __has_builtin(__remove_reference)
+template <class _Tp>
+struct remove_reference {
+  using type _LIBCPP_NODEBUG = __remove_reference(_Tp);
+};
 
 template <class _Tp>
 using __libcpp_remove_reference_t = typename remove_reference<_Tp>::type;
+#else
+#  error "remove_reference not implemented!"
 #endif // __has_builtin(__remove_reference_t)
 
 #if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/__utility/exception_guard.h b/libcxx/include/__utility/exception_guard.h
index 8d90dfd5f1907..93290be40e714 100644
--- a/libcxx/include/__utility/exception_guard.h
+++ b/libcxx/include/__utility/exception_guard.h
@@ -11,7 +11,7 @@
 
 #include <__assert>
 #include <__config>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__utility/exchange.h>
 #include <__utility/move.h>
 
diff --git a/libcxx/include/__utility/exchange.h b/libcxx/include/__utility/exchange.h
index 72312c06b5886..957e9d0acaa65 100644
--- a/libcxx/include/__utility/exchange.h
+++ b/libcxx/include/__utility/exchange.h
@@ -11,7 +11,7 @@
 
 #include <__config>
 #include <__type_traits/is_nothrow_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__utility/forward.h>
 #include <__utility/move.h>
 
diff --git a/libcxx/include/__utility/move.h b/libcxx/include/__utility/move.h
index 86d7fc88473bd..626535c8b1ee3 100644
--- a/libcxx/include/__utility/move.h
+++ b/libcxx/include/__utility/move.h
@@ -12,8 +12,8 @@
 
 #include <__config>
 #include <__type_traits/conditional.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/remove_reference.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h
index b488a9829c384..0056806877e13 100644
--- a/libcxx/include/__utility/pair.h
+++ b/libcxx/include/__utility/pair.h
@@ -30,16 +30,9 @@
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_implicitly_default_constructible.h>
-#include <__type_traits/is_move_assignable.h>
 #include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_assignable.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
 #include <__type_traits/nat.h>
diff --git a/libcxx/include/__utility/small_buffer.h b/libcxx/include/__utility/small_buffer.h
index f3415a5e50f99..9e13797573d2d 100644
--- a/libcxx/include/__utility/small_buffer.h
+++ b/libcxx/include/__utility/small_buffer.h
@@ -12,8 +12,8 @@
 #include <__config>
 #include <__memory/construct_at.h>
 #include <__type_traits/decay.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__utility/exception_guard.h>
 #include <__utility/forward.h>
 #include <cstddef>
diff --git a/libcxx/include/__utility/swap.h b/libcxx/include/__utility/swap.h
index ca8280c729df1..d678d2cba7dc9 100644
--- a/libcxx/include/__utility/swap.h
+++ b/libcxx/include/__utility/swap.h
@@ -10,10 +10,10 @@
 #define _LIBCPP___UTILITY_SWAP_H
 
 #include <__config>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_swappable.h>
 #include <__utility/declval.h>
 #include <__utility/move.h>
diff --git a/libcxx/include/any b/libcxx/include/any
index ce54803cd91b5..a6212fedfa2cd 100644
--- a/libcxx/include/any
+++ b/libcxx/include/any
@@ -92,9 +92,8 @@ namespace std {
 #include <__type_traits/conditional.h>
 #include <__type_traits/decay.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_copy_constructible.h>
 #include <__type_traits/is_function.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_void.h>
diff --git a/libcxx/include/array b/libcxx/include/array
index 7fa5dc1479349..977f913c3e748 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -123,12 +123,11 @@ template <size_t I, class T, size_t N> const T&& get(const array<T, N>&&) noexce
 #include <__iterator/wrap_iter.h>
 #include <__tuple/sfinae_helpers.h>
 #include <__type_traits/conditional.h>
+#include <__type_traits/conjunction.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_const.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
 #include <__type_traits/remove_cv.h>
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index b3b260c2a998e..5bab3f8ad5cf0 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -878,10 +878,12 @@ constexpr chrono::year                                  operator ""y(unsigned lo
 #  include <cstring>
 #  include <forward_list>
 #  include <string>
+#  include <tuple>
 #endif
 
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER == 20
 #  include <charconv>
+#  include <locale>
 #endif
 
 #endif // _LIBCPP_CHRONO
diff --git a/libcxx/include/complex b/libcxx/include/complex
index e996485a38ae6..a81f968143c4c 100644
--- a/libcxx/include/complex
+++ b/libcxx/include/complex
@@ -258,6 +258,7 @@ template<class T> complex<T> tanh (const complex<T>&);
 
 #include <__config>
 #include <__fwd/complex.h>
+#include <__fwd/tuple.h>
 #include <__tuple/tuple_element.h>
 #include <__tuple/tuple_size.h>
 #include <__utility/move.h>
@@ -1443,15 +1444,9 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const complex<_Tp>& __x) {
 
 // [complex.tuple], tuple interface
 
-template <class _Tp>
-struct tuple_size;
-
 template <class _Tp>
 struct tuple_size<complex<_Tp>> : integral_constant<size_t, 2> {};
 
-template <size_t _Ip, class _Tp>
-struct tuple_element;
-
 template <size_t _Ip, class _Tp>
 struct tuple_element<_Ip, complex<_Tp>> {
   static_assert(_Ip < 2, "Index value is out of range.");
diff --git a/libcxx/include/coroutine b/libcxx/include/coroutine
index 4bd1d4e9c3103..b1ba83b541b4b 100644
--- a/libcxx/include/coroutine
+++ b/libcxx/include/coroutine
@@ -56,6 +56,7 @@ struct suspend_always;
 
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <iosfwd>
+#  include <limits>
 #  include <type_traits>
 #endif
 
diff --git a/libcxx/include/cwchar b/libcxx/include/cwchar
index 7442438d8f447..4cc6f56c389bf 100644
--- a/libcxx/include/cwchar
+++ b/libcxx/include/cwchar
@@ -254,4 +254,8 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __constexpr_wmemchr(_Tp
 
 _LIBCPP_END_NAMESPACE_STD
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <cstddef>
+#endif
+
 #endif // _LIBCPP_CWCHAR
diff --git a/libcxx/include/execution b/libcxx/include/execution
index 822ffa1fd3ebc..94d434b2e4603 100644
--- a/libcxx/include/execution
+++ b/libcxx/include/execution
@@ -142,4 +142,8 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <cstddef>
+#endif
+
 #endif // _LIBCPP_EXECUTION
diff --git a/libcxx/include/experimental/memory b/libcxx/include/experimental/memory
index 7e698fe9dee77..e9663d43a8ab7 100644
--- a/libcxx/include/experimental/memory
+++ b/libcxx/include/experimental/memory
@@ -191,4 +191,8 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP_ENABLE_EXPERIMENTAL
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <limits>
+#endif
+
 #endif /* _LIBCPP_EXPERIMENTAL_MEMORY */
diff --git a/libcxx/include/format b/libcxx/include/format
index c0485c5a10359..146613464534f 100644
--- a/libcxx/include/format
+++ b/libcxx/include/format
@@ -221,4 +221,8 @@ namespace std {
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <locale>
+#endif
+
 #endif // _LIBCPP_FORMAT
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index a62b171a46783..1dcbe8f88b90d 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -221,9 +221,8 @@ template <class T, class Allocator, class Predicate>
 #include <__type_traits/conditional.h>
 #include <__type_traits/is_allocator.h>
 #include <__type_traits/is_const.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/type_identity.h>
diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index 776641b347e6c..7a084d114b185 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -308,6 +308,43 @@ private:
   state_type __st_;
   state_type __st_last_;
   ios_base::openmode __om_;
+  // There have been no file operations yet, which allows setting unbuffered
+  // I/O mode.
+  static const ios_base::openmode __no_io_operations = ios_base::trunc;
+  // Unbuffered I/O mode has been requested.
+  static const ios_base::openmode __use_unbuffered_io = ios_base::ate;
+  // Used to track the currently used mode and track whether the output should
+  // be unbuffered.
+  // [filebuf.virtuals]/12
+  //   If setbuf(0, 0) is called on a stream before any I/O has occurred on
+  //   that stream, the stream becomes unbuffered. Otherwise the results are
+  //   implementation-defined.
+  // This allows calling setbuf(0, 0)
+  // - before opening a file,
+  // - after opening a file, before
+  //   - a read
+  //   - a write
+  //   - a seek.
+  // Note that opening a file with ios_base::ate does a seek operation.
+  // Normally underflow, overflow, and sync change this flag to ios_base::in,
+  // ios_base_out, or 0.
+  //
+  // The ios_base::trunc and ios_base::ate flags are not used in __cm_. They
+  // are used to track the state of the unbuffered request. For readability
+  // they have the aliases __no_io_operations and __use_unbuffered_io
+  // respectively.
+  //
+  // The __no_io_operations and __use_unbuffered_io flags are used in the
+  // following way:
+  // - __no_io_operations is set upon construction to indicate the unbuffered
+  //   state can be set.
+  // - When requesting unbuffered output:
+  //   - If the file is open it sets the mode.
+  //   - Else places a request by adding the __use_unbuffered_io flag.
+  // - When a file is opened it checks whether both __no_io_operations and
+  //   __use_unbuffered_io are set. If so switches to unbuffered mode.
+  // - All file I/O operations change the mode effectively clearing the
+  //   __no_io_operations and __use_unbuffered_io flags.
   ios_base::openmode __cm_;
   bool __owns_eb_;
   bool __owns_ib_;
@@ -327,7 +364,13 @@ private:
       return nullptr;
 
     __om_ = __mode;
+    if (__cm_ == (__no_io_operations | __use_unbuffered_io)) {
+      std::setbuf(__file_, nullptr);
+      __cm_ = 0;
+    }
+
     if (__mode & ios_base::ate) {
+      __cm_ = 0;
       if (fseek(__file_, 0, SEEK_END)) {
         fclose(__file_);
         __file_ = nullptr;
@@ -337,6 +380,20 @@ private:
 
     return this;
   }
+
+  // If the file is already open, switch to unbuffered mode. Otherwise, record
+  // the request to use unbuffered mode so that we use that mode when we
+  // eventually open the file.
+  _LIBCPP_HIDE_FROM_ABI void __request_unbuffered_mode(char_type* __s, streamsize __n) {
+    if (__cm_ == __no_io_operations && __s == nullptr && __n == 0) {
+      if (__file_) {
+        std::setbuf(__file_, nullptr);
+        __cm_ = 0;
+      } else {
+        __cm_ = __no_io_operations | __use_unbuffered_io;
+      }
+    }
+  }
 };
 
 template <class _CharT, class _Traits>
@@ -352,7 +409,7 @@ basic_filebuf<_CharT, _Traits>::basic_filebuf()
       __st_(),
       __st_last_(),
       __om_(0),
-      __cm_(0),
+      __cm_(__no_io_operations),
       __owns_eb_(false),
       __owns_ib_(false),
       __always_noconv_(false) {
@@ -810,6 +867,7 @@ template <class _CharT, class _Traits>
 basic_streambuf<_CharT, _Traits>* basic_filebuf<_CharT, _Traits>::setbuf(char_type* __s, streamsize __n) {
   this->setg(nullptr, nullptr, nullptr);
   this->setp(nullptr, nullptr);
+  __request_unbuffered_mode(__s, __n);
   if (__owns_eb_)
     delete[] __extbuf_;
   if (__owns_ib_)
diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp
index b1e728cde868d..77b7befd44f56 100644
--- a/libcxx/include/libcxx.imp
+++ b/libcxx/include/libcxx.imp
@@ -751,10 +751,7 @@
   { include: [ "<__type_traits/is_constant_evaluated.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_convertible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_copy_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_copy_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_core_convertible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_default_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_destructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_empty.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_enum.h>", "private", "<type_traits>", "public" ] },
@@ -770,17 +767,10 @@
   { include: [ "<__type_traits/is_member_function_pointer.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_member_object_pointer.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_member_pointer.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_move_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_move_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_nothrow_assignable.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_nothrow_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_nothrow_convertible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_copy_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_copy_constructible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_default_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_nothrow_destructible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_move_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_move_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_null_pointer.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_object.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_pod.h>", "private", "<type_traits>", "public" ] },
@@ -801,14 +791,9 @@
   { include: [ "<__type_traits/is_trivial.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_assignable.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_constructible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_copy_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_copy_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_copyable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_default_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_destructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_lexicographically_comparable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_move_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_move_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_relocatable.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_unbounded_array.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_union.h>", "private", "<type_traits>", "public" ] },
diff --git a/libcxx/include/limits b/libcxx/include/limits
index f15b5b1ab1d52..f022048cc2655 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -137,9 +137,9 @@ protected:
   typedef _Tp type;
 
   static _LIBCPP_CONSTEXPR const bool is_specialized = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return type(); }
 
   static _LIBCPP_CONSTEXPR const int digits       = 0;
   static _LIBCPP_CONSTEXPR const int digits10     = 0;
@@ -148,8 +148,8 @@ protected:
   static _LIBCPP_CONSTEXPR const bool is_integer  = false;
   static _LIBCPP_CONSTEXPR const bool is_exact    = false;
   static _LIBCPP_CONSTEXPR const int radix        = 0;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(); }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = 0;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = 0;
@@ -161,10 +161,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = false;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(); }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = false;
   static _LIBCPP_CONSTEXPR const bool is_bounded = false;
@@ -198,15 +198,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int max_digits10 = 0;
   static _LIBCPP_CONSTEXPR const type __min       = __libcpp_compute_min<type, digits, is_signed>::value;
   static _LIBCPP_CONSTEXPR const type __max       = is_signed ? type(type(~0) ^ __min) : type(~0);
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = true;
   static _LIBCPP_CONSTEXPR const bool is_exact   = true;
   static _LIBCPP_CONSTEXPR const int radix       = 2;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = 0;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = 0;
@@ -218,10 +218,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = false;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = false;
   static _LIBCPP_CONSTEXPR const bool is_bounded = true;
@@ -249,15 +249,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int max_digits10 = 0;
   static _LIBCPP_CONSTEXPR const type __min       = false;
   static _LIBCPP_CONSTEXPR const type __max       = true;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = true;
   static _LIBCPP_CONSTEXPR const bool is_exact   = true;
   static _LIBCPP_CONSTEXPR const int radix       = 2;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = 0;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = 0;
@@ -269,10 +269,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = false;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = false;
   static _LIBCPP_CONSTEXPR const bool is_bounded = true;
@@ -294,15 +294,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int digits       = __FLT_MANT_DIG__;
   static _LIBCPP_CONSTEXPR const int digits10     = __FLT_DIG__;
   static _LIBCPP_CONSTEXPR const int max_digits10 = 2 + (digits * 30103l) / 100000l;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __FLT_MIN__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __FLT_MAX__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __FLT_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __FLT_MAX__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = false;
   static _LIBCPP_CONSTEXPR const bool is_exact   = false;
   static _LIBCPP_CONSTEXPR const int radix       = __FLT_RADIX__;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __FLT_EPSILON__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5F; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __FLT_EPSILON__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5F; }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = __FLT_MIN_EXP__;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = __FLT_MIN_10_EXP__;
@@ -314,10 +314,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = true;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_valf(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nanf(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nansf(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __FLT_DENORM_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_valf(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nanf(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nansf(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __FLT_DENORM_MIN__; }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = true;
   static _LIBCPP_CONSTEXPR const bool is_bounded = true;
@@ -343,15 +343,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int digits       = __DBL_MANT_DIG__;
   static _LIBCPP_CONSTEXPR const int digits10     = __DBL_DIG__;
   static _LIBCPP_CONSTEXPR const int max_digits10 = 2 + (digits * 30103l) / 100000l;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __DBL_MIN__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __DBL_MAX__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __DBL_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __DBL_MAX__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = false;
   static _LIBCPP_CONSTEXPR const bool is_exact   = false;
   static _LIBCPP_CONSTEXPR const int radix       = __FLT_RADIX__;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __DBL_EPSILON__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __DBL_EPSILON__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5; }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = __DBL_MIN_EXP__;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = __DBL_MIN_10_EXP__;
@@ -363,10 +363,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = true;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_val(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nan(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nans(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __DBL_DENORM_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_val(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nan(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nans(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __DBL_DENORM_MIN__; }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = true;
   static _LIBCPP_CONSTEXPR const bool is_bounded = true;
@@ -392,15 +392,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int digits       = __LDBL_MANT_DIG__;
   static _LIBCPP_CONSTEXPR const int digits10     = __LDBL_DIG__;
   static _LIBCPP_CONSTEXPR const int max_digits10 = 2 + (digits * 30103l) / 100000l;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __LDBL_MIN__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __LDBL_MAX__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __LDBL_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __LDBL_MAX__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = false;
   static _LIBCPP_CONSTEXPR const bool is_exact   = false;
   static _LIBCPP_CONSTEXPR const int radix       = __FLT_RADIX__;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __LDBL_EPSILON__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5L; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __LDBL_EPSILON__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5L; }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = __LDBL_MIN_EXP__;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = __LDBL_MIN_10_EXP__;
@@ -412,10 +412,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = true;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_vall(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nanl(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nansl(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __LDBL_DENORM_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_vall(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nanl(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nansl(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __LDBL_DENORM_MIN__; }
 
 #if defined(__powerpc__) && defined(__LONG_DOUBLE_IBM128__)
   static _LIBCPP_CONSTEXPR const bool is_iec559 = false;
@@ -441,9 +441,9 @@ class _LIBCPP_TEMPLATE_VIS numeric_limits : private __libcpp_numeric_limits<_Tp>
 
 public:
   static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); }
 
   static _LIBCPP_CONSTEXPR const int digits       = __base::digits;
   static _LIBCPP_CONSTEXPR const int digits10     = __base::digits10;
@@ -452,8 +452,8 @@ public:
   static _LIBCPP_CONSTEXPR const bool is_integer  = __base::is_integer;
   static _LIBCPP_CONSTEXPR const bool is_exact    = __base::is_exact;
   static _LIBCPP_CONSTEXPR const int radix        = __base::radix;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = __base::min_exponent;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10;
@@ -467,10 +467,10 @@ public:
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = __base::has_denorm_loss;
   _LIBCPP_SUPPRESS_DEPRECATED_POP
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = __base::is_iec559;
   static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded;
diff --git a/libcxx/include/list b/libcxx/include/list
index 8f0689268e2a5..9de3d1f60a1f7 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -227,9 +227,8 @@ template <class T, class Allocator, class Predicate>
 #include <__ranges/from_range.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/is_allocator.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/type_identity.h>
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 0bd2831b7f159..f36a47cef0097 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1902,7 +1902,6 @@ module std_private_type_traits_is_core_convertible                       [system
   header "__type_traits/is_core_convertible.h"
   export std_private_type_traits_integral_constant
 }
-module std_private_type_traits_is_default_constructible                  [system] { header "__type_traits/is_default_constructible.h" }
 module std_private_type_traits_is_destructible                           [system] { header "__type_traits/is_destructible.h" }
 module std_private_type_traits_is_empty                                  [system] { header "__type_traits/is_empty.h" }
 module std_private_type_traits_is_enum                                   [system] {
@@ -1927,26 +1926,16 @@ module std_private_type_traits_is_literal_type                           [system
 module std_private_type_traits_is_member_function_pointer                [system] { header "__type_traits/is_member_function_pointer.h" }
 module std_private_type_traits_is_member_object_pointer                  [system] { header "__type_traits/is_member_object_pointer.h" }
 module std_private_type_traits_is_member_pointer                         [system] { header "__type_traits/is_member_pointer.h" }
-module std_private_type_traits_is_move_assignable                        [system] { header "__type_traits/is_move_assignable.h" }
-module std_private_type_traits_is_move_constructible                     [system] { header "__type_traits/is_move_constructible.h" }
 module std_private_type_traits_is_nothrow_assignable                     [system] { header "__type_traits/is_nothrow_assignable.h" }
-module std_private_type_traits_is_nothrow_constructible                  [system] { header "__type_traits/is_nothrow_constructible.h" }
-module std_private_type_traits_is_nothrow_convertible                    [system] { header "__type_traits/is_nothrow_convertible.h" }
-module std_private_type_traits_is_nothrow_copy_assignable                [system] { header "__type_traits/is_nothrow_copy_assignable.h" }
-module std_private_type_traits_is_nothrow_copy_constructible             [system] { header "__type_traits/is_nothrow_copy_constructible.h" }
-module std_private_type_traits_is_nothrow_default_constructible          [system] {
-  header "__type_traits/is_nothrow_default_constructible.h"
+module std_private_type_traits_is_nothrow_constructible                  [system] {
+  header "__type_traits/is_nothrow_constructible.h"
   export std_private_type_traits_integral_constant
 }
+module std_private_type_traits_is_nothrow_convertible                    [system] { header "__type_traits/is_nothrow_convertible.h" }
 module std_private_type_traits_is_nothrow_destructible                   [system] {
   header "__type_traits/is_nothrow_destructible.h"
   export std_private_type_traits_is_destructible
 }
-module std_private_type_traits_is_nothrow_move_assignable                [system] { header "__type_traits/is_nothrow_move_assignable.h" }
-module std_private_type_traits_is_nothrow_move_constructible             [system] {
-  header "__type_traits/is_nothrow_move_constructible.h"
-  export std_private_type_traits_is_nothrow_constructible
-}
 module std_private_type_traits_is_null_pointer                           [system] {
   header "__type_traits/is_null_pointer.h"
   export std_cstddef
@@ -1985,14 +1974,9 @@ module std_private_type_traits_is_swappable                              [system
 module std_private_type_traits_is_trivial                                [system] { header "__type_traits/is_trivial.h" }
 module std_private_type_traits_is_trivially_assignable                   [system] { header "__type_traits/is_trivially_assignable.h" }
 module std_private_type_traits_is_trivially_constructible                [system] { header "__type_traits/is_trivially_constructible.h" }
-module std_private_type_traits_is_trivially_copy_assignable              [system] { header "__type_traits/is_trivially_copy_assignable.h" }
-module std_private_type_traits_is_trivially_copy_constructible           [system] { header "__type_traits/is_trivially_copy_constructible.h" }
 module std_private_type_traits_is_trivially_copyable                     [system] { header "__type_traits/is_trivially_copyable.h" }
-module std_private_type_traits_is_trivially_default_constructible        [system] { header "__type_traits/is_trivially_default_constructible.h" }
 module std_private_type_traits_is_trivially_destructible                 [system] { header "__type_traits/is_trivially_destructible.h" }
 module std_private_type_traits_is_trivially_lexicographically_comparable [system] { header "__type_traits/is_trivially_lexicographically_comparable.h" }
-module std_private_type_traits_is_trivially_move_assignable              [system] { header "__type_traits/is_trivially_move_assignable.h" }
-module std_private_type_traits_is_trivially_move_constructible           [system] { header "__type_traits/is_trivially_move_constructible.h" }
 module std_private_type_traits_is_trivially_relocatable                  [system] { header "__type_traits/is_trivially_relocatable.h" }
 module std_private_type_traits_is_unbounded_array                        [system] { header "__type_traits/is_unbounded_array.h" }
 module std_private_type_traits_is_union                                  [system] { header "__type_traits/is_union.h" }
diff --git a/libcxx/include/mutex b/libcxx/include/mutex
index ea56e3051908a..12fae9a88b9d7 100644
--- a/libcxx/include/mutex
+++ b/libcxx/include/mutex
@@ -418,24 +418,6 @@ inline _LIBCPP_HIDE_FROM_ABI void lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&...
   std::__lock_first(0, __l0, __l1, __l2, __l3...);
 }
 
-template <class _L0>
-inline _LIBCPP_HIDE_FROM_ABI void __unlock(_L0& __l0) {
-  __l0.unlock();
-}
-
-template <class _L0, class _L1>
-inline _LIBCPP_HIDE_FROM_ABI void __unlock(_L0& __l0, _L1& __l1) {
-  __l0.unlock();
-  __l1.unlock();
-}
-
-template <class _L0, class _L1, class _L2, class... _L3>
-inline _LIBCPP_HIDE_FROM_ABI void __unlock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3) {
-  __l0.unlock();
-  __l1.unlock();
-  std::__unlock(__l2, __l3...);
-}
-
 #  endif // _LIBCPP_CXX03_LANG
 
 #  if _LIBCPP_STD_VER >= 17
@@ -498,7 +480,7 @@ public:
 private:
   template <size_t... _Indx>
   _LIBCPP_HIDE_FROM_ABI static void __unlock_unpack(__tuple_indices<_Indx...>, _MutexTuple& __mt) {
-    std::__unlock(std::get<_Indx>(__mt)...);
+    (std::get<_Indx>(__mt).unlock(), ...);
   }
 
   _MutexTuple __t_;
diff --git a/libcxx/include/optional b/libcxx/include/optional
index 99bfd0dd900d3..a16e48502e250 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -200,22 +200,16 @@ namespace std {
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
 #include <__type_traits/is_destructible.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_object.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_scalar.h>
 #include <__type_traits/is_swappable.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
+#include <__type_traits/is_trivially_assignable.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_assignable.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__type_traits/negation.h>
 #include <__type_traits/remove_const.h>
 #include <__type_traits/remove_cvref.h>
@@ -1291,6 +1285,7 @@ _LIBCPP_POP_MACROS
 #  include <concepts>
 #  include <ctime>
 #  include <iterator>
+#  include <limits>
 #  include <memory>
 #  include <ratio>
 #  include <stdexcept>
diff --git a/libcxx/include/string b/libcxx/include/string
index ca5b3fa6a0147..a456f8cb80ee3 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -603,8 +603,8 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
 #include <__type_traits/is_allocator.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_standard_layout.h>
 #include <__type_traits/is_trivial.h>
diff --git a/libcxx/include/tuple b/libcxx/include/tuple
index e63e4e25a7d2b..f78db061b844b 100644
--- a/libcxx/include/tuple
+++ b/libcxx/include/tuple
@@ -232,20 +232,11 @@ template <class... Types>
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_empty.h>
 #include <__type_traits/is_final.h>
 #include <__type_traits/is_implicitly_default_constructible.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_assignable.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
@@ -1369,9 +1360,6 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_T1, _T2>::pair(
       second(std::forward<_Args2>(std::get<_I2>(__second_args))...) {}
 
 #  if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr size_t tuple_size_v = tuple_size<_Tp>::value;
-
 #    define _LIBCPP_NOEXCEPT_RETURN(...)                                                                               \
       noexcept(noexcept(__VA_ARGS__)) { return __VA_ARGS__; }
 
diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits
index 54c8abec340c4..10f9b881c0e4a 100644
--- a/libcxx/include/type_traits
+++ b/libcxx/include/type_traits
@@ -458,9 +458,6 @@ namespace std
 #include <__type_traits/is_constant_evaluated.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_destructible.h>
 #include <__type_traits/is_empty.h>
 #include <__type_traits/is_enum.h>
@@ -474,17 +471,10 @@ namespace std
 #include <__type_traits/is_member_function_pointer.h>
 #include <__type_traits/is_member_object_pointer.h>
 #include <__type_traits/is_member_pointer.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_nothrow_convertible.h>
-#include <__type_traits/is_nothrow_copy_assignable.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
 #include <__type_traits/is_nothrow_destructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
 #include <__type_traits/is_null_pointer.h>
 #include <__type_traits/is_object.h>
 #include <__type_traits/is_pod.h>
@@ -503,13 +493,8 @@ namespace std
 #include <__type_traits/is_trivial.h>
 #include <__type_traits/is_trivially_assignable.h>
 #include <__type_traits/is_trivially_constructible.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
 #include <__type_traits/is_trivially_copyable.h>
-#include <__type_traits/is_trivially_default_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_assignable.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/is_union.h>
 #include <__type_traits/is_unsigned.h>
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 59d7f9b740f3b..f5e99fc3239e8 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -228,17 +228,16 @@ namespace std {
 #include <__type_traits/add_pointer.h>
 #include <__type_traits/add_volatile.h>
 #include <__type_traits/common_type.h>
+#include <__type_traits/conjunction.h>
 #include <__type_traits/dependent_type.h>
 #include <__type_traits/is_array.h>
-#include <__type_traits/is_default_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_destructible.h>
 #include <__type_traits/is_nothrow_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
+#include <__type_traits/is_trivially_assignable.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_assignable.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/remove_const.h>
 #include <__type_traits/type_identity.h>
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 89cbdf0b3ff74..0908482600c53 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -346,7 +346,7 @@ template<class T, class charT> requires is-vector-bool-reference<T> // Since C++
 #include <__split_buffer>
 #include <__type_traits/is_allocator.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
+#include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/noexcept_move_assign_container.h>
 #include <__type_traits/type_identity.h>
 #include <__utility/exception_guard.h>
@@ -2990,6 +2990,7 @@ _LIBCPP_POP_MACROS
 #  include <concepts>
 #  include <cstdlib>
 #  include <locale>
+#  include <tuple>
 #  include <type_traits>
 #  include <typeinfo>
 #  include <utility>
diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt
index 0dea8cfca94ac..8dd1b51aac2f7 100644
--- a/libcxx/modules/CMakeLists.txt
+++ b/libcxx/modules/CMakeLists.txt
@@ -206,9 +206,15 @@ add_custom_target(generate-cxx-modules
 # Configure the modules manifest.
 # Use the relative path between the installation and the module in the json
 # file. This allows moving the entire installation to a different location.
+cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_LIBRARY_DIR
+  BASE_DIRECTORY ${CMAKE_INSTALL_PREFIX}
+  OUTPUT_VARIABLE ABS_LIBRARY_DIR)
+cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_MODULES_DIR
+  BASE_DIRECTORY ${CMAKE_INSTALL_PREFIX}
+  OUTPUT_VARIABLE ABS_MODULES_DIR)
 file(RELATIVE_PATH LIBCXX_MODULE_RELATIVE_PATH
-  ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_LIBRARY_DIR}
-  ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_MODULES_DIR})
+  ${ABS_LIBRARY_DIR}
+  ${ABS_MODULES_DIR})
 configure_file(
   "modules.json.in"
   "${LIBCXX_LIBRARY_DIR}/libc++.modules.json"
diff --git a/libcxx/modules/CMakeLists.txt.in b/libcxx/modules/CMakeLists.txt.in
index e332d70cc1633..c35f6fecb1fd7 100644
--- a/libcxx/modules/CMakeLists.txt.in
+++ b/libcxx/modules/CMakeLists.txt.in
@@ -50,10 +50,15 @@ endif()
 target_compile_options(std
   PUBLIC
     -nostdinc++
+    @LIBCXX_COMPILE_FLAGS@
+)
+target_compile_options(std
+  PRIVATE
     -Wno-reserved-module-identifier
     -Wno-reserved-user-defined-literal
-    @LIBCXX_COMPILE_FLAGS@
 )
+target_link_options(std PUBLIC -nostdlib++ -Wl,-rpath,@LIBCXX_LIBRARY_DIR@ -L@LIBCXX_LIBRARY_DIR@)
+target_link_libraries(std c++)
 set_target_properties(std
   PROPERTIES
     OUTPUT_NAME   "c++std"
@@ -67,7 +72,7 @@ target_sources(std.compat
     std.compat.cppm
 )
 
-target_include_directories(std.compat SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@)
+target_include_directories(std.compat SYSTEM PUBLIC @LIBCXX_CONFIGURED_INCLUDE_DIRS@)
 
 if (NOT @LIBCXX_ENABLE_EXCEPTIONS@)
   target_compile_options(std.compat PUBLIC -fno-exceptions)
@@ -76,13 +81,16 @@ endif()
 target_compile_options(std.compat
   PUBLIC
     -nostdinc++
+    @LIBCXX_COMPILE_FLAGS@
+)
+target_compile_options(std.compat
+ PRIVATE
     -Wno-reserved-module-identifier
     -Wno-reserved-user-defined-literal
-	-fmodule-file=std=${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/std.dir/std.pcm
-    @LIBCXX_COMPILE_FLAGS@
 )
 set_target_properties(std.compat
   PROPERTIES
     OUTPUT_NAME   "c++std.compat"
 )
 add_dependencies(std.compat std)
+target_link_libraries(std.compat PUBLIC std c++)
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 07ffc8bfdaae3..1110a79ddcacd 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -301,7 +301,10 @@ if (LIBCXX_ENABLE_STATIC)
     # then its code shouldn't declare them with hidden visibility.  They might
     # actually be provided by a shared library at link time.
     if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS)
-      append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden)
+      append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete=force-hidden)
+      if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG)
+        append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden)
+      endif()
     endif()
     target_compile_options(cxx_static PRIVATE ${CXX_STATIC_LIBRARY_FLAGS})
     # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in __config_site
diff --git a/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp
index 83f90ac4184bf..9c780ae98d1e8 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp
@@ -31,7 +31,8 @@ struct MyAlloc {
 int main(int, char**)
 {
     std::vector<bool, MyAlloc<bool>> vb;
-    std::vector<bool, MyAlloc<bool>> wb(100);
+    // std::fill_n triggers ADL because __bit_iterator has the container type as a template argument
+    // std::vector<bool, MyAlloc<bool>> wb(100);
 
     std::vector<int, MyAlloc<int>> v;
     std::vector<int, MyAlloc<int>> w(100);
diff --git a/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.compile.pass.cpp
new file mode 100644
index 0000000000000..a93f55b888b95
--- /dev/null
+++ b/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.compile.pass.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_NODISCARD_EXT
+
+// Check that <limits> functions aren't marked [[nodiscard]] when
+// _LIBCPP_DISABLE_NODISCARD_EXT is defined
+
+#include <limits>
+
+#include "test_macros.h"
+
+void func() {
+  // arithmetic
+  std::numeric_limits<int>::min();
+  std::numeric_limits<int>::max();
+  std::numeric_limits<int>::lowest();
+  std::numeric_limits<int>::epsilon();
+  std::numeric_limits<int>::round_error();
+  std::numeric_limits<int>::infinity();
+  std::numeric_limits<int>::quiet_NaN();
+  std::numeric_limits<int>::signaling_NaN();
+  std::numeric_limits<int>::denorm_min();
+  // bool
+  std::numeric_limits<bool>::min();
+  std::numeric_limits<bool>::max();
+  std::numeric_limits<bool>::lowest();
+  std::numeric_limits<bool>::epsilon();
+  std::numeric_limits<bool>::round_error();
+  std::numeric_limits<bool>::infinity();
+  std::numeric_limits<bool>::quiet_NaN();
+  std::numeric_limits<bool>::signaling_NaN();
+  std::numeric_limits<bool>::denorm_min();
+  // float
+  std::numeric_limits<float>::min();
+  std::numeric_limits<float>::max();
+  std::numeric_limits<float>::lowest();
+  std::numeric_limits<float>::epsilon();
+  std::numeric_limits<float>::round_error();
+  std::numeric_limits<float>::infinity();
+  std::numeric_limits<float>::quiet_NaN();
+  std::numeric_limits<float>::signaling_NaN();
+  std::numeric_limits<float>::denorm_min();
+  // double
+  std::numeric_limits<double>::min();
+  std::numeric_limits<double>::max();
+  std::numeric_limits<double>::lowest();
+  std::numeric_limits<double>::epsilon();
+  std::numeric_limits<double>::round_error();
+  std::numeric_limits<double>::infinity();
+  std::numeric_limits<double>::quiet_NaN();
+  std::numeric_limits<double>::signaling_NaN();
+  std::numeric_limits<double>::denorm_min();
+  // long double
+  std::numeric_limits<long double>::min();
+  std::numeric_limits<long double>::max();
+  std::numeric_limits<long double>::lowest();
+  std::numeric_limits<long double>::epsilon();
+  std::numeric_limits<long double>::round_error();
+  std::numeric_limits<long double>::infinity();
+  std::numeric_limits<long double>::quiet_NaN();
+  std::numeric_limits<long double>::signaling_NaN();
+  std::numeric_limits<long double>::denorm_min();
+}
diff --git a/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.verify.cpp
new file mode 100644
index 0000000000000..7a81b84378c52
--- /dev/null
+++ b/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.verify.cpp
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// check that <limits> functions are marked [[nodiscard]]
+
+#include <limits>
+
+#include "test_macros.h"
+
+void func() {
+  // clang-format off
+  // arithmetic
+  std::numeric_limits<int>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // bool
+  std::numeric_limits<bool>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // float
+  std::numeric_limits<float>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // double
+  std::numeric_limits<double>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // long double
+  std::numeric_limits<long double>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/equal.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/equal.pass.cpp
deleted file mode 100644
index 583e733c07cb0..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/equal.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <BidirectionalIterator Iter1, BidirectionalIterator Iter2>
-//   requires HasEqualTo<Iter1, Iter2>
-// bool operator==(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 == r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s), true);
-    test(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s+1), false);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), true);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), false);
-    test(s, s, true);
-    test(s, s+1, false);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater-equal.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater-equal.pass.cpp
deleted file mode 100644
index 9e908418d0756..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater-equal.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasGreater<Iter1, Iter2>
-// bool operator>=(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 >= r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), true);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), true);
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s), false);
-    test(s, s, true);
-    test(s, s+1, true);
-    test(s+1, s, false);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater.pass.cpp
deleted file mode 100644
index f1afd23bab133..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasGreater<Iter1, Iter2>
-// bool operator>(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 > r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), false);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), true);
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s), false);
-    test(s, s, false);
-    test(s, s+1, true);
-    test(s+1, s, false);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less-equal.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less-equal.pass.cpp
deleted file mode 100644
index c710212308fac..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less-equal.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasGreater<Iter1, Iter2>
-// bool operator<=(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 <= r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), true);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), false);
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s), true);
-    test(s, s, true);
-    test(s, s+1, false);
-    test(s+1, s, true);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less.pass.cpp
deleted file mode 100644
index ffd3a0323373f..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasGreater<Iter1, Iter2>
-// bool operator<(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 < r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), false);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), false);
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s), true);
-    test(s, s, false);
-    test(s, s+1, false);
-    test(s+1, s, true);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/not-equal.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/not-equal.pass.cpp
deleted file mode 100644
index 614f159cc8052..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/not-equal.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <BidirectionalIterator Iter1, BidirectionalIterator Iter2>
-//   requires HasEqualTo<Iter1, Iter2>
-// bool operator!=(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 != r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s), false);
-    test(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s+1), true);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), false);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), true);
-    test(s, s, false);
-    test(s, s+1, true);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/assign.LWG3435.verify.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/assign.LWG3435.verify.cpp
deleted file mode 100644
index 835e2b65c19c2..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/assign.LWG3435.verify.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <class U>
-//  requires !same_as<U, Iter> && convertible_to<const U&, Iter> && assignable_from<Iter&, const U&>
-// __unconstrained_reverse_iterator& operator=(const __unconstrained_reverse_iterator<U>& u);
-
-#include <iterator>
-
-struct Base { };
-struct Derived : Base { };
-
-void test() {
-    std::__unconstrained_reverse_iterator<Base*> base;
-    std::__unconstrained_reverse_iterator<Derived*> derived;
-    derived = base; // expected-error {{no viable overloaded '='}}
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.default.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.default.pass.cpp
deleted file mode 100644
index 66972d7243cc8..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.default.pass.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator(); // constexpr since C++17
-
-#include <iterator>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test() {
-    std::__unconstrained_reverse_iterator<It> r;
-    (void)r;
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    test<bidirectional_iterator<const char*> >();
-    test<random_access_iterator<char*> >();
-    test<char*>();
-    test<const char*>();
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.explicit.verify.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.explicit.verify.cpp
deleted file mode 100644
index 6440e284f6a01..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.explicit.verify.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// explicit __unconstrained_reverse_iterator(Iter x);
-
-// test explicitness
-
-#include <iterator>
-
-void f() {
-    char const* it = "";
-    std::__unconstrained_reverse_iterator<char const*> r = it; // expected-error{{no viable conversion from 'const char *' to 'std::__unconstrained_reverse_iterator<const char *>'}}
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.pass.cpp
deleted file mode 100644
index e4d0874d50b5e..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// explicit __unconstrained_reverse_iterator(Iter x); // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    assert(r.base() == i);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char s[] = "123";
-    test(bidirectional_iterator<const char*>(s));
-    test(random_access_iterator<const char*>(s));
-    test(s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.reverse_iterator.LWG3435.verify.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.reverse_iterator.LWG3435.verify.cpp
deleted file mode 100644
index 7ea4a61ce6602..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.reverse_iterator.LWG3435.verify.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <class U>
-//  requires !same_as<U, Iter> && convertible_to<const U&, Iter>
-// __unconstrained_reverse_iterator(const __unconstrained_reverse_iterator<U> &);
-
-#include <iterator>
-
-struct Base { };
-struct Derived : Base { };
-
-void test() {
-    std::__unconstrained_reverse_iterator<Base*> base;
-    std::__unconstrained_reverse_iterator<Derived*> derived(base); // expected-error {{no matching constructor for initialization of 'std::__unconstrained_reverse_iterator<Derived *>'}}
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.conv/base.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.conv/base.pass.cpp
deleted file mode 100644
index 7fd85c92b3277..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.conv/base.pass.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// iterator_type base() const; // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-TEST_CONSTEXPR_CXX17 bool test() {
-    typedef bidirectional_iterator<int*> Iter;
-    int i = 0;
-    Iter iter(&i);
-    std::__unconstrained_reverse_iterator<Iter> const reverse(iter);
-    std::__unconstrained_reverse_iterator<Iter>::iterator_type base = reverse.base();
-    assert(base == Iter(&i));
-    return true;
-}
-
-int main(int, char**) {
-    test();
-#if TEST_STD_VER > 14
-    static_assert(test(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/arrow.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/arrow.pass.cpp
deleted file mode 100644
index f0a181bcba88f..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/arrow.pass.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// pointer operator->() const; // constexpr in C++17
-
-// Be sure to respect LWG 198:
-//    http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#198
-// LWG 198 was superseded by LWG 2360
-//    http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#2360
-
-
-#include <iterator>
-#include <list>
-#include <cassert>
-
-#include "test_macros.h"
-
-class A
-{
-    int data_;
-public:
-    A() : data_(1) {}
-    A(const A&) = default;
-    A& operator=(const A&) = default;
-    ~A() {data_ = -1;}
-
-    int get() const {return data_;}
-
-    friend bool operator==(const A& x, const A& y)
-        {return x.data_ == y.data_;}
-};
-
-template <class It>
-void
-test(It i, typename std::iterator_traits<It>::value_type x)
-{
-    std::__unconstrained_reverse_iterator<It> r(i);
-    assert(r->get() == x.get());
-}
-
-class B
-{
-    int data_;
-public:
-    B(int d=1) : data_(d) {}
-    B(const B&) = default;
-    B& operator=(const B&) = default;
-    ~B() {data_ = -1;}
-
-    int get() const {return data_;}
-
-    friend bool operator==(const B& x, const B& y)
-        {return x.data_ == y.data_;}
-    const B *operator&() const { return nullptr; }
-    B       *operator&()       { return nullptr; }
-};
-
-class C
-{
-    int data_;
-public:
-    TEST_CONSTEXPR C() : data_(1) {}
-
-    TEST_CONSTEXPR int get() const {return data_;}
-
-    friend TEST_CONSTEXPR bool operator==(const C& x, const C& y)
-        {return x.data_ == y.data_;}
-};
-
-TEST_CONSTEXPR  C gC;
-
-int main(int, char**)
-{
-  A a;
-  test(&a+1, A());
-
-  {
-    std::list<B> l;
-    l.push_back(B(0));
-    l.push_back(B(1));
-    l.push_back(B(2));
-
-    {
-      std::list<B>::const_iterator i = l.begin();
-      assert ( i->get() == 0 );  ++i;
-      assert ( i->get() == 1 );  ++i;
-      assert ( i->get() == 2 );  ++i;
-      assert ( i == l.end ());
-    }
-
-    {
-      std::list<B>::const_reverse_iterator ri = l.rbegin();
-      assert ( ri->get() == 2 );  ++ri;
-      assert ( ri->get() == 1 );  ++ri;
-      assert ( ri->get() == 0 );  ++ri;
-      assert ( ri == l.rend ());
-    }
-  }
-
-#if TEST_STD_VER > 14
-  {
-    typedef std::__unconstrained_reverse_iterator<const C *> RI;
-    constexpr RI it1 = RI(&gC+1);
-
-    static_assert(it1->get() == gC.get(), "");
-  }
-#endif
-  {
-    ((void)gC);
-  }
-
-  return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/bracket.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/bracket.pass.cpp
deleted file mode 100644
index f9beada9e4e64..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/bracket.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// unspecified operator[](difference_type n) const; // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i,
-                               typename std::iterator_traits<It>::difference_type n,
-                               typename std::iterator_traits<It>::value_type x) {
-    typedef typename std::iterator_traits<It>::value_type value_type;
-    const std::__unconstrained_reverse_iterator<It> r(i);
-    value_type rr = r[n];
-    assert(rr == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 4, '1');
-    test(random_access_iterator<const char*>(s+5), 0, '5');
-    test(s+5, 4, '1');
-    test(s+5, 0, '5');
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/dereference.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/dereference.pass.cpp
deleted file mode 100644
index bd6b6e0df038d..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/dereference.pass.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// reference operator*() const; // constexpr in C++17
-
-// Be sure to respect LWG 198:
-//    http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#198
-// LWG 198 was superseded by LWG 2360
-//    http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#2360
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-
-class A
-{
-    int data_;
-public:
-    A() : data_(1) {}
-    A(const A&) = default;
-    A& operator=(const A&) = default;
-    ~A() {data_ = -1;}
-
-    friend bool operator==(const A& x, const A& y)
-        {return x.data_ == y.data_;}
-};
-
-template <class It>
-void
-test(It i, typename std::iterator_traits<It>::value_type x)
-{
-    std::__unconstrained_reverse_iterator<It> r(i);
-    assert(*r == x);
-}
-
-int main(int, char**)
-{
-    A a;
-    test(&a+1, A());
-
-#if TEST_STD_VER > 14
-    {
-        constexpr const char *p = "123456789";
-        typedef std::__unconstrained_reverse_iterator<const char *> RI;
-        constexpr RI it1 = RI(p+1);
-        constexpr RI it2 = RI(p+2);
-        static_assert(*it1 == p[0], "");
-        static_assert(*it2 == p[1], "");
-    }
-#endif
-
-  return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/decrement-assign.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/decrement-assign.pass.cpp
deleted file mode 100644
index 48be8a7399e42..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/decrement-assign.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// __unconstrained_reverse_iterator& operator-=(difference_type n); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits<It>::difference_type n, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It>& rr = r -= n;
-    assert(r.base() == x);
-    assert(&rr == &r);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 5, random_access_iterator<const char*>(s+10));
-    test(s+5, 5, s+10);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/increment-assign.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/increment-assign.pass.cpp
deleted file mode 100644
index 115d95e1485cd..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/increment-assign.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// __unconstrained_reverse_iterator& operator+=(difference_type n); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits<It>::difference_type n, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It>& rr = r += n;
-    assert(r.base() == x);
-    assert(&rr == &r);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    char const* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 5, random_access_iterator<const char*>(s));
-    test(s+5, 5, s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/minus.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/minus.pass.cpp
deleted file mode 100644
index c3a4d1fd9e36f..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/minus.pass.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// __unconstrained_reverse_iterator operator-(difference_type n) const; // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits<It>::difference_type n, It x) {
-    const std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It> rr = r - n;
-    assert(rr.base() == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 5, random_access_iterator<const char*>(s+10));
-    test(s+5, 5, s+10);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/plus.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/plus.pass.cpp
deleted file mode 100644
index 164c5abe8a353..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/plus.pass.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// __unconstrained_reverse_iterator operator+(difference_type n) const; // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits<It>::difference_type n, It x) {
-    const std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It> rr = r + n;
-    assert(rr.base() == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 5, random_access_iterator<const char*>(s));
-    test(s+5, 5, s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postdecrement.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postdecrement.pass.cpp
deleted file mode 100644
index 3220c1f9b1eb1..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postdecrement.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator operator--(int); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It> rr = r--;
-    assert(r.base() == x);
-    assert(rr.base() == i);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "123";
-    test(bidirectional_iterator<const char*>(s+1), bidirectional_iterator<const char*>(s+2));
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s+2));
-    test(s+1, s+2);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postincrement.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postincrement.pass.cpp
deleted file mode 100644
index 47477fe89545b..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postincrement.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator operator++(int); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It> rr = r++;
-    assert(r.base() == x);
-    assert(rr.base() == i);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "123";
-    test(bidirectional_iterator<const char*>(s+1), bidirectional_iterator<const char*>(s));
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s));
-    test(s+1, s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/predecrement.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/predecrement.pass.cpp
deleted file mode 100644
index 6ad41aeaf17a2..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/predecrement.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator& operator--(); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It>& rr = --r;
-    assert(r.base() == x);
-    assert(&rr == &r);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "123";
-    test(bidirectional_iterator<const char*>(s+1), bidirectional_iterator<const char*>(s+2));
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s+2));
-    test(s+1, s+2);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/preincrement.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/preincrement.pass.cpp
deleted file mode 100644
index 9c7e5b41738e9..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/preincrement.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator& operator++(); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It>& rr = ++r;
-    assert(r.base() == x);
-    assert(&rr == &r);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "123";
-    test(bidirectional_iterator<const char*>(s+1), bidirectional_iterator<const char*>(s));
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s));
-    test(s+1, s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nonmember/minus.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nonmember/minus.pass.cpp
deleted file mode 100644
index 632e2655dea07..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nonmember/minus.pass.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasMinus<Iter2, Iter1>
-// auto operator-(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y) // constexpr in C++17
-//  -> decltype(y.base() - x.base());
-
-#include <iterator>
-#include <cstddef>
-#include <cassert>
-#include <type_traits>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class, class, class = void> struct HasMinus : std::false_type {};
-template <class R1, class R2> struct HasMinus<R1, R2, decltype((R1() - R2(), void()))> : std::true_type {};
-
-template <class It1, class It2>
-TEST_CONSTEXPR_CXX17 void test(It1 l, It2 r, std::ptrdiff_t x) {
-    const std::__unconstrained_reverse_iterator<It1> r1(l);
-    const std::__unconstrained_reverse_iterator<It2> r2(r);
-    assert((r1 - r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    char s[3] = {0};
-
-    // Test same base iterator type
-    test(s, s, 0);
-    test(s, s+1, 1);
-    test(s+1, s, -1);
-
-    // Test non-subtractable base iterator types
-    static_assert( HasMinus<std::__unconstrained_reverse_iterator<int*>, std::__unconstrained_reverse_iterator<int*> >::value, "");
-#if TEST_STD_VER >= 11
-    static_assert(!HasMinus<std::__unconstrained_reverse_iterator<int*>, std::__unconstrained_reverse_iterator<char*> >::value, "");
-#endif
-
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/types.compile.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/types.compile.pass.cpp
deleted file mode 100644
index f8ffef364f37a..0000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/types.compile.pass.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// Test nested types and data member:
-
-// template <BidirectionalIterator Iter>
-// class __unconstrained_reverse_iterator {
-// protected:
-//   Iter current;
-// public:
-//   iterator<typename iterator_traits<Iterator>::iterator_category,
-//   typename iterator_traits<Iterator>::value_type,
-//   typename iterator_traits<Iterator>::difference_type,
-//   typename iterator_traits<Iterator>::pointer,
-//   typename iterator_traits<Iterator>::reference> {
-// };
-
-#include <iterator>
-#include <type_traits>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-void test() {
-  typedef std::__unconstrained_reverse_iterator<It> R;
-  typedef std::iterator_traits<It> T;
-  static_assert((std::is_same<typename R::iterator_type, It>::value), "");
-  static_assert((std::is_same<typename R::value_type, typename T::value_type>::value), "");
-  static_assert((std::is_same<typename R::difference_type, typename T::difference_type>::value), "");
-  static_assert((std::is_same<typename R::reference, typename T::reference>::value), "");
-  static_assert((std::is_same<typename R::pointer, typename std::iterator_traits<It>::pointer>::value), "");
-
-#if TEST_STD_VER <= 14
-  typedef std::iterator<typename T::iterator_category, typename T::value_type> iterator_base;
-  static_assert((std::is_base_of<iterator_base, R>::value), "");
-#endif
-#if TEST_STD_VER > 17
-  if constexpr (std::is_same_v<typename T::iterator_category, std::contiguous_iterator_tag>) {
-    static_assert((std::is_same<typename R::iterator_category, std::random_access_iterator_tag>::value), "");
-  } else {
-    static_assert((std::is_same<typename R::iterator_category, typename T::iterator_category>::value), "");
-  }
-#else
-  static_assert((std::is_same<typename R::iterator_category, typename T::iterator_category>::value), "");
-#endif
-}
-
-#if TEST_STD_VER > 17
-
-struct FooIter {
-  using iterator_category = std::bidirectional_iterator_tag;
-  using value_type = void*;
-  using difference_type = void*;
-  using pointer = void*;
-  using reference = int&;
-  int& operator*() const;
-};
-template <>
-struct std::indirectly_readable_traits<FooIter> {
-  using value_type = int;
-};
-template <>
-struct std::incrementable_traits<FooIter> {
-  using difference_type = char;
-};
-
-// Not using `FooIter::value_type`.
-static_assert(std::is_same_v<typename std::__unconstrained_reverse_iterator<FooIter>::value_type, int>);
-// Not using `FooIter::difference_type`.
-static_assert(std::is_same_v<typename std::__unconstrained_reverse_iterator<FooIter>::difference_type, char>);
-
-#endif
-
-struct BarIter {
-  bool& operator*() const;
-};
-template <>
-struct std::iterator_traits<BarIter> {
-  using difference_type = char;
-  using value_type = char;
-  using pointer = char*;
-  using reference = char&;
-  using iterator_category = std::bidirectional_iterator_tag;
-};
-
-#if TEST_STD_VER > 17
-  static_assert(std::is_same_v<typename std::__unconstrained_reverse_iterator<BarIter>::reference, bool&>);
-#else
-  static_assert(std::is_same<typename std::__unconstrained_reverse_iterator<BarIter>::reference, char&>::value, "");
-#endif
-
-void test_all() {
-  test<bidirectional_iterator<char*> >();
-  test<random_access_iterator<char*> >();
-  test<char*>();
-}
diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index 678a986e522aa..c65b9b9d705e2 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -266,9 +266,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index c3875fa2cfc06..b3d9e327fc7aa 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -267,9 +267,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index e28e0cd44fed9..d723409422a3e 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -269,9 +269,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index e28e0cd44fed9..d723409422a3e 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -269,9 +269,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index eec71f4fc6282..03b4eda8b4d86 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -113,14 +113,19 @@ charconv type_traits
 charconv version
 chrono array
 chrono bit
+chrono cctype
+chrono cerrno
 chrono charconv
+chrono clocale
 chrono cmath
 chrono compare
 chrono concepts
 chrono cstddef
 chrono cstdint
+chrono cstdlib
 chrono cstring
 chrono ctime
+chrono cwchar
 chrono forward_list
 chrono limits
 chrono locale
@@ -275,9 +280,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index daa3e17698bbe..062127364adfe 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -68,15 +68,20 @@ charconv limits
 charconv new
 charconv version
 chrono array
+chrono cctype
+chrono cerrno
+chrono clocale
 chrono cmath
 chrono compare
 chrono cstddef
 chrono cstdint
+chrono cstdlib
+chrono cstring
 chrono ctime
+chrono cwchar
 chrono forward_list
 chrono initializer_list
 chrono limits
-chrono locale
 chrono new
 chrono optional
 chrono ostream
@@ -86,6 +91,7 @@ chrono stdexcept
 chrono string
 chrono string_view
 chrono tuple
+chrono typeinfo
 chrono vector
 chrono version
 cinttypes cstdint
@@ -132,12 +138,10 @@ coroutine compare
 coroutine cstddef
 coroutine cstdint
 coroutine cstring
-coroutine limits
 coroutine version
 cstddef version
 ctgmath ccomplex
 ctgmath cmath
-cwchar cstddef
 cwchar cwctype
 cwctype cctype
 deque compare
@@ -156,7 +160,6 @@ exception cstdlib
 exception new
 exception typeinfo
 exception version
-execution cstddef
 execution version
 expected cstddef
 expected initializer_list
@@ -167,7 +170,6 @@ experimental/iterator iterator
 experimental/memory cstddef
 experimental/memory cstdint
 experimental/memory cstring
-experimental/memory limits
 experimental/propagate_const cstddef
 experimental/simd cstddef
 experimental/simd cstdint
@@ -187,12 +189,16 @@ filesystem string
 filesystem string_view
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
-format locale
 format new
 format optional
 format queue
@@ -201,6 +207,7 @@ format stdexcept
 format string
 format string_view
 format tuple
+format typeinfo
 format version
 forward_list compare
 forward_list cstddef
@@ -397,7 +404,6 @@ optional cstddef
 optional cstdint
 optional cstring
 optional initializer_list
-optional limits
 optional new
 optional version
 ostream bitset
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index daa3e17698bbe..062127364adfe 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -68,15 +68,20 @@ charconv limits
 charconv new
 charconv version
 chrono array
+chrono cctype
+chrono cerrno
+chrono clocale
 chrono cmath
 chrono compare
 chrono cstddef
 chrono cstdint
+chrono cstdlib
+chrono cstring
 chrono ctime
+chrono cwchar
 chrono forward_list
 chrono initializer_list
 chrono limits
-chrono locale
 chrono new
 chrono optional
 chrono ostream
@@ -86,6 +91,7 @@ chrono stdexcept
 chrono string
 chrono string_view
 chrono tuple
+chrono typeinfo
 chrono vector
 chrono version
 cinttypes cstdint
@@ -132,12 +138,10 @@ coroutine compare
 coroutine cstddef
 coroutine cstdint
 coroutine cstring
-coroutine limits
 coroutine version
 cstddef version
 ctgmath ccomplex
 ctgmath cmath
-cwchar cstddef
 cwchar cwctype
 cwctype cctype
 deque compare
@@ -156,7 +160,6 @@ exception cstdlib
 exception new
 exception typeinfo
 exception version
-execution cstddef
 execution version
 expected cstddef
 expected initializer_list
@@ -167,7 +170,6 @@ experimental/iterator iterator
 experimental/memory cstddef
 experimental/memory cstdint
 experimental/memory cstring
-experimental/memory limits
 experimental/propagate_const cstddef
 experimental/simd cstddef
 experimental/simd cstdint
@@ -187,12 +189,16 @@ filesystem string
 filesystem string_view
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
-format locale
 format new
 format optional
 format queue
@@ -201,6 +207,7 @@ format stdexcept
 format string
 format string_view
 format tuple
+format typeinfo
 format version
 forward_list compare
 forward_list cstddef
@@ -397,7 +404,6 @@ optional cstddef
 optional cstdint
 optional cstring
 optional initializer_list
-optional limits
 optional new
 optional version
 ostream bitset
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
index da56ec30f128b..481d565961b2b 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
@@ -14,62 +14,91 @@
 //   fill(Iter first, Iter last, const T& value);
 
 #include <algorithm>
+#include <array>
 #include <cassert>
+#include <vector>
 
 #include "test_macros.h"
 #include "test_iterators.h"
 
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
-    int ia[] = {0, 1, 2, 3, 4};
-
-    std::fill(std::begin(ia), std::end(ia), 5);
+template <class Iter, class Container>
+TEST_CONSTEXPR_CXX20 void
+test(Container in, size_t from, size_t to, typename Container::value_type value, Container expected) {
+  std::fill(Iter(in.data() + from), Iter(in.data() + to), value);
+  assert(in == expected);
+}
 
-    return std::all_of(std::begin(ia), std::end(ia), [](int a) {return a == 5; })
-        ;
+template <class T>
+struct Test {
+  template <class Iter>
+  TEST_CONSTEXPR_CXX20 void operator()() {
+    {
+      std::array<T, 4> in       = {1, 2, 3, 4};
+      std::array<T, 4> expected = {5, 5, 5, 5};
+      test<Iter>(in, 0, 4, 5, expected);
     }
-#endif
-
-template <class Iter>
-void
-test_char()
-{
-    const unsigned n = 4;
-    char ca[n] = {0};
-    std::fill(Iter(ca), Iter(ca+n), char(1));
-    assert(ca[0] == 1);
-    assert(ca[1] == 1);
-    assert(ca[2] == 1);
-    assert(ca[3] == 1);
-}
+    {
+      std::array<T, 4> in       = {1, 2, 3, 4};
+      std::array<T, 4> expected = {1, 5, 5, 4};
+      test<Iter>(in, 1, 3, 5, expected);
+    }
+  }
+};
 
-template <class Iter>
-void
-test_int()
-{
-    const unsigned n = 4;
-    int ia[n] = {0};
-    std::fill(Iter(ia), Iter(ia+n), 1);
-    assert(ia[0] == 1);
-    assert(ia[1] == 1);
-    assert(ia[2] == 1);
-    assert(ia[3] == 1);
+TEST_CONSTEXPR_CXX20 bool test() {
+  types::for_each(types::forward_iterator_list<char*>(), Test<char>());
+  types::for_each(types::forward_iterator_list<int*>(), Test<int>());
+  {   // test vector<bool>::iterator optimization
+    { // simple case
+      std::vector<bool> in(4, false);
+      std::vector<bool> expected(4, true);
+      std::fill(in.begin(), in.end(), true);
+      assert(in == expected);
+    }
+    { // partial byte in the front is not filled
+      std::vector<bool> in(8, false);
+      std::vector<bool> expected(8, true);
+      expected[0] = false;
+      expected[1] = false;
+      std::fill(in.begin() + 2, in.end(), true);
+      assert(in == expected);
+    }
+    { // partial byte in the back is not filled
+      std::vector<bool> in(8, false);
+      std::vector<bool> expected(8, true);
+      expected[6] = false;
+      expected[7] = false;
+      std::fill(in.begin(), in.end() - 2, true);
+      assert(in == expected);
+    }
+    { // partial byte in the front and back is not filled
+      std::vector<bool> in(16, false);
+      std::vector<bool> expected(16, true);
+      expected[0]  = false;
+      expected[1]  = false;
+      expected[14] = false;
+      expected[15] = false;
+      std::fill(in.begin() + 2, in.end() - 2, true);
+      assert(in == expected);
+    }
+    { // only a few bits of a byte are set
+      std::vector<bool> in(8, false);
+      std::vector<bool> expected(8, true);
+      expected[0] = false;
+      expected[1] = false;
+      expected[6] = false;
+      expected[7] = false;
+      std::fill(in.begin() + 2, in.end() - 2, true);
+      assert(in == expected);
+    }
+  }
+  return true;
 }
 
-int main(int, char**)
-{
-    test_char<forward_iterator<char*> >();
-    test_char<bidirectional_iterator<char*> >();
-    test_char<random_access_iterator<char*> >();
-    test_char<char*>();
-
-    test_int<forward_iterator<int*> >();
-    test_int<bidirectional_iterator<int*> >();
-    test_int<random_access_iterator<int*> >();
-    test_int<int*>();
-
-#if TEST_STD_VER > 17
-    static_assert(test_constexpr());
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 20
+  static_assert(test());
 #endif
 
   return 0;
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
index 904100c1cf0bb..7654a4b0c7f00 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
@@ -14,7 +14,7 @@
 //   count(Iter first, Iter last, const T& value);
 
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
-// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
index b17272ea90cdd..b6631add7e48a 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
@@ -11,7 +11,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
-// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
 
 // template<input_iterator I, sentinel_for<I> S, class T, class Proj = identity>
 //   requires indirect_binary_predicate<ranges::equal_to, projected<I, Proj>, const T*>
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp
new file mode 100644
index 0000000000000..8bcce28162033
--- /dev/null
+++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp
@@ -0,0 +1,120 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <fstream>
+
+// basic_streambuf<charT, traits>* setbuf(char_type* s, streamsize n) override;
+
+#include <fstream>
+#include <cstddef>
+#include <cassert>
+
+#include "test_macros.h"
+
+template <class CharT>
+static std::size_t file_size(const char* filename) {
+  FILE* f = std::fopen(filename, "rb");
+  std::fseek(f, 0, SEEK_END);
+  long result = std::ftell(f);
+  std::fclose(f);
+  return result;
+}
+
+// Helper class to expose some protected std::basic_filebuf<CharT> members.
+template <class CharT>
+struct filebuf : public std::basic_filebuf<CharT> {
+  CharT* base() { return this->pbase(); }
+  CharT* ptr() { return this->pptr(); }
+};
+
+template <class CharT>
+static void buffered_request() {
+  filebuf<CharT> buffer;
+
+  CharT b[10] = {0};
+  assert(buffer.pubsetbuf(b, 10) == &buffer);
+
+  buffer.open("test.dat", std::ios_base::out);
+  buffer.sputc(CharT('a'));
+  assert(b[0] == 'a');
+
+  buffer.close();
+  assert(file_size<CharT>("test.dat") == 1);
+}
+
+template <class CharT>
+static void unbuffered_request_before_open() {
+  filebuf<CharT> buffer;
+
+  assert(buffer.pubsetbuf(nullptr, 0) == &buffer);
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  buffer.open("test.dat", std::ios_base::out);
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  buffer.sputc(CharT('a'));
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  assert(file_size<CharT>("test.dat") == 1);
+}
+
+template <class CharT>
+static void unbuffered_request_after_open() {
+  filebuf<CharT> buffer;
+
+  buffer.open("test.dat", std::ios_base::out);
+
+  assert(buffer.pubsetbuf(nullptr, 0) == &buffer);
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  buffer.sputc(CharT('a'));
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  assert(file_size<CharT>("test.dat") == 1);
+}
+
+template <class CharT>
+static void unbuffered_request_after_open_ate() {
+  filebuf<CharT> buffer;
+
+  buffer.open("test.dat", std::ios_base::out | std::ios_base::ate);
+
+  assert(buffer.pubsetbuf(nullptr, 0) == &buffer);
+
+  buffer.sputc(CharT('a'));
+  assert(file_size<CharT>("test.dat") <= 1);
+  // on libc++ buffering is used by default.
+  LIBCPP_ASSERT(file_size<CharT>("test.dat") == 0);
+
+  buffer.close();
+  assert(file_size<CharT>("test.dat") == 1);
+}
+
+template <class CharT>
+static void test() {
+  buffered_request<CharT>();
+
+  unbuffered_request_before_open<CharT>();
+  unbuffered_request_after_open<CharT>();
+  unbuffered_request_after_open_ate<CharT>();
+}
+
+int main(int, char**) {
+  test<char>();
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy.pass.cpp
index 842aece4121fb..719c231d33e75 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy_assign.pass.cpp
index 4be769612dec6..e897676ec2bfe 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move.pass.cpp
index 8b41854fb4bdc..1118ba23ab92f 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move_assign.pass.cpp
index f663f1aa536e0..a71f8ce314792 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
index 1476f29360fbb..a1a37012369d4 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/assign.pass.cpp
index c071d44de70c4..ba14af1240d4b 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/refresh.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/refresh.pass.cpp
index 22f94def8529e..4f5002e068c19 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/refresh.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/refresh.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // The string reported on errors changed, which makes those tests fail when run
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
index 62b6837f46323..03e9de882ab0a 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
index 5d25485479ea6..e1407858f66ac 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // The string reported on errors changed, which makes those tests fail when run
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
index 91d6017f76a9d..303a95a0128bc 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // Starting in Android N (API 24), SELinux policy prevents the shell user from
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
index 34e60c66357b5..514bbb27c2c47 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // The string reported on errors changed, which makes those tests fail when run
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
index 75877bc0a2752..41e2ee668f525 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // The string reported on errors changed, which makes those tests fail when run
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
index d28497395be6c..dd72232ee530a 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
index bcb1f615986e9..24e8069509527 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
index 31dfd8dba0e73..b7100b22dcd42 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
index ff47ad2d0a968..11748f942bc86 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
index 780994f26386a..3e363ae9bceec 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
index d224b44aa0318..a7a4fd4c1df8b 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
index e6b779c7563f8..5dcd91d6c26dc 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
index 943708a8336a9..5a1750dcd1329 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
index 80024b95b0e57..d110912c0a4b1 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
index 61dab710ce640..05346693f4bb6 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
index 3f1e2a3cc9dff..e1452c365a387 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
index 4dd4a39a3d5d8..f0cc8d9a25ffc 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
index e5d2c64731e43..30804c0e54e54 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
index b0f6cbbbbdc73..26e24ad988bcd 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
index 9b3a50fb62c72..243bed0783dba 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
index e2969b7dc63b1..5fc3efee9cd06 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
index 4aa24dea475e6..e4a32e0595c35 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
index 9f8e5da5a9fa6..aa458a655ffc9 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
index e0abd19dcd4e9..f54ee86d5ec0b 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
index d0f3047bad51a..2dc9271fbe3b2 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
index 0098fe8ee698e..279a856511b99 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
index cc801922578cc..3f3effbf26752 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_symlink/copy_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_symlink/copy_symlink.pass.cpp
index 49eb30ebf39ce..7b27213d83b01 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_symlink/copy_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_symlink/copy_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directories/create_directories.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directories/create_directories.pass.cpp
index e0bb40c3943bf..9955e08710c56 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directories/create_directories.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directories/create_directories.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory.pass.cpp
index dbb36e2889522..21f283cae86d5 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory_with_attributes.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory_with_attributes.pass.cpp
index 8e6194f8c9862..a0ec52643e8c0 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory_with_attributes.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory_with_attributes.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory_symlink/create_directory_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory_symlink/create_directory_symlink.pass.cpp
index 7faf90f038b9b..65bb9ee452fc3 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory_symlink/create_directory_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory_symlink/create_directory_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_hard_link/create_hard_link.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_hard_link/create_hard_link.pass.cpp
index 5e801e01f4f2a..bb8fd1c344a72 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_hard_link/create_hard_link.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_hard_link/create_hard_link.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_symlink/create_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_symlink/create_symlink.pass.cpp
index 2cee2469c838f..99d0a3b07ad2b 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_symlink/create_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_symlink/create_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
index 5a48be10637e6..57df465bac18c 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
index 710d1a60b64c8..f9b00a28af212 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
index 2bcdf0fda1b3f..50181cb8d76c8 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
index 397617d489b63..99d303bc4244d 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
index b2fa8e2b57c16..1890696c4bf19 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
index b9d15f521fa11..c16664c9a3f56 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
index f1bb81a22341d..79645b9c2cfde 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
index d493924f97e3f..70fc0122427eb 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
index a05054e226047..647d2e6a255a9 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
index 8a9b0225c133f..76375d7b3b78c 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
index 16bd2912bbeb2..ba3070e7bdc7d 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
index a3e12a7532d15..06a6fb4d22deb 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
index 2c4fe7d1bf6d3..b6f92e47a67a2 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
index 1d70e64bb234a..a71361c847049 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
index 0678b4d767d23..d43bc5eea5121 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.permissions/permissions.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.permissions/permissions.pass.cpp
index f009befa49c37..085fa65e3559f 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.permissions/permissions.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.permissions/permissions.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.read_symlink/read_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.read_symlink/read_symlink.pass.cpp
index ee0ba7284903d..9bb102ab4da5c 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.read_symlink/read_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.read_symlink/read_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
index 9bd4051f79021..7bba86252498e 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove/remove.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove/remove.pass.cpp
index 3685bbf78d506..c8a5d7f6aa10e 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove/remove.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove/remove.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/remove_all.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/remove_all.pass.cpp
index 5cdbd08784316..5513c18f00585 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/remove_all.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/remove_all.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp
index 61a31543dd7ad..972a2a024c212 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.resize_file/resize_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.resize_file/resize_file.pass.cpp
index 8218504115d85..e21e6b9ddc1ff 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.resize_file/resize_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.resize_file/resize_file.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
index 9f7b5aa94b68d..60e9ac778967e 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
index 02a68712dabdb..75eae80194ba8 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
index 5c2078e4b4856..7a4658756f0c3 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
index 035e875e4ec83..053bddcad9b43 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap/iter_swap.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap/iter_swap.pass.cpp
index dd78707f25409..e6507f7e77673 100644
--- a/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap/iter_swap.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap/iter_swap.pass.cpp
@@ -47,6 +47,13 @@ static_assert( std::is_invocable_v<IterSwapT&&, HasIterSwap&, HasIterSwap&>);
 static_assert( std::is_invocable_v<IterSwapT&&, HasIterSwap&, int&>);
 static_assert(!std::is_invocable_v<IterSwapT&&, int&, HasIterSwap&>);
 
+struct StructWithNotMoreSpecializedIterSwap {
+  friend void iter_swap(auto&, auto&);
+};
+
+static_assert(
+    !std::is_invocable_v<IterSwapT, StructWithNotMoreSpecializedIterSwap&, StructWithNotMoreSpecializedIterSwap&>);
+
 struct NodiscardIterSwap {
   [[nodiscard]] friend int iter_swap(NodiscardIterSwap&, NodiscardIterSwap&) { return 0; }
 };
diff --git a/libcxx/test/std/ranges/range.access/begin.pass.cpp b/libcxx/test/std/ranges/range.access/begin.pass.cpp
index ca25fc297a011..5ca3d59abb140 100644
--- a/libcxx/test/std/ranges/range.access/begin.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/begin.pass.cpp
@@ -192,7 +192,7 @@ struct BeginFunction {
 };
 static_assert( std::is_invocable_v<RangeBeginT,  BeginFunction const&>);
 static_assert(!std::is_invocable_v<RangeBeginT,  BeginFunction &&>);
-static_assert(!std::is_invocable_v<RangeBeginT,  BeginFunction &>);
+static_assert(std::is_invocable_v<RangeBeginT, BeginFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeCBeginT, BeginFunction const&>);
 static_assert( std::is_invocable_v<RangeCBeginT, BeginFunction &>);
 
@@ -245,7 +245,7 @@ struct BeginFunctionWithPrivateBeginMember {
 constexpr bool testBeginFunction() {
   BeginFunction a{};
   const BeginFunction aa{};
-  static_assert(!std::invocable<RangeBeginT, decltype((a))>);
+  assert(std::ranges::begin(a) == &a.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cbegin(a) == &a.x);
   assert(std::ranges::begin(aa) == &aa.x);
   assert(std::ranges::cbegin(aa) == &aa.x);
@@ -266,21 +266,21 @@ constexpr bool testBeginFunction() {
 
   BeginFunctionReturnsEmptyPtr d{};
   const BeginFunctionReturnsEmptyPtr dd{};
-  static_assert(!std::invocable<RangeBeginT, decltype((d))>);
+  assert(std::ranges::begin(d) == &d.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cbegin(d) == &d.x);
   assert(std::ranges::begin(dd) == &dd.x);
   assert(std::ranges::cbegin(dd) == &dd.x);
 
   BeginFunctionWithDataMember e{};
   const BeginFunctionWithDataMember ee{};
-  static_assert(!std::invocable<RangeBeginT, decltype((e))>);
+  assert(std::ranges::begin(e) == &e.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::begin(ee) == &ee.x);
   assert(std::ranges::cbegin(e) == &e.x);
   assert(std::ranges::cbegin(ee) == &ee.x);
 
   BeginFunctionWithPrivateBeginMember f{};
   const BeginFunctionWithPrivateBeginMember ff{};
-  static_assert(!std::invocable<RangeBeginT, decltype((f))>);
+  assert(std::ranges::begin(f) == &f.y); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cbegin(f) == &f.y);
   assert(std::ranges::begin(ff) == &ff.y);
   assert(std::ranges::cbegin(ff) == &ff.y);
diff --git a/libcxx/test/std/ranges/range.access/end.pass.cpp b/libcxx/test/std/ranges/range.access/end.pass.cpp
index 21d97354ef08d..3e465b357e985 100644
--- a/libcxx/test/std/ranges/range.access/end.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/end.pass.cpp
@@ -193,7 +193,7 @@ static_assert(!std::is_invocable_v<RangeEndT, EndFunction &&>);
 
 static_assert( std::is_invocable_v<RangeEndT,  EndFunction const&>);
 static_assert(!std::is_invocable_v<RangeEndT,  EndFunction &&>);
-static_assert(!std::is_invocable_v<RangeEndT,  EndFunction &>);
+static_assert(std::is_invocable_v<RangeEndT, EndFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeCEndT, EndFunction const&>);
 static_assert( std::is_invocable_v<RangeCEndT, EndFunction &>);
 
@@ -271,7 +271,7 @@ constexpr bool testEndFunction() {
   assert(std::ranges::end(a) == &a.x);
   assert(std::ranges::cend(a) == &a.x);
   EndFunction aa{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((aa))>);
+  assert(std::ranges::end(aa) == &aa.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(aa) == &aa.x);
 
   EndFunctionByValue b;
@@ -286,28 +286,28 @@ constexpr bool testEndFunction() {
   assert(std::ranges::end(d) == &d.x);
   assert(std::ranges::cend(d) == &d.x);
   EndFunctionReturnsEmptyPtr dd{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((dd))>);
+  assert(std::ranges::end(dd) == &dd.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(dd) == &dd.x);
 
   const EndFunctionWithDataMember e{};
   assert(std::ranges::end(e) == &e.x);
   assert(std::ranges::cend(e) == &e.x);
   EndFunctionWithDataMember ee{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((ee))>);
+  assert(std::ranges::end(ee) == &ee.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(ee) == &ee.x);
 
   const EndFunctionWithPrivateEndMember f{};
   assert(std::ranges::end(f) == &f.y);
   assert(std::ranges::cend(f) == &f.y);
   EndFunctionWithPrivateEndMember ff{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((ff))>);
+  assert(std::ranges::end(ff) == &ff.y); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(ff) == &ff.y);
 
   const BeginMemberEndFunction g{};
   assert(std::ranges::end(g) == &g.x);
   assert(std::ranges::cend(g) == &g.x);
   BeginMemberEndFunction gg{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((gg))>);
+  assert(std::ranges::end(gg) == &gg.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(gg) == &gg.x);
 
   return true;
diff --git a/libcxx/test/std/ranges/range.access/rbegin.pass.cpp b/libcxx/test/std/ranges/range.access/rbegin.pass.cpp
index e1a564c94ed94..3997f38efd029 100644
--- a/libcxx/test/std/ranges/range.access/rbegin.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/rbegin.pass.cpp
@@ -187,7 +187,8 @@ struct RBeginFunction {
 };
 static_assert( std::is_invocable_v<RangeRBeginT,  RBeginFunction const&>);
 static_assert(!std::is_invocable_v<RangeRBeginT,  RBeginFunction &&>);
-static_assert(!std::is_invocable_v<RangeRBeginT,  RBeginFunction &>);
+static_assert(
+    std::is_invocable_v<RangeRBeginT, RBeginFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeCRBeginT, RBeginFunction const&>);
 static_assert( std::is_invocable_v<RangeCRBeginT, RBeginFunction &>);
 
@@ -246,7 +247,7 @@ struct RBeginFunctionWithPrivateBeginMember {
 constexpr bool testRBeginFunction() {
   RBeginFunction a{};
   const RBeginFunction aa{};
-  static_assert(!std::invocable<RangeRBeginT, decltype((a))>);
+  assert(std::ranges::rbegin(a) == &a.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crbegin(a) == &a.x);
   assert(std::ranges::rbegin(aa) == &aa.x);
   assert(std::ranges::crbegin(aa) == &aa.x);
@@ -267,21 +268,21 @@ constexpr bool testRBeginFunction() {
 
   RBeginFunctionReturnsEmptyPtr d{};
   const RBeginFunctionReturnsEmptyPtr dd{};
-  static_assert(!std::invocable<RangeRBeginT, decltype((d))>);
+  assert(std::ranges::rbegin(d) == &d.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crbegin(d) == &d.x);
   assert(std::ranges::rbegin(dd) == &dd.x);
   assert(std::ranges::crbegin(dd) == &dd.x);
 
   RBeginFunctionWithDataMember e{};
   const RBeginFunctionWithDataMember ee{};
-  static_assert(!std::invocable<RangeRBeginT, decltype((e))>);
+  assert(std::ranges::rbegin(e) == &e.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::rbegin(ee) == &ee.x);
   assert(std::ranges::crbegin(e) == &e.x);
   assert(std::ranges::crbegin(ee) == &ee.x);
 
   RBeginFunctionWithPrivateBeginMember f{};
   const RBeginFunctionWithPrivateBeginMember ff{};
-  static_assert(!std::invocable<RangeRBeginT, decltype((f))>);
+  assert(std::ranges::rbegin(f) == &f.y); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crbegin(f) == &f.y);
   assert(std::ranges::rbegin(ff) == &ff.y);
   assert(std::ranges::crbegin(ff) == &ff.y);
diff --git a/libcxx/test/std/ranges/range.access/rend.pass.cpp b/libcxx/test/std/ranges/range.access/rend.pass.cpp
index 5ba244b6b18cf..f5f59edf19393 100644
--- a/libcxx/test/std/ranges/range.access/rend.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/rend.pass.cpp
@@ -196,7 +196,7 @@ static_assert(!std::is_invocable_v<RangeREndT, REndFunction &&>);
 
 static_assert( std::is_invocable_v<RangeREndT,  REndFunction const&>);
 static_assert(!std::is_invocable_v<RangeREndT,  REndFunction &&>);
-static_assert(!std::is_invocable_v<RangeREndT,  REndFunction &>);
+static_assert(std::is_invocable_v<RangeREndT, REndFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeCREndT, REndFunction const&>);
 static_assert( std::is_invocable_v<RangeCREndT, REndFunction &>);
 
@@ -272,7 +272,7 @@ constexpr bool testREndFunction() {
   assert(std::ranges::rend(a) == &a.x);
   assert(std::ranges::crend(a) == &a.x);
   REndFunction aa{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((aa))>);
+  assert(std::ranges::rend(aa) == &aa.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(aa) == &aa.x);
 
   REndFunctionByValue b;
@@ -287,28 +287,28 @@ constexpr bool testREndFunction() {
   assert(std::ranges::rend(d) == &d.x);
   assert(std::ranges::crend(d) == &d.x);
   REndFunctionReturnsEmptyPtr dd{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((dd))>);
+  assert(std::ranges::rend(dd) == &dd.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(dd) == &dd.x);
 
   const REndFunctionWithDataMember e{};
   assert(std::ranges::rend(e) == &e.x);
   assert(std::ranges::crend(e) == &e.x);
   REndFunctionWithDataMember ee{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((ee))>);
+  assert(std::ranges::rend(ee) == &ee.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(ee) == &ee.x);
 
   const REndFunctionWithPrivateEndMember f{};
   assert(std::ranges::rend(f) == &f.y);
   assert(std::ranges::crend(f) == &f.y);
   REndFunctionWithPrivateEndMember ff{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((ff))>);
+  assert(std::ranges::rend(ff) == &ff.y); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(ff) == &ff.y);
 
   const RBeginMemberEndFunction g{};
   assert(std::ranges::rend(g) == &g.x);
   assert(std::ranges::crend(g) == &g.x);
   RBeginMemberEndFunction gg{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((gg))>);
+  assert(std::ranges::rend(gg) == &gg.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(gg) == &gg.x);
 
   return true;
diff --git a/libcxx/test/std/ranges/range.access/size.pass.cpp b/libcxx/test/std/ranges/range.access/size.pass.cpp
index fd7d0a8b99752..ee44aa815ba99 100644
--- a/libcxx/test/std/ranges/range.access/size.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/size.pass.cpp
@@ -219,7 +219,8 @@ inline constexpr bool std::ranges::disable_sized_range<const ImproperlyDisabledF
 
 static_assert( std::is_invocable_v<RangeSizeT, ImproperlyDisabledMember&>);
 static_assert( std::is_invocable_v<RangeSizeT, const ImproperlyDisabledMember&>);
-static_assert(!std::is_invocable_v<RangeSizeT, ImproperlyDisabledFunction&>);
+static_assert(std::is_invocable_v<RangeSizeT,
+                                  ImproperlyDisabledFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeSizeT, const ImproperlyDisabledFunction&>);
 
 // No begin end.
diff --git a/libcxx/test/std/ranges/range.utility/view.interface/view.interface.pass.cpp b/libcxx/test/std/ranges/range.utility/view.interface/view.interface.pass.cpp
index 2fc2fa8579996..3f7c174d3fe48 100644
--- a/libcxx/test/std/ranges/range.utility/view.interface/view.interface.pass.cpp
+++ b/libcxx/test/std/ranges/range.utility/view.interface/view.interface.pass.cpp
@@ -39,6 +39,14 @@ struct InputRange : std::ranges::view_interface<InputRange> {
   constexpr InputIter end() const { return InputIter(buff + 8); }
 };
 
+struct SizedInputRange : std::ranges::view_interface<SizedInputRange> {
+  int buff[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  constexpr InputIter begin() const { return InputIter(buff); }
+  constexpr sentinel_wrapper<InputIter> end() const { return sentinel_wrapper(InputIter(buff + 8)); }
+  constexpr std::size_t size() const { return 8; }
+};
+static_assert(std::ranges::sized_range<SizedInputRange>);
+
 struct NotSizedSentinel {
   using value_type = int;
   using difference_type = std::ptrdiff_t;
@@ -155,11 +163,24 @@ concept BoolOpInvocable = requires (T const& obj) { bool(obj); };
 
 constexpr bool testEmpty() {
   static_assert(!EmptyInvocable<InputRange>);
+  // LWG 3715: `view_interface::empty` is overconstrained
+  static_assert(EmptyInvocable<SizedInputRange>);
   static_assert( EmptyInvocable<ForwardRange>);
 
   static_assert(!BoolOpInvocable<InputRange>);
+  static_assert(BoolOpInvocable<SizedInputRange>);
   static_assert( BoolOpInvocable<ForwardRange>);
 
+  SizedInputRange sizedInputRange;
+  assert(!sizedInputRange.empty());
+  assert(!static_cast<SizedInputRange const&>(sizedInputRange).empty());
+
+  assert(sizedInputRange);
+  assert(static_cast<SizedInputRange const&>(sizedInputRange));
+
+  assert(!std::ranges::empty(sizedInputRange));
+  assert(!std::ranges::empty(static_cast<SizedInputRange const&>(sizedInputRange)));
+
   ForwardRange forwardRange;
   assert(!forwardRange.empty());
   assert(!static_cast<ForwardRange const&>(forwardRange).empty());
diff --git a/libcxx/test/std/ranges/robust_against_poison_pills.compile.pass.cpp b/libcxx/test/std/ranges/robust_against_poison_pills.compile.pass.cpp
new file mode 100644
index 0000000000000..1b3da814d0800
--- /dev/null
+++ b/libcxx/test/std/ranges/robust_against_poison_pills.compile.pass.cpp
@@ -0,0 +1,102 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <compare>, <iterator>, <ranges>
+
+// ADL should be performed. Ordinary unqualified lookup should not be performed.
+
+namespace ns {
+struct StructWithGlobalFunctions {};
+} // namespace ns
+
+struct ConvertibleToCmpType;
+ConvertibleToCmpType strong_order(const ns::StructWithGlobalFunctions&, const ns::StructWithGlobalFunctions&);
+ConvertibleToCmpType weak_order(const ns::StructWithGlobalFunctions&, const ns::StructWithGlobalFunctions&);
+ConvertibleToCmpType partial_order(const ns::StructWithGlobalFunctions&, const ns::StructWithGlobalFunctions&);
+
+int&& iter_move(const ns::StructWithGlobalFunctions&);
+void iter_swap(const ns::StructWithGlobalFunctions&, const ns::StructWithGlobalFunctions&);
+
+int* begin(const ns::StructWithGlobalFunctions&);
+int* end(const ns::StructWithGlobalFunctions&);
+int* rbegin(const ns::StructWithGlobalFunctions&);
+int* rend(const ns::StructWithGlobalFunctions&);
+unsigned int size(const ns::StructWithGlobalFunctions&);
+
+#include <compare>
+#include <ranges>
+#include <type_traits>
+
+struct ConvertibleToCmpType {
+  operator std::strong_ordering() const;
+  operator std::weak_ordering() const;
+  operator std::partial_ordering() const;
+};
+
+struct StructWithHiddenFriends {
+  friend ConvertibleToCmpType strong_order(const StructWithHiddenFriends&, const StructWithHiddenFriends&);
+  friend ConvertibleToCmpType weak_order(const StructWithHiddenFriends&, const StructWithHiddenFriends&);
+  friend ConvertibleToCmpType partial_order(const StructWithHiddenFriends&, const StructWithHiddenFriends&);
+
+  friend int&& iter_move(const StructWithHiddenFriends&);
+  friend void iter_swap(const StructWithHiddenFriends&, const StructWithHiddenFriends&);
+
+  friend int* begin(const StructWithHiddenFriends&);
+  friend int* end(const StructWithHiddenFriends&);
+  friend int* rbegin(const StructWithHiddenFriends&);
+  friend int* rend(const StructWithHiddenFriends&);
+  friend unsigned int size(const StructWithHiddenFriends&);
+};
+
+// [cmp.alg] ADL should be performed.
+static_assert(std::is_invocable_v<decltype(std::strong_order), StructWithHiddenFriends&, StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::weak_order), StructWithHiddenFriends&, StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::partial_order), StructWithHiddenFriends&, StructWithHiddenFriends&>);
+
+// [cmp.alg] Ordinary unqualified lookup should not be performed.
+static_assert(
+    !std::is_invocable_v<decltype(std::strong_order), ns::StructWithGlobalFunctions&, ns::StructWithGlobalFunctions&>);
+static_assert(
+    !std::is_invocable_v<decltype(std::weak_order), ns::StructWithGlobalFunctions&, ns::StructWithGlobalFunctions&>);
+static_assert(
+    !std::is_invocable_v<decltype(std::partial_order), ns::StructWithGlobalFunctions&, ns::StructWithGlobalFunctions&>);
+
+// [iterator.cust] ADL should be performed.
+static_assert(std::is_invocable_v<decltype(std::ranges::iter_move), StructWithHiddenFriends&>);
+static_assert(
+    std::is_invocable_v<decltype(std::ranges::iter_swap), StructWithHiddenFriends&, StructWithHiddenFriends&>);
+
+// [iterator.cust] Ordinary unqualified lookup should not be performed.
+static_assert(!std::is_invocable_v<decltype(std::ranges::iter_move), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::iter_swap),
+                                   ns::StructWithGlobalFunctions&,
+                                   ns::StructWithGlobalFunctions&>);
+
+// [range.access] ADL should be performed.
+static_assert(std::is_invocable_v<decltype(std::ranges::begin), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::cbegin), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::end), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::cend), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::rbegin), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::crbegin), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::rend), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::crend), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::size), StructWithHiddenFriends&>);
+
+// [range.access] Ordinary unqualified lookup should not be performed.
+static_assert(!std::is_invocable_v<decltype(std::ranges::begin), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::cbegin), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::end), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::cend), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::rbegin), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::crbegin), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::rend), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::crend), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::size), ns::StructWithGlobalFunctions&>);
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp
index 335e8d20c2391..b6204c615d965 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // const tzdb& get_tzdb();
@@ -30,8 +27,6 @@ int main(int, const char**) {
 
   assert(!db.version.empty());
 
-  LIBCPP_ASSERT(!db.__rules.empty());
-
   assert(!db.zones.empty());
   assert(std::ranges::is_sorted(db.zones));
   assert(std::ranges::adjacent_find(db.zones) == db.zones.end()); // is unique?
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp
index 34af9b576361f..a5579a3820b6a 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // const tzdb& get_tzdb_list();
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp
index ac5fee8183b9b..12c5310772f6a 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 //
 // class tzdb_list;
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp
index 8bd9b321b2a0f..b00b8b44188d0 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 //
 // class tzdb_list;
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp
index bbf9002c0430c..af38772ee3cb2 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // Note there is no Standard way to change the remote database used.
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp
index 861075cd82aa6..36b68cefc8d31 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // const string remote_version();
diff --git a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp
index 95d86d586666e..c2412bac461bc 100644
--- a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // class time_zone_link;
diff --git a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp
index 305fbd21f625b..2f8b5b9421d63 100644
--- a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // class time_zone_link;
diff --git a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp
index e375d7e443ce4..944818c1ad0c1 100644
--- a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // bool operator==(const time_zone_link& x, const time_zone_link& y) noexcept;
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp
index 2bbe714b71a68..d1ff2fe683108 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // class time_zone;
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp
index 9eae91e80a42d..7c680707bc518 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // bool operator==(const time_zone& x, const time_zone& y) noexcept;
diff --git a/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp b/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
new file mode 100644
index 0000000000000..127f6546ccbe7
--- /dev/null
+++ b/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// XFAIL: availability-fp_to_chars-missing
+
+// The sample code is based on the bug report
+// https://github.com/llvm/llvm-project/issues/81590
+//
+// Tests whether this formatter does not fail to compile due to nested concept
+// evaluation.
+
+#include <format>
+#include <variant>
+
+struct X : std::variant<X*> {
+  X* p = nullptr;
+  constexpr const std::variant<X*>& decay() const noexcept { return *this; }
+};
+
+template <>
+struct std::formatter<X, char> : std::formatter<std::string, char> {
+  static constexpr auto format(const X& x, auto ctx) {
+    if (!x.p)
+      return ctx.out();
+    auto m = [&](const X* t) { return std::format_to(ctx.out(), "{}", *t); };
+    return std::visit(m, x.decay());
+  }
+};
+
+void bug_81590() { (void)std::format("{}", X{}); }
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple_size_v.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple_size_v.verify.cpp
index 7570230b4b074..149890499678d 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple_size_v.verify.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.helper/tuple_size_v.verify.cpp
@@ -17,8 +17,8 @@
 #include <tuple>
 
 void f() {
-    (void)std::tuple_size_v<std::tuple<> &>; // expected-note {{requested here}}
-    (void)std::tuple_size_v<int>; // expected-note {{requested here}}
-    (void)std::tuple_size_v<std::tuple<>*>; // expected-note {{requested here}}
-    // expected-error@tuple:* 3 {{implicit instantiation of undefined template}}
+  (void)std::tuple_size_v<std::tuple<>&>; // expected-note {{requested here}}
+  (void)std::tuple_size_v<int>;           // expected-note {{requested here}}
+  (void)std::tuple_size_v<std::tuple<>*>; // expected-note {{requested here}}
+                                          // expected-error@*:* 3 {{implicit instantiation of undefined template}}
 }
diff --git a/libcxx/utils/ci/docker-compose.yml b/libcxx/utils/ci/docker-compose.yml
index 5cefcb28bd860..af9a48481e8b6 100644
--- a/libcxx/utils/ci/docker-compose.yml
+++ b/libcxx/utils/ci/docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.7'
 
 x-versions: &compiler_versions
   GCC_LATEST_VERSION: 13
-  LLVM_HEAD_VERSION: 18
+  LLVM_HEAD_VERSION: 19
 
 services:
   buildkite-builder:
diff --git a/libcxx/utils/ci/run-buildbot-container b/libcxx/utils/ci/run-buildbot-container
index 7c00b88097f17..74e7dab81d734 100755
--- a/libcxx/utils/ci/run-buildbot-container
+++ b/libcxx/utils/ci/run-buildbot-container
@@ -26,6 +26,6 @@ if [[ ! -d "${MONOREPO_ROOT}/libcxx/utils/ci" ]]; then
     echo "Was unable to find the root of the LLVM monorepo; are you running from within the monorepo?"
     exit 1
 fi
-docker pull ghcr.io/libcxx/libcxx-builder
-docker run -it --volume "${MONOREPO_ROOT}:/llvm" --workdir "/llvm" --cap-add=SYS_PTRACE ghcr.io/libcxx/libcxx-builder \
+docker pull ghcr.io/libcxx/actions-builder
+docker run -it --volume "${MONOREPO_ROOT}:/llvm" --workdir "/llvm" --cap-add=SYS_PTRACE ghcr.io/libcxx/actions-builder \
     bash -c 'git config --global --add safe.directory /llvm ; exec bash'
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 872bff372b3db..307d35349f3a4 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -421,6 +421,56 @@ def _getAndroidDeviceApi(cfg):
           """,
         ),
     ),
+    Feature(
+        name="can-create-symlinks",
+        when=lambda cfg: "_WIN32" not in compilerMacros(cfg)
+        or programSucceeds(
+            cfg,
+            # Creation of symlinks require elevated privileges on Windows unless
+            # Windows developer mode is enabled.
+            """
+            #include <stdio.h>
+            #include <windows.h>
+            int main() {
+              CHAR tempDirPath[MAX_PATH];
+              DWORD tempPathRet = GetTempPathA(MAX_PATH, tempDirPath);
+              if (tempPathRet == 0 || tempPathRet > MAX_PATH) {
+                return 1;
+              }
+
+              CHAR tempFilePath[MAX_PATH];
+              UINT uRetVal = GetTempFileNameA(
+                tempDirPath,
+                "cxx", // Prefix
+                0, // Unique=0 also implies file creation.
+                tempFilePath);
+              if (uRetVal == 0) {
+                return 1;
+              }
+
+              CHAR symlinkFilePath[MAX_PATH];
+              int ret = sprintf_s(symlinkFilePath, MAX_PATH, "%s_symlink", tempFilePath);
+              if (ret == -1) {
+                DeleteFileA(tempFilePath);
+                return 1;
+              }
+
+              // Requires either administrator, or developer mode enabled.
+              BOOL bCreatedSymlink = CreateSymbolicLinkA(symlinkFilePath,
+                tempFilePath,
+                SYMBOLIC_LINK_FLAG_ALLOW_UNPRIVILEGED_CREATE);
+              if (!bCreatedSymlink) {
+                DeleteFileA(tempFilePath);
+                return 1;
+              }
+
+              DeleteFileA(tempFilePath);
+              DeleteFileA(symlinkFilePath);
+              return 0;
+            }
+            """,
+        ),
+    ),
 ]
 
 # Add features representing the build host platform name.
diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt
index 0af4dc1448e91..c8cc93de50777 100644
--- a/libcxxabi/src/CMakeLists.txt
+++ b/libcxxabi/src/CMakeLists.txt
@@ -268,7 +268,10 @@ if(LIBCXXABI_HERMETIC_STATIC_LIBRARY)
   # then its code shouldn't declare them with hidden visibility.  They might
   # actually be provided by a shared library at link time.
   if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS)
-    target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete-hidden)
+    target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete=force-hidden)
+    if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG)
+      target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete-hidden)
+    endif()
   endif()
   # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in libcxx's
   # __config_site too. Define it in the same way here, to avoid redefinition
diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt
index 9c6f5d908b094..780430ba70ba6 100644
--- a/libunwind/src/CMakeLists.txt
+++ b/libunwind/src/CMakeLists.txt
@@ -201,7 +201,10 @@ set_target_properties(unwind_static_objects
 
 if(LIBUNWIND_HIDE_SYMBOLS)
   target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility=hidden)
-  target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete-hidden)
+  target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete=force-hidden)
+  if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG)
+    target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete-hidden)
+  endif()
   target_compile_definitions(unwind_static_objects PRIVATE _LIBUNWIND_HIDE_SYMBOLS)
 endif()
 
diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp
index 657332deebfde..3188772f7c490 100644
--- a/lld/ELF/Arch/PPC64.cpp
+++ b/lld/ELF/Arch/PPC64.cpp
@@ -286,7 +286,7 @@ static void writeSequence(MutableArrayRef<uint32_t> buf, const char *prefix,
   // The full section content has the extent of [begin, end). We drop unused
   // instructions and write [first,end).
   auto *sec = make<InputSection>(
-      nullptr, SHF_ALLOC, SHT_PROGBITS, 4,
+      ctx.internalFile, SHF_ALLOC, SHT_PROGBITS, 4,
       ArrayRef(reinterpret_cast<uint8_t *>(buf.data() + first),
                4 * (buf.size() - first)),
       ".text");
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 9ae01eb90fa4a..bb3608da80b21 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -126,7 +126,7 @@ class LinkerDriver {
 private:
   void createFiles(llvm::opt::InputArgList &args);
   void inferMachineType();
-  void link(llvm::opt::InputArgList &args);
+  template <class ELFT> void link(llvm::opt::InputArgList &args);
   template <class ELFT> void compileBitcodeFiles(bool skipLinkedOutput);
   bool tryAddFatLTOFile(MemoryBufferRef mb, StringRef archiveName,
                         uint64_t offsetInArchive, bool lazy);
@@ -273,6 +273,7 @@ struct Config {
   bool printGcSections;
   bool printIcfSections;
   bool printMemoryUsage;
+  bool rejectMismatch;
   bool relax;
   bool relaxGP;
   bool relocatable;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 2439d141fb664..7257ebd0fac99 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -650,7 +650,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     if (errorCount())
       return;
 
-    link(args);
+    invokeELFT(link, args);
   }
 
   if (config->timeTraceEnabled) {
@@ -1348,6 +1348,7 @@ static void readConfigs(opt::InputArgList &args) {
   config->printArchiveStats = args.getLastArgValue(OPT_print_archive_stats);
   config->printSymbolOrder =
       args.getLastArgValue(OPT_print_symbol_order);
+  config->rejectMismatch = !args.hasArg(OPT_no_warn_mismatch);
   config->relax = args.hasFlag(OPT_relax, OPT_no_relax, true);
   config->relaxGP = args.hasFlag(OPT_relax_gp, OPT_no_relax_gp, false);
   config->rpath = getRpath(args);
@@ -1799,6 +1800,37 @@ static void setConfigs(opt::InputArgList &args) {
       args.hasFlag(OPT_toc_optimize, OPT_no_toc_optimize, m == EM_PPC64);
   config->pcRelOptimize =
       args.hasFlag(OPT_pcrel_optimize, OPT_no_pcrel_optimize, m == EM_PPC64);
+
+  if (!args.hasArg(OPT_hash_style)) {
+    if (config->emachine == EM_MIPS)
+      config->sysvHash = true;
+    else
+      config->sysvHash = config->gnuHash = true;
+  }
+
+  // Set default entry point and output file if not specified by command line or
+  // linker scripts.
+  config->warnMissingEntry =
+      (!config->entry.empty() || (!config->shared && !config->relocatable));
+  if (config->entry.empty() && !config->relocatable)
+    config->entry = config->emachine == EM_MIPS ? "__start" : "_start";
+  if (config->outputFile.empty())
+    config->outputFile = "a.out";
+
+  // Fail early if the output file or map file is not writable. If a user has a
+  // long link, e.g. due to a large LTO link, they do not wish to run it and
+  // find that it failed because there was a mistake in their command-line.
+  {
+    llvm::TimeTraceScope timeScope("Create output files");
+    if (auto e = tryCreateFile(config->outputFile))
+      error("cannot open output file " + config->outputFile + ": " +
+            e.message());
+    if (auto e = tryCreateFile(config->mapFile))
+      error("cannot open map file " + config->mapFile + ": " + e.message());
+    if (auto e = tryCreateFile(config->whyExtract))
+      error("cannot open --why-extract= file " + config->whyExtract + ": " +
+            e.message());
+  }
 }
 
 static bool isFormatBinary(StringRef s) {
@@ -2679,45 +2711,8 @@ static void postParseObjectFile(ELFFileBase *file) {
 
 // Do actual linking. Note that when this function is called,
 // all linker scripts have already been parsed.
-void LinkerDriver::link(opt::InputArgList &args) {
+template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
   llvm::TimeTraceScope timeScope("Link", StringRef("LinkerDriver::Link"));
-  // If a --hash-style option was not given, set to a default value,
-  // which varies depending on the target.
-  if (!args.hasArg(OPT_hash_style)) {
-    if (config->emachine == EM_MIPS)
-      config->sysvHash = true;
-    else
-      config->sysvHash = config->gnuHash = true;
-  }
-
-  // Default output filename is "a.out" by the Unix tradition.
-  if (config->outputFile.empty())
-    config->outputFile = "a.out";
-
-  // Fail early if the output file or map file is not writable. If a user has a
-  // long link, e.g. due to a large LTO link, they do not wish to run it and
-  // find that it failed because there was a mistake in their command-line.
-  {
-    llvm::TimeTraceScope timeScope("Create output files");
-    if (auto e = tryCreateFile(config->outputFile))
-      error("cannot open output file " + config->outputFile + ": " +
-            e.message());
-    if (auto e = tryCreateFile(config->mapFile))
-      error("cannot open map file " + config->mapFile + ": " + e.message());
-    if (auto e = tryCreateFile(config->whyExtract))
-      error("cannot open --why-extract= file " + config->whyExtract + ": " +
-            e.message());
-  }
-  if (errorCount())
-    return;
-
-  // Use default entry point name if no name was given via the command
-  // line nor linker scripts. For some reason, MIPS entry point name is
-  // different from others.
-  config->warnMissingEntry =
-      (!config->entry.empty() || (!config->shared && !config->relocatable));
-  if (config->entry.empty() && !config->relocatable)
-    config->entry = (config->emachine == EM_MIPS) ? "__start" : "_start";
 
   // Handle --trace-symbol.
   for (auto *arg : args.filtered(OPT_trace_symbol))
@@ -2738,7 +2733,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
     llvm::TimeTraceScope timeScope("Parse input files");
     for (size_t i = 0; i < files.size(); ++i) {
       llvm::TimeTraceScope timeScope("Parse input files", files[i]->getName());
-      parseFile(files[i]);
+      doParseFile<ELFT>(files[i]);
     }
     if (armCmseImpLib)
       parseArmCMSEImportLib(*armCmseImpLib);
@@ -2872,7 +2867,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
 
   // Handle --lto-validate-all-vtables-have-type-infos.
   if (config->ltoValidateAllVtablesHaveTypeInfos)
-    invokeELFT(ltoValidateAllVtablesHaveTypeInfos, args);
+    ltoValidateAllVtablesHaveTypeInfos<ELFT>(args);
 
   // Do link-time optimization if given files are LLVM bitcode files.
   // This compiles bitcode files into real object files.
@@ -2880,7 +2875,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
   // With this the symbol table should be complete. After this, no new names
   // except a few linker-synthesized ones will be added to the symbol table.
   const size_t numObjsBeforeLTO = ctx.objectFiles.size();
-  invokeELFT(compileBitcodeFiles, skipLinkedOutput);
+  compileBitcodeFiles<ELFT>(skipLinkedOutput);
 
   // Symbol resolution finished. Report backward reference problems,
   // --print-archive-stats=, and --why-extract=.
@@ -2945,7 +2940,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
       llvm::erase_if(ctx.inputSections, [](InputSectionBase *s) {
         if (s->type != SHT_LLVM_SYMPART)
           return false;
-        invokeELFT(readSymbolPartitionSection, s);
+        readSymbolPartitionSection<ELFT>(s);
         return true;
       });
     }
@@ -3003,10 +2998,10 @@ void LinkerDriver::link(opt::InputArgList &args) {
     ctx.inputSections.push_back(createCommentSection());
 
   // Split SHF_MERGE and .eh_frame sections into pieces in preparation for garbage collection.
-  invokeELFT(splitSections,);
+  splitSections<ELFT>();
 
   // Garbage collection and removal of shared symbols from unused shared objects.
-  invokeELFT(markLive,);
+  markLive<ELFT>();
 
   // Make copies of any input sections that need to be copied into each
   // partition.
@@ -3019,7 +3014,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
 
   // Create synthesized sections such as .got and .plt. This is called before
   // processSectionCommands() so that they can be placed by SECTIONS commands.
-  invokeELFT(createSyntheticSections,);
+  createSyntheticSections<ELFT>();
 
   // Some input sections that are used for exception handling need to be moved
   // into synthetic sections. Do that now so that they aren't assigned to
@@ -3059,8 +3054,8 @@ void LinkerDriver::link(opt::InputArgList &args) {
   // Two input sections with different output sections should not be folded.
   // ICF runs after processSectionCommands() so that we know the output sections.
   if (config->icf != ICFLevel::None) {
-    invokeELFT(findKeepUniqueSections, args);
-    invokeELFT(doIcf,);
+    findKeepUniqueSections<ELFT>(args);
+    doIcf<ELFT>();
   }
 
   // Read the callgraph now that we know what was gced or icfed
@@ -3068,9 +3063,9 @@ void LinkerDriver::link(opt::InputArgList &args) {
     if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file))
       if (std::optional<MemoryBufferRef> buffer = readFile(arg->getValue()))
         readCallGraph(*buffer);
-    invokeELFT(readCallGraphsFromObjectFiles,);
+    readCallGraphsFromObjectFiles<ELFT>();
   }
 
   // Write the result to the file.
-  invokeELFT(writeResult,);
+  writeResult<ELFT>();
 }
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 00aebb47640e8..4a6e691938cf4 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -288,7 +288,7 @@ static bool isCompatible(InputFile *file) {
   return false;
 }
 
-template <class ELFT> static void doParseFile(InputFile *file) {
+template <class ELFT> void elf::doParseFile(InputFile *file) {
   if (!isCompatible(file))
     return;
 
@@ -741,6 +741,15 @@ template <class ELFT> void ObjFile<ELFT>::initializeJustSymbols() {
   sections.resize(numELFShdrs);
 }
 
+static bool isKnownSpecificSectionType(uint32_t t, uint32_t flags) {
+  if (SHT_LOUSER <= t && t <= SHT_HIUSER && !(flags & SHF_ALLOC))
+    return true;
+  if (SHT_LOOS <= t && t <= SHT_HIOS && !(flags & SHF_OS_NONCONFORMING))
+    return true;
+  // Allow all processor-specific types. This is different from GNU ld.
+  return SHT_LOPROC <= t && t <= SHT_HIPROC;
+}
+
 template <class ELFT>
 void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
                                        const llvm::object::ELFFile<ELFT> &obj) {
@@ -752,14 +761,15 @@ void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
     if (this->sections[i] == &InputSection::discarded)
       continue;
     const Elf_Shdr &sec = objSections[i];
+    const uint32_t type = sec.sh_type;
 
     // SHF_EXCLUDE'ed sections are discarded by the linker. However,
     // if -r is given, we'll let the final link discard such sections.
     // This is compatible with GNU.
     if ((sec.sh_flags & SHF_EXCLUDE) && !config->relocatable) {
-      if (sec.sh_type == SHT_LLVM_CALL_GRAPH_PROFILE)
+      if (type == SHT_LLVM_CALL_GRAPH_PROFILE)
         cgProfileSectionIndex = i;
-      if (sec.sh_type == SHT_LLVM_ADDRSIG) {
+      if (type == SHT_LLVM_ADDRSIG) {
         // We ignore the address-significance table if we know that the object
         // file was created by objcopy or ld -r. This is because these tools
         // will reorder the symbols in the symbol table, invalidating the data
@@ -778,7 +788,7 @@ void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
       continue;
     }
 
-    switch (sec.sh_type) {
+    switch (type) {
     case SHT_GROUP: {
       if (!config->relocatable)
         sections[i] = &InputSection::discarded;
@@ -801,12 +811,25 @@ void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
     case SHT_RELA:
     case SHT_NULL:
       break;
-    case SHT_LLVM_SYMPART:
-      ctx.hasSympart.store(true, std::memory_order_relaxed);
-      [[fallthrough]];
+    case SHT_PROGBITS:
+    case SHT_NOTE:
+    case SHT_NOBITS:
+    case SHT_INIT_ARRAY:
+    case SHT_FINI_ARRAY:
+    case SHT_PREINIT_ARRAY:
+      this->sections[i] =
+          createInputSection(i, sec, check(obj.getSectionName(sec, shstrtab)));
+      break;
     default:
       this->sections[i] =
           createInputSection(i, sec, check(obj.getSectionName(sec, shstrtab)));
+      if (type == SHT_LLVM_SYMPART)
+        ctx.hasSympart.store(true, std::memory_order_relaxed);
+      else if (config->rejectMismatch &&
+               !isKnownSpecificSectionType(type, sec.sh_flags))
+        errorOrWarn(toString(this->sections[i]) + ": unknown section type 0x" +
+                    Twine::utohexstr(type));
+      break;
     }
   }
 
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
index 54de842a81cf3..3beb5a3cb9a89 100644
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -46,6 +46,7 @@ extern std::unique_ptr<llvm::TarWriter> tar;
 std::optional<MemoryBufferRef> readFile(StringRef path);
 
 // Add symbols in File to the symbol table.
+template <class ELFT> void doParseFile(InputFile *file);
 void parseFile(InputFile *file);
 
 void parseArmCMSEImportLib(InputFile *file);
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 7508a1336c91a..082e840adde4a 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -316,7 +316,9 @@ InputSection::InputSection(InputFile *f, uint64_t flags, uint32_t type,
                            StringRef name, Kind k)
     : InputSectionBase(f, flags, type,
                        /*Entsize*/ 0, /*Link*/ 0, /*Info*/ 0, addralign, data,
-                       name, k) {}
+                       name, k) {
+  assert(f || this == &InputSection::discarded);
+}
 
 template <class ELFT>
 InputSection::InputSection(ObjFile<ELFT> &f, const typename ELFT::Shdr &header,
@@ -346,7 +348,7 @@ template <class ELFT> void InputSection::copyShtGroup(uint8_t *buf) {
 }
 
 InputSectionBase *InputSection::getRelocatedSection() const {
-  if (!file || file->isInternal() || (type != SHT_RELA && type != SHT_REL))
+  if (file->isInternal() || (type != SHT_RELA && type != SHT_REL))
     return nullptr;
   ArrayRef<InputSectionBase *> sections = file->getSections();
   return sections[info];
@@ -408,7 +410,7 @@ void InputSection::copyRelocations(uint8_t *buf,
     // Output section VA is zero for -r, so r_offset is an offset within the
     // section, but for --emit-relocs it is a virtual address.
     p->r_offset = sec->getVA(rel.offset);
-    p->setSymbolAndType(in.symTab->getSymbolIndex(&sym), type,
+    p->setSymbolAndType(in.symTab->getSymbolIndex(sym), type,
                         config->isMips64EL);
 
     if (sym.type == STT_SECTION) {
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index 3819b86238ea6..c5e95d0d25c1a 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -312,6 +312,9 @@ def no_dynamic_linker: F<"no-dynamic-linker">,
 def noinhibit_exec: F<"noinhibit-exec">,
   HelpText<"Retain the executable output file whenever it is still usable">;
 
+def no_warn_mismatch: F<"no-warn-mismatch">,
+  HelpText<"Suppress errors for certain unknown seciton types">;
+
 def no_nmagic: F<"no-nmagic">, MetaVarName<"<magic>">,
   HelpText<"Page align sections (default)">;
 
@@ -753,7 +756,6 @@ def: FF<"no-add-needed">;
 def: F<"no-copy-dt-needed-entries">;
 def: F<"no-ctors-in-init-array">;
 def: F<"no-keep-memory">;
-def: F<"no-warn-mismatch">;
 def: Separate<["--", "-"], "rpath-link">;
 def: J<"rpath-link=">;
 def: F<"secure-plt">;
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 55e6a14f103e0..f986aa5f67570 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -583,7 +583,7 @@ static void finalizeShtGroup(OutputSection *os, InputSection *section) {
   // sh_info then contain index of an entry in symbol table section which
   // provides signature of the section group.
   ArrayRef<Symbol *> symbols = section->file->getSymbols();
-  os->info = in.symTab->getSymbolIndex(symbols[section->info]);
+  os->info = in.symTab->getSymbolIndex(*symbols[section->info]);
 
   // Some group members may be combined or discarded, so we need to compute the
   // new size. The content will be rewritten in InputSection::copyShtGroup.
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 206fb0f537666..248ff6b4a865a 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -1600,7 +1600,7 @@ uint32_t DynamicReloc::getSymIndex(SymbolTableBaseSection *symTab) const {
   if (!needsDynSymIndex())
     return 0;
 
-  size_t index = symTab->getSymbolIndex(sym);
+  size_t index = symTab->getSymbolIndex(*sym);
   assert((index != 0 || (type != target->gotRel && type != target->pltRel) ||
           !mainPart->dynSymTab->getParent()) &&
          "GOT or PLT relocation must refer to symbol in dynamic symbol table");
@@ -2172,9 +2172,9 @@ void SymbolTableBaseSection::addSymbol(Symbol *b) {
   symbols.push_back({b, strTabSec.addString(b->getName(), false)});
 }
 
-size_t SymbolTableBaseSection::getSymbolIndex(Symbol *sym) {
+size_t SymbolTableBaseSection::getSymbolIndex(const Symbol &sym) {
   if (this == mainPart->dynSymTab.get())
-    return sym->dynsymIndex;
+    return sym.dynsymIndex;
 
   // Initializes symbol lookup tables lazily. This is used only for -r,
   // --emit-relocs and dynsyms in partitions other than the main one.
@@ -2191,9 +2191,9 @@ size_t SymbolTableBaseSection::getSymbolIndex(Symbol *sym) {
 
   // Section symbols are mapped based on their output sections
   // to maintain their semantics.
-  if (sym->type == STT_SECTION)
-    return sectionIndexMap.lookup(sym->getOutputSection());
-  return symbolIndexMap.lookup(sym);
+  if (sym.type == STT_SECTION)
+    return sectionIndexMap.lookup(sym.getOutputSection());
+  return symbolIndexMap.lookup(&sym);
 }
 
 template <class ELFT>
@@ -2427,7 +2427,7 @@ void GnuHashTableSection::writeTo(uint8_t *buf) {
     // Write a hash bucket. Hash buckets contain indices in the following hash
     // value table.
     write32(buckets + i->bucketIdx,
-            getPartition().dynSymTab->getSymbolIndex(i->sym));
+            getPartition().dynSymTab->getSymbolIndex(*i->sym));
     oldBucket = i->bucketIdx;
   }
 }
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index 7882ad87c241d..b41e694100541 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -641,7 +641,7 @@ class SymbolTableBaseSection : public SyntheticSection {
   size_t getSize() const override { return getNumSymbols() * entsize; }
   void addSymbol(Symbol *sym);
   unsigned getNumSymbols() const { return symbols.size() + 1; }
-  size_t getSymbolIndex(Symbol *sym);
+  size_t getSymbolIndex(const Symbol &sym);
   ArrayRef<SymbolTableEntry> getSymbols() const { return symbols; }
 
 protected:
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index a9292b3b1a224..d8782affe879b 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -261,6 +261,9 @@ static void demoteDefined(Defined &sym, DenseMap<SectionBase *, size_t> &map) {
   Undefined(sym.file, sym.getName(), binding, sym.stOther, sym.type,
             /*discardedSecIdx=*/map.lookup(sym.section))
       .overwrite(sym);
+  // Eliminate from the symbol table, otherwise we would leave an undefined
+  // symbol if the symbol is unreferenced in the absence of GC.
+  sym.isUsedInRegularObj = false;
 }
 
 // If all references to a DSO happen to be weak, the DSO is not added to
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 9edb6b9c60a1f..36248925d65ad 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1437,6 +1437,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     resetOutputSegments();
     resetWriter();
     InputFile::resetIdCount();
+
+    objc::doCleanup();
   };
 
   ctx->e.logName = args::getFilenameWithoutExe(argsArr[0]);
@@ -1979,9 +1981,16 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     if (config->deadStrip)
       markLive();
 
+    // Categories are not subject to dead-strip. The __objc_catlist section is
+    // marked as NO_DEAD_STRIP and that propagates into all category data.
     if (args.hasArg(OPT_check_category_conflicts))
       objc::checkCategories();
 
+    // Category merging uses "->live = false" to erase old category data, so
+    // it has to run after dead-stripping (markLive).
+    if (args.hasArg(OPT_objc_category_merging, OPT_no_objc_category_merging))
+      objc::mergeCategories();
+
     // ICF assumes that all literals have been folded already, so we must run
     // foldIdenticalLiterals before foldIdenticalSections.
     foldIdenticalLiterals();
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index becb01017d633..b25f0638f4c6c 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -93,9 +93,9 @@ class InputSection {
   // .subsections_via_symbols, there is typically only one element here.
   llvm::TinyPtrVector<Defined *> symbols;
 
-protected:
   const Section &section;
 
+protected:
   const Defined *getContainingSymbol(uint64_t off) const;
 };
 
diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp
index 67254ec53a214..40df2243b26f0 100644
--- a/lld/MachO/ObjC.cpp
+++ b/lld/MachO/ObjC.cpp
@@ -7,16 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "ObjC.h"
+#include "ConcatOutputSection.h"
 #include "InputFiles.h"
 #include "InputSection.h"
 #include "Layout.h"
 #include "OutputSegment.h"
+#include "SyntheticSections.h"
 #include "Target.h"
 
 #include "lld/Common/ErrorHandler.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Support/TimeProfiler.h"
 
 using namespace llvm;
 using namespace llvm::MachO;
@@ -78,7 +81,8 @@ namespace {
   DO(Ptr, classMethods)                                                        \
   DO(Ptr, protocols)                                                           \
   DO(Ptr, instanceProps)                                                       \
-  DO(Ptr, classProps)
+  DO(Ptr, classProps)                                                          \
+  DO(uint32_t, size)
 
 CREATE_LAYOUT_CLASS(Category, FOR_EACH_CATEGORY_FIELD);
 
@@ -112,13 +116,19 @@ CREATE_LAYOUT_CLASS(ROClass, FOR_EACH_RO_CLASS_FIELD);
 #undef FOR_EACH_RO_CLASS_FIELD
 
 #define FOR_EACH_LIST_HEADER(DO)                                               \
-  DO(uint32_t, size)                                                           \
-  DO(uint32_t, count)
+  DO(uint32_t, structSize)                                                     \
+  DO(uint32_t, structCount)
 
 CREATE_LAYOUT_CLASS(ListHeader, FOR_EACH_LIST_HEADER);
 
 #undef FOR_EACH_LIST_HEADER
 
+#define FOR_EACH_PROTOCOL_LIST_HEADER(DO) DO(Ptr, protocolCount)
+
+CREATE_LAYOUT_CLASS(ProtocolListHeader, FOR_EACH_PROTOCOL_LIST_HEADER);
+
+#undef FOR_EACH_PROTOCOL_LIST_HEADER
+
 #define FOR_EACH_METHOD(DO)                                                    \
   DO(Ptr, name)                                                                \
   DO(Ptr, type)                                                                \
@@ -311,6 +321,8 @@ void ObjcCategoryChecker::parseClass(const Defined *classSym) {
 }
 
 void objc::checkCategories() {
+  TimeTraceScope timeScope("ObjcCategoryChecker");
+
   ObjcCategoryChecker checker;
   for (const InputSection *isec : inputSections) {
     if (isec->getName() == section_names::objcCatList)
@@ -320,3 +332,914 @@ void objc::checkCategories() {
       }
   }
 }
+
+namespace {
+
+class ObjcCategoryMerger {
+  // Information about an input category
+  struct InfoInputCategory {
+    ConcatInputSection *catListIsec;
+    ConcatInputSection *catBodyIsec;
+    uint32_t offCatListIsec = 0;
+
+    bool wasMerged = false;
+  };
+
+  // To write new (merged) categories or classes, we will try make limited
+  // assumptions about the alignment and the sections the various class/category
+  // info are stored in and . So we'll just reuse the same sections and
+  // alignment as already used in existing (input) categories. To do this we
+  // have InfoCategoryWriter which contains the various sections that the
+  // generated categories will be written to.
+  template <typename T> struct InfoWriteSection {
+    bool valid = false; // Data has been successfully collected from input
+    uint32_t align = 0;
+    Section *inputSection;
+    Reloc relocTemplate;
+    T *outputSection;
+  };
+
+  struct InfoCategoryWriter {
+    InfoWriteSection<ConcatOutputSection> catListInfo;
+    InfoWriteSection<ConcatOutputSection> catBodyInfo;
+    InfoWriteSection<CStringSection> catNameInfo;
+    InfoWriteSection<ConcatOutputSection> catPtrListInfo;
+  };
+
+  // Information about a pointer list in the original categories (method lists,
+  // protocol lists, etc)
+  struct PointerListInfo {
+    PointerListInfo(const char *_categoryPrefix, uint32_t _categoryOffset,
+                    uint32_t _pointersPerStruct)
+        : categoryPrefix(_categoryPrefix), categoryOffset(_categoryOffset),
+          pointersPerStruct(_pointersPerStruct) {}
+    const char *categoryPrefix;
+    uint32_t categoryOffset = 0;
+
+    uint32_t pointersPerStruct = 0;
+
+    uint32_t structSize = 0;
+    uint32_t structCount = 0;
+
+    std::vector<Symbol *> allPtrs;
+  };
+
+  // Full information about all the categories that extend a class. This will
+  // include all the additional methods, protocols, and properties that are
+  // contained in all the categories that extend a particular class.
+  struct ClassExtensionInfo {
+    ClassExtensionInfo(CategoryLayout &_catLayout) : catLayout(_catLayout){};
+
+    // Merged names of containers. Ex: base|firstCategory|secondCategory|...
+    std::string mergedContainerName;
+    std::string baseClassName;
+    Symbol *baseClass = nullptr;
+    CategoryLayout &catLayout;
+
+    // In case we generate new data, mark the new data as belonging to this file
+    ObjFile *objFileForMergeData = nullptr;
+
+    PointerListInfo instanceMethods = {
+        objc::symbol_names::categoryInstanceMethods,
+        /*_categoryOffset=*/catLayout.instanceMethodsOffset,
+        /*pointersPerStruct=*/3};
+    PointerListInfo classMethods = {
+        objc::symbol_names::categoryClassMethods,
+        /*_categoryOffset=*/catLayout.classMethodsOffset,
+        /*pointersPerStruct=*/3};
+    PointerListInfo protocols = {objc::symbol_names::categoryProtocols,
+                                 /*_categoryOffset=*/catLayout.protocolsOffset,
+                                 /*pointersPerStruct=*/0};
+    PointerListInfo instanceProps = {
+        objc::symbol_names::listProprieties,
+        /*_categoryOffset=*/catLayout.instancePropsOffset,
+        /*pointersPerStruct=*/2};
+    PointerListInfo classProps = {
+        objc::symbol_names::klassPropList,
+        /*_categoryOffset=*/catLayout.classPropsOffset,
+        /*pointersPerStruct=*/2};
+  };
+
+public:
+  ObjcCategoryMerger(std::vector<ConcatInputSection *> &_allInputSections);
+  void doMerge();
+  static void doCleanup();
+
+private:
+  void collectAndValidateCategoriesData();
+  void
+  mergeCategoriesIntoSingleCategory(std::vector<InfoInputCategory> &categories);
+
+  void eraseISec(ConcatInputSection *isec);
+  void eraseMergedCategories();
+
+  void generateCatListForNonErasedCategories(
+      std::map<ConcatInputSection *, std::set<uint64_t>>
+          catListToErasedOffsets);
+  template <typename T>
+  void collectSectionWriteInfoFromIsec(const InputSection *isec,
+                                       InfoWriteSection<T> &catWriteInfo);
+  void collectCategoryWriterInfoFromCategory(const InfoInputCategory &catInfo);
+  void parseCatInfoToExtInfo(const InfoInputCategory &catInfo,
+                             ClassExtensionInfo &extInfo);
+
+  void parseProtocolListInfo(const ConcatInputSection *isec, uint32_t secOffset,
+                             PointerListInfo &ptrList);
+
+  void parsePointerListInfo(const ConcatInputSection *isec, uint32_t secOffset,
+                            PointerListInfo &ptrList);
+
+  void emitAndLinkPointerList(Defined *parentSym, uint32_t linkAtOffset,
+                              const ClassExtensionInfo &extInfo,
+                              const PointerListInfo &ptrList);
+
+  void emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset,
+                               const ClassExtensionInfo &extInfo,
+                               const PointerListInfo &ptrList);
+
+  Defined *emitCategory(const ClassExtensionInfo &extInfo);
+  Defined *emitCatListEntrySec(const std::string &forCateogryName,
+                               const std::string &forBaseClassName,
+                               ObjFile *objFile);
+  Defined *emitCategoryBody(const std::string &name, const Defined *nameSym,
+                            const Symbol *baseClassSym,
+                            const std::string &baseClassName, ObjFile *objFile);
+  Defined *emitCategoryName(const std::string &name, ObjFile *objFile);
+  void createSymbolReference(Defined *refFrom, const Symbol *refTo,
+                             uint32_t offset, const Reloc &relocTemplate);
+  Symbol *tryGetSymbolAtIsecOffset(const ConcatInputSection *isec,
+                                   uint32_t offset);
+  Defined *tryGetDefinedAtIsecOffset(const ConcatInputSection *isec,
+                                     uint32_t offset);
+  void tryEraseDefinedAtIsecOffset(const ConcatInputSection *isec,
+                                   uint32_t offset);
+
+  // Allocate a null-terminated StringRef backed by generatedSectionData
+  StringRef newStringData(const char *str);
+  // Allocate section data, backed by generatedSectionData
+  SmallVector<uint8_t> &newSectionData(uint32_t size);
+
+  CategoryLayout catLayout;
+  ClassLayout classLayout;
+  ROClassLayout roClassLayout;
+  ListHeaderLayout listHeaderLayout;
+  MethodLayout methodLayout;
+  ProtocolListHeaderLayout protocolListHeaderLayout;
+
+  InfoCategoryWriter infoCategoryWriter;
+  std::vector<ConcatInputSection *> &allInputSections;
+  // Map of base class Symbol to list of InfoInputCategory's for it
+  DenseMap<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
+
+  // Normally, the binary data comes from the input files, but since we're
+  // generating binary data ourselves, we use the below array to store it in.
+  // Need this to be 'static' so the data survives past the ObjcCategoryMerger
+  // object, as the data will be read by the Writer when the final binary is
+  // generated.
+  static SmallVector<std::unique_ptr<SmallVector<uint8_t>>>
+      generatedSectionData;
+};
+
+SmallVector<std::unique_ptr<SmallVector<uint8_t>>>
+    ObjcCategoryMerger::generatedSectionData;
+
+ObjcCategoryMerger::ObjcCategoryMerger(
+    std::vector<ConcatInputSection *> &_allInputSections)
+    : catLayout(target->wordSize), classLayout(target->wordSize),
+      roClassLayout(target->wordSize), listHeaderLayout(target->wordSize),
+      methodLayout(target->wordSize),
+      protocolListHeaderLayout(target->wordSize),
+      allInputSections(_allInputSections) {}
+
+// This is a template so that it can be used both for CStringSection and
+// ConcatOutputSection
+template <typename T>
+void ObjcCategoryMerger::collectSectionWriteInfoFromIsec(
+    const InputSection *isec, InfoWriteSection<T> &catWriteInfo) {
+
+  catWriteInfo.inputSection = const_cast<Section *>(&isec->section);
+  catWriteInfo.align = isec->align;
+  catWriteInfo.outputSection = dyn_cast_or_null<T>(isec->parent);
+
+  assert(catWriteInfo.outputSection &&
+         "outputSection may not be null in collectSectionWriteInfoFromIsec.");
+
+  if (isec->relocs.size())
+    catWriteInfo.relocTemplate = isec->relocs[0];
+
+  catWriteInfo.valid = true;
+}
+
+Symbol *
+ObjcCategoryMerger::tryGetSymbolAtIsecOffset(const ConcatInputSection *isec,
+                                             uint32_t offset) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return nullptr;
+
+  return reloc->referent.get<Symbol *>();
+}
+
+Defined *
+ObjcCategoryMerger::tryGetDefinedAtIsecOffset(const ConcatInputSection *isec,
+                                              uint32_t offset) {
+  Symbol *sym = tryGetSymbolAtIsecOffset(isec, offset);
+  return dyn_cast_or_null<Defined>(sym);
+}
+
+// Given an ConcatInputSection or CStringInputSection and an offset, if there is
+// a symbol(Defined) at that offset, then erase the symbol (mark it not live)
+void ObjcCategoryMerger::tryEraseDefinedAtIsecOffset(
+    const ConcatInputSection *isec, uint32_t offset) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return;
+
+  Defined *sym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  if (!sym)
+    return;
+
+  if (auto *cisec = dyn_cast_or_null<ConcatInputSection>(sym->isec))
+    eraseISec(cisec);
+  else if (auto *csisec = dyn_cast_or_null<CStringInputSection>(sym->isec)) {
+    uint32_t totalOffset = sym->value + reloc->addend;
+    StringPiece &piece = csisec->getStringPiece(totalOffset);
+    piece.live = false;
+  } else {
+    llvm_unreachable("erased symbol has to be Defined or CStringInputSection");
+  }
+}
+
+void ObjcCategoryMerger::collectCategoryWriterInfoFromCategory(
+    const InfoInputCategory &catInfo) {
+
+  if (!infoCategoryWriter.catListInfo.valid)
+    collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+        catInfo.catListIsec, infoCategoryWriter.catListInfo);
+  if (!infoCategoryWriter.catBodyInfo.valid)
+    collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+        catInfo.catBodyIsec, infoCategoryWriter.catBodyInfo);
+
+  if (!infoCategoryWriter.catNameInfo.valid) {
+    lld::macho::Defined *catNameSym =
+        tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset);
+    assert(catNameSym && "Category does not have a valid name Symbol");
+
+    collectSectionWriteInfoFromIsec<CStringSection>(
+        catNameSym->isec, infoCategoryWriter.catNameInfo);
+  }
+
+  // Collect writer info from all the category lists (we're assuming they all
+  // would provide the same info)
+  if (!infoCategoryWriter.catPtrListInfo.valid) {
+    for (uint32_t off = catLayout.instanceMethodsOffset;
+         off <= catLayout.classPropsOffset; off += target->wordSize) {
+      if (Defined *ptrList =
+              tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, off)) {
+        collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+            ptrList->isec, infoCategoryWriter.catPtrListInfo);
+        // we've successfully collected data, so we can break
+        break;
+      }
+    }
+  }
+}
+
+// Parse a protocol list that might be linked to ConcatInputSection at a given
+// offset. The format of the protocol list is different than other lists (prop
+// lists, method lists) so we need to parse it differently
+void ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec,
+                                               uint32_t secOffset,
+                                               PointerListInfo &ptrList) {
+  assert((isec && (secOffset + target->wordSize <= isec->data.size())) &&
+         "Tried to read pointer list beyond protocol section end");
+
+  const Reloc *reloc = isec->getRelocAt(secOffset);
+  if (!reloc)
+    return;
+
+  auto *ptrListSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  assert(ptrListSym && "Protocol list reloc does not have a valid Defined");
+
+  // Theoretically protocol count can be either 32b or 64b, depending on
+  // platform pointer size, but to simplify implementation we always just read
+  // the lower 32b which should be good enough.
+  uint32_t protocolCount = *reinterpret_cast<const uint32_t *>(
+      ptrListSym->isec->data.data() + listHeaderLayout.structSizeOffset);
+
+  ptrList.structCount += protocolCount;
+  ptrList.structSize = target->wordSize;
+
+  uint32_t expectedListSize =
+      (protocolCount * target->wordSize) +
+      /*header(count)*/ protocolListHeaderLayout.totalSize +
+      /*extra null value*/ target->wordSize;
+  assert(expectedListSize == ptrListSym->isec->data.size() &&
+         "Protocol list does not match expected size");
+
+  // Suppress unsuded var warning
+  (void)expectedListSize;
+
+  uint32_t off = protocolListHeaderLayout.totalSize;
+  for (uint32_t inx = 0; inx < protocolCount; ++inx) {
+    const Reloc *reloc = ptrListSym->isec->getRelocAt(off);
+    assert(reloc && "No reloc found at protocol list offset");
+
+    auto *listSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+    assert(listSym && "Protocol list reloc does not have a valid Defined");
+
+    ptrList.allPtrs.push_back(listSym);
+    off += target->wordSize;
+  }
+  assert((ptrListSym->isec->getRelocAt(off) == nullptr) &&
+         "expected null terminating protocol");
+  assert(off + /*extra null value*/ target->wordSize == expectedListSize &&
+         "Protocol list end offset does not match expected size");
+}
+
+// Parse a pointer list that might be linked to ConcatInputSection at a given
+// offset. This can be used for instance methods, class methods, instance props
+// and class props since they have the same format.
+void ObjcCategoryMerger::parsePointerListInfo(const ConcatInputSection *isec,
+                                              uint32_t secOffset,
+                                              PointerListInfo &ptrList) {
+  assert(ptrList.pointersPerStruct == 2 || ptrList.pointersPerStruct == 3);
+  assert(isec && "Trying to parse pointer list from null isec");
+  assert(secOffset + target->wordSize <= isec->data.size() &&
+         "Trying to read pointer list beyond section end");
+
+  const Reloc *reloc = isec->getRelocAt(secOffset);
+  if (!reloc)
+    return;
+
+  auto *ptrListSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  assert(ptrListSym && "Reloc does not have a valid Defined");
+
+  uint32_t thisStructSize = *reinterpret_cast<const uint32_t *>(
+      ptrListSym->isec->data.data() + listHeaderLayout.structSizeOffset);
+  uint32_t thisStructCount = *reinterpret_cast<const uint32_t *>(
+      ptrListSym->isec->data.data() + listHeaderLayout.structCountOffset);
+  assert(thisStructSize == ptrList.pointersPerStruct * target->wordSize);
+
+  assert(!ptrList.structSize || (thisStructSize == ptrList.structSize));
+
+  ptrList.structCount += thisStructCount;
+  ptrList.structSize = thisStructSize;
+
+  uint32_t expectedListSize =
+      listHeaderLayout.totalSize + (thisStructSize * thisStructCount);
+  assert(expectedListSize == ptrListSym->isec->data.size() &&
+         "Pointer list does not match expected size");
+
+  for (uint32_t off = listHeaderLayout.totalSize; off < expectedListSize;
+       off += target->wordSize) {
+    const Reloc *reloc = ptrListSym->isec->getRelocAt(off);
+    assert(reloc && "No reloc found at pointer list offset");
+
+    auto *listSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+    assert(listSym && "Reloc does not have a valid Defined");
+
+    ptrList.allPtrs.push_back(listSym);
+  }
+}
+
+// Here we parse all the information of an input category (catInfo) and
+// append the parsed info into the structure which will contain all the
+// information about how a class is extended (extInfo)
+void ObjcCategoryMerger::parseCatInfoToExtInfo(const InfoInputCategory &catInfo,
+                                               ClassExtensionInfo &extInfo) {
+  const Reloc *catNameReloc =
+      catInfo.catBodyIsec->getRelocAt(catLayout.nameOffset);
+
+  // Parse name
+  assert(catNameReloc && "Category does not have a reloc at 'nameOffset'");
+
+  // is this the first category we are parsing?
+  if (extInfo.mergedContainerName.empty())
+    extInfo.objFileForMergeData =
+        dyn_cast_or_null<ObjFile>(catInfo.catBodyIsec->getFile());
+  else
+    extInfo.mergedContainerName += "|";
+
+  assert(extInfo.objFileForMergeData &&
+         "Expected to already have valid objextInfo.objFileForMergeData");
+
+  StringRef catName = getReferentString(*catNameReloc);
+  extInfo.mergedContainerName += catName.str();
+
+  // Parse base class
+  if (!extInfo.baseClass) {
+    Symbol *classSym =
+        tryGetSymbolAtIsecOffset(catInfo.catBodyIsec, catLayout.klassOffset);
+    assert(extInfo.baseClassName.empty());
+    extInfo.baseClass = classSym;
+    llvm::StringRef classPrefix(objc::symbol_names::klass);
+    assert(classSym->getName().starts_with(classPrefix) &&
+           "Base class symbol does not start with expected prefix");
+    extInfo.baseClassName = classSym->getName().substr(classPrefix.size());
+  } else {
+    assert((extInfo.baseClass ==
+            tryGetSymbolAtIsecOffset(catInfo.catBodyIsec,
+                                     catLayout.klassOffset)) &&
+           "Trying to parse category info into container with different base "
+           "class");
+  }
+
+  parsePointerListInfo(catInfo.catBodyIsec, catLayout.instanceMethodsOffset,
+                       extInfo.instanceMethods);
+
+  parsePointerListInfo(catInfo.catBodyIsec, catLayout.classMethodsOffset,
+                       extInfo.classMethods);
+
+  parseProtocolListInfo(catInfo.catBodyIsec, catLayout.protocolsOffset,
+                        extInfo.protocols);
+
+  parsePointerListInfo(catInfo.catBodyIsec, catLayout.instancePropsOffset,
+                       extInfo.instanceProps);
+
+  parsePointerListInfo(catInfo.catBodyIsec, catLayout.classPropsOffset,
+                       extInfo.classProps);
+}
+
+// Generate a protocol list (including header) and link it into the parent at
+// the specified offset.
+void ObjcCategoryMerger::emitAndLinkProtocolList(
+    Defined *parentSym, uint32_t linkAtOffset,
+    const ClassExtensionInfo &extInfo, const PointerListInfo &ptrList) {
+  if (ptrList.allPtrs.empty())
+    return;
+
+  assert(ptrList.allPtrs.size() == ptrList.structCount);
+
+  uint32_t bodySize = (ptrList.structCount * target->wordSize) +
+                      /*header(count)*/ protocolListHeaderLayout.totalSize +
+                      /*extra null value*/ target->wordSize;
+  llvm::ArrayRef<uint8_t> bodyData = newSectionData(bodySize);
+
+  // This theoretically can be either 32b or 64b, but writing just the first 32b
+  // is good enough
+  const uint32_t *ptrProtoCount = reinterpret_cast<const uint32_t *>(
+      bodyData.data() + protocolListHeaderLayout.protocolCountOffset);
+
+  *const_cast<uint32_t *>(ptrProtoCount) = ptrList.allPtrs.size();
+
+  ConcatInputSection *listSec = make<ConcatInputSection>(
+      *infoCategoryWriter.catPtrListInfo.inputSection, bodyData,
+      infoCategoryWriter.catPtrListInfo.align);
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+  listSec->live = true;
+  allInputSections.push_back(listSec);
+
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+
+  std::string symName = ptrList.categoryPrefix;
+  symName += extInfo.baseClassName + "_$_(" + extInfo.mergedContainerName + ")";
+
+  Defined *ptrListSym = make<Defined>(
+      newStringData(symName.c_str()), /*file=*/parentSym->getObjectFile(),
+      listSec, /*value=*/0, bodyData.size(), /*isWeakDef=*/false,
+      /*isExternal=*/false, /*isPrivateExtern=*/false, /*includeInSymtab=*/true,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+
+  ptrListSym->used = true;
+  parentSym->getObjectFile()->symbols.push_back(ptrListSym);
+
+  createSymbolReference(parentSym, ptrListSym, linkAtOffset,
+                        infoCategoryWriter.catBodyInfo.relocTemplate);
+
+  uint32_t offset = protocolListHeaderLayout.totalSize;
+  for (Symbol *symbol : ptrList.allPtrs) {
+    createSymbolReference(ptrListSym, symbol, offset,
+                          infoCategoryWriter.catPtrListInfo.relocTemplate);
+    offset += target->wordSize;
+  }
+}
+
+// Generate a pointer list (including header) and link it into the parent at the
+// specified offset. This is used for instance and class methods and
+// proprieties.
+void ObjcCategoryMerger::emitAndLinkPointerList(
+    Defined *parentSym, uint32_t linkAtOffset,
+    const ClassExtensionInfo &extInfo, const PointerListInfo &ptrList) {
+  if (ptrList.allPtrs.empty())
+    return;
+
+  assert(ptrList.allPtrs.size() * target->wordSize ==
+         ptrList.structCount * ptrList.structSize);
+
+  // Generate body
+  uint32_t bodySize =
+      listHeaderLayout.totalSize + (ptrList.structSize * ptrList.structCount);
+  llvm::ArrayRef<uint8_t> bodyData = newSectionData(bodySize);
+
+  const uint32_t *ptrStructSize = reinterpret_cast<const uint32_t *>(
+      bodyData.data() + listHeaderLayout.structSizeOffset);
+  const uint32_t *ptrStructCount = reinterpret_cast<const uint32_t *>(
+      bodyData.data() + listHeaderLayout.structCountOffset);
+
+  *const_cast<uint32_t *>(ptrStructSize) = ptrList.structSize;
+  *const_cast<uint32_t *>(ptrStructCount) = ptrList.structCount;
+
+  ConcatInputSection *listSec = make<ConcatInputSection>(
+      *infoCategoryWriter.catPtrListInfo.inputSection, bodyData,
+      infoCategoryWriter.catPtrListInfo.align);
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+  listSec->live = true;
+  allInputSections.push_back(listSec);
+
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+
+  std::string symName = ptrList.categoryPrefix;
+  symName += extInfo.baseClassName + "_$_" + extInfo.mergedContainerName;
+
+  Defined *ptrListSym = make<Defined>(
+      newStringData(symName.c_str()), /*file=*/parentSym->getObjectFile(),
+      listSec, /*value=*/0, bodyData.size(), /*isWeakDef=*/false,
+      /*isExternal=*/false, /*isPrivateExtern=*/false, /*includeInSymtab=*/true,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+
+  ptrListSym->used = true;
+  parentSym->getObjectFile()->symbols.push_back(ptrListSym);
+
+  createSymbolReference(parentSym, ptrListSym, linkAtOffset,
+                        infoCategoryWriter.catBodyInfo.relocTemplate);
+
+  uint32_t offset = listHeaderLayout.totalSize;
+  for (Symbol *symbol : ptrList.allPtrs) {
+    createSymbolReference(ptrListSym, symbol, offset,
+                          infoCategoryWriter.catPtrListInfo.relocTemplate);
+    offset += target->wordSize;
+  }
+}
+
+// This method creates an __objc_catlist ConcatInputSection with a single slot
+Defined *
+ObjcCategoryMerger::emitCatListEntrySec(const std::string &forCateogryName,
+                                        const std::string &forBaseClassName,
+                                        ObjFile *objFile) {
+  uint32_t sectionSize = target->wordSize;
+  llvm::ArrayRef<uint8_t> bodyData = newSectionData(sectionSize);
+
+  ConcatInputSection *newCatList =
+      make<ConcatInputSection>(*infoCategoryWriter.catListInfo.inputSection,
+                               bodyData, infoCategoryWriter.catListInfo.align);
+  newCatList->parent = infoCategoryWriter.catListInfo.outputSection;
+  newCatList->live = true;
+  allInputSections.push_back(newCatList);
+
+  newCatList->parent = infoCategoryWriter.catListInfo.outputSection;
+
+  std::string catSymName = "<__objc_catlist slot for merged category ";
+  catSymName += forBaseClassName + "(" + forCateogryName + ")>";
+
+  Defined *catListSym = make<Defined>(
+      newStringData(catSymName.c_str()), /*file=*/objFile, newCatList,
+      /*value=*/0, bodyData.size(), /*isWeakDef=*/false, /*isExternal=*/false,
+      /*isPrivateExtern=*/false, /*includeInSymtab=*/false,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+
+  catListSym->used = true;
+  objFile->symbols.push_back(catListSym);
+  return catListSym;
+}
+
+// Here we generate the main category body and link the name and base class into
+// it. We don't link any other info yet like the protocol and class/instance
+// methods/props.
+Defined *ObjcCategoryMerger::emitCategoryBody(const std::string &name,
+                                              const Defined *nameSym,
+                                              const Symbol *baseClassSym,
+                                              const std::string &baseClassName,
+                                              ObjFile *objFile) {
+  llvm::ArrayRef<uint8_t> bodyData = newSectionData(catLayout.totalSize);
+
+  uint32_t *ptrSize = (uint32_t *)(const_cast<uint8_t *>(bodyData.data()) +
+                                   catLayout.sizeOffset);
+  *ptrSize = catLayout.totalSize;
+
+  ConcatInputSection *newBodySec =
+      make<ConcatInputSection>(*infoCategoryWriter.catBodyInfo.inputSection,
+                               bodyData, infoCategoryWriter.catBodyInfo.align);
+  newBodySec->parent = infoCategoryWriter.catBodyInfo.outputSection;
+  newBodySec->live = true;
+  allInputSections.push_back(newBodySec);
+
+  std::string symName =
+      objc::symbol_names::category + baseClassName + "_$_(" + name + ")";
+  Defined *catBodySym = make<Defined>(
+      newStringData(symName.c_str()), /*file=*/objFile, newBodySec,
+      /*value=*/0, bodyData.size(), /*isWeakDef=*/false, /*isExternal=*/false,
+      /*isPrivateExtern=*/false, /*includeInSymtab=*/true,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+
+  catBodySym->used = true;
+  objFile->symbols.push_back(catBodySym);
+
+  createSymbolReference(catBodySym, nameSym, catLayout.nameOffset,
+                        infoCategoryWriter.catBodyInfo.relocTemplate);
+
+  // Create a reloc to the base class (either external or internal)
+  createSymbolReference(catBodySym, baseClassSym, catLayout.klassOffset,
+                        infoCategoryWriter.catBodyInfo.relocTemplate);
+
+  return catBodySym;
+}
+
+// This writes the new category name (for the merged category) into the binary
+// and returns the sybmol for it.
+Defined *ObjcCategoryMerger::emitCategoryName(const std::string &name,
+                                              ObjFile *objFile) {
+  StringRef nameStrData = newStringData(name.c_str());
+  // We use +1 below to include the null terminator
+  llvm::ArrayRef<uint8_t> nameData(
+      reinterpret_cast<const uint8_t *>(nameStrData.data()),
+      nameStrData.size() + 1);
+
+  auto *parentSection = infoCategoryWriter.catNameInfo.inputSection;
+  CStringInputSection *newStringSec = make<CStringInputSection>(
+      *infoCategoryWriter.catNameInfo.inputSection, nameData,
+      infoCategoryWriter.catNameInfo.align, /*dedupLiterals=*/true);
+
+  parentSection->subsections.push_back({0, newStringSec});
+
+  newStringSec->splitIntoPieces();
+  newStringSec->pieces[0].live = true;
+  newStringSec->parent = infoCategoryWriter.catNameInfo.outputSection;
+  in.cStringSection->addInput(newStringSec);
+  assert(newStringSec->pieces.size() == 1);
+
+  Defined *catNameSym = make<Defined>(
+      "<merged category name>", /*file=*/objFile, newStringSec,
+      /*value=*/0, nameData.size(),
+      /*isWeakDef=*/false, /*isExternal=*/false, /*isPrivateExtern=*/false,
+      /*includeInSymtab=*/false, /*isReferencedDynamically=*/false,
+      /*noDeadStrip=*/false, /*isWeakDefCanBeHidden=*/false);
+
+  catNameSym->used = true;
+  objFile->symbols.push_back(catNameSym);
+  return catNameSym;
+}
+
+// This method fully creates a new category from the given ClassExtensionInfo.
+// It creates the category name, body and method/protocol/prop lists and links
+// them all together. Then it creates a new __objc_catlist entry and adds the
+// category to it. Calling this method will fully generate a category which will
+// be available in the final binary.
+Defined *ObjcCategoryMerger::emitCategory(const ClassExtensionInfo &extInfo) {
+  Defined *catNameSym = emitCategoryName(extInfo.mergedContainerName,
+                                         extInfo.objFileForMergeData);
+
+  Defined *catBodySym = emitCategoryBody(
+      extInfo.mergedContainerName, catNameSym, extInfo.baseClass,
+      extInfo.baseClassName, extInfo.objFileForMergeData);
+
+  Defined *catListSym =
+      emitCatListEntrySec(extInfo.mergedContainerName, extInfo.baseClassName,
+                          extInfo.objFileForMergeData);
+
+  // Add the single category body to the category list at the offset 0.
+  createSymbolReference(catListSym, catBodySym, /*offset=*/0,
+                        infoCategoryWriter.catListInfo.relocTemplate);
+
+  emitAndLinkPointerList(catBodySym, catLayout.instanceMethodsOffset, extInfo,
+                         extInfo.instanceMethods);
+
+  emitAndLinkPointerList(catBodySym, catLayout.classMethodsOffset, extInfo,
+                         extInfo.classMethods);
+
+  emitAndLinkProtocolList(catBodySym, catLayout.protocolsOffset, extInfo,
+                          extInfo.protocols);
+
+  emitAndLinkPointerList(catBodySym, catLayout.instancePropsOffset, extInfo,
+                         extInfo.instanceProps);
+
+  emitAndLinkPointerList(catBodySym, catLayout.classPropsOffset, extInfo,
+                         extInfo.classProps);
+
+  return catBodySym;
+}
+
+// This method merges all the categories (sharing a base class) into a single
+// category.
+void ObjcCategoryMerger::mergeCategoriesIntoSingleCategory(
+    std::vector<InfoInputCategory> &categories) {
+  assert(categories.size() > 1 && "Expected at least 2 categories");
+
+  ClassExtensionInfo extInfo(catLayout);
+
+  for (auto &catInfo : categories)
+    parseCatInfoToExtInfo(catInfo, extInfo);
+
+  Defined *newCatDef = emitCategory(extInfo);
+  assert(newCatDef && "Failed to create a new category");
+
+  // Suppress unsuded var warning
+  (void)newCatDef;
+
+  for (auto &catInfo : categories)
+    catInfo.wasMerged = true;
+}
+
+void ObjcCategoryMerger::createSymbolReference(Defined *refFrom,
+                                               const Symbol *refTo,
+                                               uint32_t offset,
+                                               const Reloc &relocTemplate) {
+  Reloc r = relocTemplate;
+  r.offset = offset;
+  r.addend = 0;
+  r.referent = const_cast<Symbol *>(refTo);
+  refFrom->isec->relocs.push_back(r);
+}
+
+void ObjcCategoryMerger::collectAndValidateCategoriesData() {
+  for (InputSection *sec : allInputSections) {
+    if (sec->getName() != section_names::objcCatList)
+      continue;
+    ConcatInputSection *catListCisec = dyn_cast<ConcatInputSection>(sec);
+    assert(catListCisec &&
+           "__objc_catList InputSection is not a ConcatInputSection");
+
+    for (uint32_t off = 0; off < catListCisec->getSize();
+         off += target->wordSize) {
+      Defined *categorySym = tryGetDefinedAtIsecOffset(catListCisec, off);
+      assert(categorySym &&
+             "Failed to get a valid cateogry at __objc_catlit offset");
+
+      // We only support ObjC categories (no swift + @objc)
+      // TODO: Support swift + @objc categories also
+      if (!categorySym->getName().starts_with(objc::symbol_names::category))
+        continue;
+
+      auto *catBodyIsec = dyn_cast<ConcatInputSection>(categorySym->isec);
+      assert(catBodyIsec &&
+             "Category data section is not an ConcatInputSection");
+
+      // Check that the category has a reloc at 'klassOffset' (which is
+      // a pointer to the class symbol)
+
+      Symbol *classSym =
+          tryGetSymbolAtIsecOffset(catBodyIsec, catLayout.klassOffset);
+      assert(classSym && "Category does not have a valid base class");
+
+      InfoInputCategory catInputInfo{catListCisec, catBodyIsec, off};
+      categoryMap[classSym].push_back(catInputInfo);
+
+      collectCategoryWriterInfoFromCategory(catInputInfo);
+    }
+  }
+}
+
+// In the input we have multiple __objc_catlist InputSection, each of which may
+// contain links to multiple categories. Of these categories, we will merge (and
+// erase) only some. There will be some categories that will remain untouched
+// (not erased). For these not erased categories, we generate new __objc_catlist
+// entries since the parent __objc_catlist entry will be erased
+void ObjcCategoryMerger::generateCatListForNonErasedCategories(
+    const std::map<ConcatInputSection *, std::set<uint64_t>>
+        catListToErasedOffsets) {
+
+  // Go through all offsets of all __objc_catlist's that we process and if there
+  // are categories that we didn't process - generate a new __objc_catlist for
+  // each.
+  for (auto &mapEntry : catListToErasedOffsets) {
+    ConcatInputSection *catListIsec = mapEntry.first;
+    for (uint32_t catListIsecOffset = 0;
+         catListIsecOffset < catListIsec->data.size();
+         catListIsecOffset += target->wordSize) {
+      // This slot was erased, we can just skip it
+      if (mapEntry.second.count(catListIsecOffset))
+        continue;
+
+      Defined *nonErasedCatBody =
+          tryGetDefinedAtIsecOffset(catListIsec, catListIsecOffset);
+      assert(nonErasedCatBody && "Failed to relocate non-deleted category");
+
+      // Allocate data for the new __objc_catlist slot
+      auto bodyData = newSectionData(target->wordSize);
+
+      // We mark the __objc_catlist slot as belonging to the same file as the
+      // category
+      ObjFile *objFile = dyn_cast<ObjFile>(nonErasedCatBody->getFile());
+
+      ConcatInputSection *listSec = make<ConcatInputSection>(
+          *infoCategoryWriter.catListInfo.inputSection, bodyData,
+          infoCategoryWriter.catListInfo.align);
+      listSec->parent = infoCategoryWriter.catListInfo.outputSection;
+      listSec->live = true;
+      allInputSections.push_back(listSec);
+
+      std::string slotSymName = "<__objc_catlist slot for category ";
+      slotSymName += nonErasedCatBody->getName();
+      slotSymName += ">";
+
+      Defined *catListSlotSym = make<Defined>(
+          newStringData(slotSymName.c_str()), /*file=*/objFile, listSec,
+          /*value=*/0, bodyData.size(),
+          /*isWeakDef=*/false, /*isExternal=*/false, /*isPrivateExtern=*/false,
+          /*includeInSymtab=*/false, /*isReferencedDynamically=*/false,
+          /*noDeadStrip=*/false, /*isWeakDefCanBeHidden=*/false);
+
+      catListSlotSym->used = true;
+      objFile->symbols.push_back(catListSlotSym);
+
+      // Now link the category body into the newly created slot
+      createSymbolReference(catListSlotSym, nonErasedCatBody, 0,
+                            infoCategoryWriter.catListInfo.relocTemplate);
+    }
+  }
+}
+
+void ObjcCategoryMerger::eraseISec(ConcatInputSection *isec) {
+  isec->live = false;
+  for (auto &sym : isec->symbols)
+    sym->used = false;
+}
+
+// This fully erases the merged categories, including their body, their names,
+// their method/protocol/prop lists and the __objc_catlist entries that link to
+// them.
+void ObjcCategoryMerger::eraseMergedCategories() {
+  // Map of InputSection to a set of offsets of the categories that were merged
+  std::map<ConcatInputSection *, std::set<uint64_t>> catListToErasedOffsets;
+
+  for (auto &mapEntry : categoryMap) {
+    for (InfoInputCategory &catInfo : mapEntry.second) {
+      if (catInfo.wasMerged) {
+        eraseISec(catInfo.catListIsec);
+        catListToErasedOffsets[catInfo.catListIsec].insert(
+            catInfo.offCatListIsec);
+      }
+    }
+  }
+
+  // If there were categories that we did not erase, we need to generate a new
+  // __objc_catList that contains only the un-merged categories, and get rid of
+  // the references to the ones we merged.
+  generateCatListForNonErasedCategories(catListToErasedOffsets);
+
+  // Erase the old method lists & names of the categories that were merged
+  for (auto &mapEntry : categoryMap) {
+    for (InfoInputCategory &catInfo : mapEntry.second) {
+      if (!catInfo.wasMerged)
+        continue;
+
+      eraseISec(catInfo.catBodyIsec);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.instanceMethodsOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.classMethodsOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.protocolsOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.classPropsOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.instancePropsOffset);
+    }
+  }
+}
+
+void ObjcCategoryMerger::doMerge() {
+  collectAndValidateCategoriesData();
+
+  for (auto &entry : categoryMap)
+    if (entry.second.size() > 1)
+      // Merge all categories into a new, single category
+      mergeCategoriesIntoSingleCategory(entry.second);
+
+  // Erase all categories that were merged
+  eraseMergedCategories();
+}
+
+void ObjcCategoryMerger::doCleanup() { generatedSectionData.clear(); }
+
+StringRef ObjcCategoryMerger::newStringData(const char *str) {
+  uint32_t len = strlen(str);
+  auto &data = newSectionData(len + 1);
+  char *strData = reinterpret_cast<char *>(data.data());
+  strncpy(strData, str, len);
+  return StringRef(strData, len);
+}
+
+SmallVector<uint8_t> &ObjcCategoryMerger::newSectionData(uint32_t size) {
+  generatedSectionData.push_back(
+      std::make_unique<SmallVector<uint8_t>>(size, 0));
+  return *generatedSectionData.back();
+}
+
+} // namespace
+
+void objc::mergeCategories() {
+  TimeTraceScope timeScope("ObjcCategoryMerger");
+
+  ObjcCategoryMerger merger(inputSections);
+  merger.doMerge();
+}
+
+void objc::doCleanup() { ObjcCategoryMerger::doCleanup(); }
diff --git a/lld/MachO/ObjC.h b/lld/MachO/ObjC.h
index 4c65f9a1f7881..9fbe984e6223e 100644
--- a/lld/MachO/ObjC.h
+++ b/lld/MachO/ObjC.h
@@ -17,14 +17,26 @@ namespace objc {
 
 namespace symbol_names {
 constexpr const char klass[] = "_OBJC_CLASS_$_";
+constexpr const char klassPropList[] = "__OBJC_$_CLASS_PROP_LIST_";
+
 constexpr const char metaclass[] = "_OBJC_METACLASS_$_";
 constexpr const char ehtype[] = "_OBJC_EHTYPE_$_";
 constexpr const char ivar[] = "_OBJC_IVAR_$_";
+constexpr const char listProprieties[] = "__OBJC_$_PROP_LIST_";
+
+constexpr const char category[] = "__OBJC_$_CATEGORY_";
+constexpr const char categoryInstanceMethods[] =
+    "__OBJC_$_CATEGORY_INSTANCE_METHODS_";
+constexpr const char categoryClassMethods[] =
+    "__OBJC_$_CATEGORY_CLASS_METHODS_";
+constexpr const char categoryProtocols[] = "__OBJC_CATEGORY_PROTOCOLS_$_";
 } // namespace symbol_names
 
 // Check for duplicate method names within related categories / classes.
 void checkCategories();
+void mergeCategories();
 
+void doCleanup();
 } // namespace objc
 
 bool hasObjCSection(llvm::MemoryBufferRef);
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index a524e4a4c5084..0d8ee2a0926be 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -129,6 +129,12 @@ def strict_auto_link : Flag<["--"], "strict-auto-link">,
 def check_category_conflicts : Flag<["--"], "check-category-conflicts">,
     HelpText<"Check for conflicts between category & class methods">,
     Group<grp_lld>;
+def objc_category_merging : Flag<["-"], "objc_category_merging">,
+    HelpText<"Merge Objective-C categories that share the same base class">,
+    Group<grp_lld>;
+def no_objc_category_merging : Flag<["-"], "no_objc_category_merging">,
+    HelpText<"Do not merge Objective-C categories">,
+    Group<grp_lld>;
 def lto_debug_pass_manager: Flag<["--"], "lto-debug-pass-manager">,
     HelpText<"Debug new pass manager">, Group<grp_lld>;
 def cs_profile_generate: Flag<["--"], "cs-profile-generate">,
@@ -966,10 +972,6 @@ def interposable_list : Separate<["-"], "interposable_list">,
 def no_function_starts : Flag<["-"], "no_function_starts">,
     HelpText<"Do not create table of function start addresses">,
     Group<grp_rare>;
-def no_objc_category_merging : Flag<["-"], "no_objc_category_merging">,
-    HelpText<"Do not merge Objective-C categories into their classes">,
-    Flags<[HelpHidden]>,
-    Group<grp_rare>;
 def object_path_lto : Separate<["-"], "object_path_lto">,
     MetaVarName<"<path>">,
     HelpText<"Retain any temporary mach-o file in <path> that would otherwise be deleted during LTO">,
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index e759776c8d55a..65e50e349c8cc 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -378,6 +378,8 @@ Do not put read-only non-executable sections in their own segment.
 Do not report version scripts that refer to undefined symbols.
 .It Fl -no-undefined
 Report unresolved symbols even if the linker is creating a shared library.
+.It Fl -no-warn-mismatch
+Do not reject unknown section types.
 .It Fl -no-warn-symbol-ordering
 Do not warn about problems with the symbol ordering file or call graph profile.
 .It Fl -no-warnings , Fl w
diff --git a/lld/test/ELF/aarch64-tlsdesc-zrel.s b/lld/test/ELF/aarch64-tlsdesc-zrel.s
index 1b35e0d29a26f..83e6894ddb3b5 100644
--- a/lld/test/ELF/aarch64-tlsdesc-zrel.s
+++ b/lld/test/ELF/aarch64-tlsdesc-zrel.s
@@ -13,6 +13,7 @@
 // RELA-NEXT:     0x[[#ADDR+16]]  R_AARCH64_TLSDESC - 0x4
 // RELA-NEXT:   }
 // RELA-NEXT: ]
+// RELA-EMPTY:
 // RELA-NEXT:              Hex dump of section '.got':
 // RELA-NEXT:              0x000[[#ADDR]]    00000000 00000000 00000000 00000000
 // RELA-NO-ADDENDS-NEXT:   0x000[[#ADDR+16]] 00000000 00000000 00000000 00000000
@@ -28,6 +29,7 @@
 // REL-NEXT:     0x[[#ADDR+16]]  R_AARCH64_TLSDESC -{{$}}
 // REL-NEXT:   }
 // REL-NEXT: ]
+// REL-EMPTY:
 // REL-NEXT: Hex dump of section '.got':
 // REL-NEXT: 0x000[[#ADDR]]    00000000 00000000 00000000 00000000
 // REL-NEXT: 0x000[[#ADDR+16]] 00000000 00000000 04000000 00000000
diff --git a/lld/test/ELF/incompatible-section-types2.s b/lld/test/ELF/incompatible-section-types2.s
index 3e281ce6c5da3..919515fe37e39 100644
--- a/lld/test/ELF/incompatible-section-types2.s
+++ b/lld/test/ELF/incompatible-section-types2.s
@@ -6,5 +6,5 @@
 // CHECK-NEXT: >>> <internal>:(.shstrtab): SHT_STRTAB
 // CHECK-NEXT: >>> output section .shstrtab: Unknown
 
-.section .shstrtab,"",@12345
+.section .shstrtab,"",@0x60000000
 .short 20
diff --git a/lld/test/ELF/linkerscript/custom-section-type.s b/lld/test/ELF/linkerscript/custom-section-type.s
index 68454f4df1c86..8ca0a4db325bd 100644
--- a/lld/test/ELF/linkerscript/custom-section-type.s
+++ b/lld/test/ELF/linkerscript/custom-section-type.s
@@ -76,7 +76,7 @@ SECTIONS {
 .section progbits,"a",@note
 .byte 0
 
-.section expr,"a",@12345
+.section expr,"a",@0x60000000
 .byte 0
 
 #--- unknown1.lds
diff --git a/lld/test/ELF/linkerscript/discard-section.s b/lld/test/ELF/linkerscript/discard-section.s
index 24f3b2b73e991..0bbebac59bb34 100644
--- a/lld/test/ELF/linkerscript/discard-section.s
+++ b/lld/test/ELF/linkerscript/discard-section.s
@@ -9,6 +9,9 @@
 # RUN: ld.lld -r -T a.lds a.o b.o -o a.ro 2>&1 | FileCheck %s --check-prefix=WARNING --implicit-check-not=warning:
 # RUN: llvm-readelf -r -s a.ro | FileCheck %s --check-prefix=RELOC
 
+# RUN: ld.lld -r --gc-sections -T a.lds a.o b.o -o a.gc.ro --no-fatal-warnings
+# RUN: llvm-readelf -r -s a.gc.ro | FileCheck %s --check-prefix=RELOC-GC
+
 # LOCAL:      error: relocation refers to a discarded section: .aaa
 # LOCAL-NEXT: >>> defined in a.o
 # LOCAL-NEXT: >>> referenced by a.o:(.bbb+0x0)
@@ -32,16 +35,18 @@
 # WARNING:      warning: relocation refers to a discarded section: .aaa
 # WARNING-NEXT: >>> referenced by a.o:(.rela.bbb+0x0)
 
+## GNU ld reports "defined in discarded secion" errors even in -r mode.
+## We set the symbol index to 0.
 # RELOC:      Relocation section '.rela.bbb' at offset {{.*}} contains 1 entries:
 # RELOC-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
 # RELOC-NEXT: 0000000000000000  0000000000000000 R_X86_64_NONE                             0
 # RELOC-EMPTY:
 # RELOC-NEXT: Relocation section '.rela.data' at offset {{.*}} contains 4 entries:
 # RELOC-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
-# RELOC-NEXT: 0000000000000000  0000000500000001 R_X86_64_64            0000000000000000 global + 0
-# RELOC-NEXT: 0000000000000008  0000000700000001 R_X86_64_64            0000000000000000 weak + 0
-# RELOC-NEXT: 0000000000000010  0000000600000001 R_X86_64_64            0000000000000000 weakref1 + 0
-# RELOC-NEXT: 0000000000000018  0000000800000001 R_X86_64_64            0000000000000000 weakref2 + 0
+# RELOC-NEXT: 0000000000000000  0000000000000001 R_X86_64_64                             0
+# RELOC-NEXT: 0000000000000008  0000000000000001 R_X86_64_64                             0
+# RELOC-NEXT: 0000000000000010  0000000000000001 R_X86_64_64                             0
+# RELOC-NEXT: 0000000000000018  0000000000000001 R_X86_64_64                             0
 
 # RELOC:      Num:    Value          Size Type    Bind   Vis      Ndx Name
 # RELOC-NEXT:   0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT  UND
@@ -49,23 +54,25 @@
 # RELOC-NEXT:   2: 0000000000000000     0 SECTION LOCAL  DEFAULT    2 .bbb
 # RELOC-NEXT:   3: 0000000000000000     0 SECTION LOCAL  DEFAULT    4 .data
 # RELOC-NEXT:   4: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT    1 _start
-# RELOC-NEXT:   5: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND global
-# RELOC-NEXT:   6: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND weakref1
-# RELOC-NEXT:   7: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND weak
-# RELOC-NEXT:   8: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND weakref2
 # RELOC-EMPTY:
 
+# RELOC-GC:   There are no relocations in this file.
+
 #--- a.s
 .globl _start
 _start:
 
 .section .aaa,"a"
-.globl global, weakref1
+.globl global, weakref1, unused
 .weak weak, weakref2
 global:
 weak:
 weakref1:
 weakref2:
+## Eliminate `unused` just like GC discarded definitions.
+## Linux kernel's CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y configuration expects
+## that the unreferenced `unused` is not emitted to .symtab.
+unused:
   .quad 0
 
 .section .bbb,"aw"
diff --git a/lld/test/ELF/pack-dyn-relocs-arm2.s b/lld/test/ELF/pack-dyn-relocs-arm2.s
index cf2cd8bb597ec..ed4fa52e7a5ab 100644
--- a/lld/test/ELF/pack-dyn-relocs-arm2.s
+++ b/lld/test/ELF/pack-dyn-relocs-arm2.s
@@ -5,7 +5,7 @@
 
 // RUN: llvm-mc -filetype=obj -triple=armv7a-none-linux-gnueabi %s -o %t.o
 // RUN: ld.lld -pie --pack-dyn-relocs=relr %t.o %t.so -o %t.exe
-// RUN: llvm-readobj -r %t.exe | FileCheck %s
+// RUN: llvm-readobj -r -x .data %t.exe | FileCheck %s
 
 // CHECK:      Section (5) .relr.dyn {
 // CHECK-NEXT:   0x301E8 R_ARM_RELATIVE -
@@ -43,6 +43,9 @@
 // CHECK-NEXT:   0x30268 R_ARM_RELATIVE -
 // CHECK-NEXT:   0x3026C R_ARM_RELATIVE -
 // CHECK-NEXT: }
+// CHECK:      Hex dump of section '.data':
+// CHECK-NEXT: 0x000301e8 00000000 01000000 02000000 ffffffff .
+// CHECK-NEXT: 0x000301f8 00000080 00000000 00000000 00000000 .
 
 // RUN: llvm-readobj -S --dynamic-table %t.exe | FileCheck --check-prefix=HEADER %s
 // HEADER: 0x00000023 RELRSZ 12 (bytes)
@@ -50,10 +53,10 @@
 .data
 .align 2
 .dc.a __ehdr_start
-.dc.a __ehdr_start
-.dc.a __ehdr_start
-.dc.a __ehdr_start
-.dc.a __ehdr_start
+.dc.a __ehdr_start+1
+.dc.a __ehdr_start+2
+.dc.a __ehdr_start-1
+.dc.a __ehdr_start-0x80000000
 .dc.a __ehdr_start
 .dc.a __ehdr_start
 .dc.a __ehdr_start
diff --git a/lld/test/ELF/pack-dyn-relocs.s b/lld/test/ELF/pack-dyn-relocs.s
index da97e388c3d0c..733ddd4ecad39 100644
--- a/lld/test/ELF/pack-dyn-relocs.s
+++ b/lld/test/ELF/pack-dyn-relocs.s
@@ -198,11 +198,11 @@
 // RUN: llvm-readobj -r %t2.a64 | FileCheck --check-prefix=UNPACKED64 %s
 
 // UNPACKED64:          Section ({{.+}}) .rela.dyn {
-// UNPACKED64-NEXT:     0x30690 R_AARCH64_RELATIVE - 0x1
-// UNPACKED64-NEXT:     0x30698 R_AARCH64_RELATIVE - 0x2
-// UNPACKED64-NEXT:     0x306A0 R_AARCH64_RELATIVE - 0x3
-// UNPACKED64-NEXT:     0x306A8 R_AARCH64_RELATIVE - 0x4
-// UNPACKED64-NEXT:     0x306B0 R_AARCH64_RELATIVE - 0x5
+// UNPACKED64-NEXT:     0x30690 R_AARCH64_RELATIVE - 0x0
+// UNPACKED64-NEXT:     0x30698 R_AARCH64_RELATIVE - 0x1
+// UNPACKED64-NEXT:     0x306A0 R_AARCH64_RELATIVE - 0x2
+// UNPACKED64-NEXT:     0x306A8 R_AARCH64_RELATIVE - 0xFFFFFFFFFFFFFFFF
+// UNPACKED64-NEXT:     0x306B0 R_AARCH64_RELATIVE - 0x80000000
 // UNPACKED64-NEXT:     0x306B8 R_AARCH64_RELATIVE - 0x6
 // UNPACKED64-NEXT:     0x306C0 R_AARCH64_RELATIVE - 0x7
 // UNPACKED64-NEXT:     0x306C8 R_AARCH64_RELATIVE - 0x8
@@ -238,91 +238,72 @@
 // UNPACKED64-NEXT:     }
 
 // RUN: ld.lld -pie --pack-dyn-relocs=android %t.a64.o %t.a64.so -o %t3.a64
-// RUN: llvm-readobj -S --dynamic-table %t3.a64 | FileCheck --check-prefix=ANDROID64-HEADERS %s
-// RUN: llvm-readobj -r %t3.a64 | FileCheck --check-prefix=ANDROID64 %s
-
-// ANDROID64-HEADERS:       Index: 1
-// ANDROID64-HEADERS-NEXT:  Name: .dynsym
-
-// ANDROID64-HEADERS:       Name: .rela.dyn
-// ANDROID64-HEADERS-NEXT:  Type: SHT_ANDROID_RELA
-// ANDROID64-HEADERS-NEXT:  Flags [ (0x2)
-// ANDROID64-HEADERS-NEXT:    SHF_ALLOC (0x2)
-// ANDROID64-HEADERS-NEXT:  ]
-// ANDROID64-HEADERS-NEXT:  Address: [[ADDR:.*]]
-// ANDROID64-HEADERS-NEXT:  Offset: [[ADDR]]
-// ANDROID64-HEADERS-NEXT:  Size: [[SIZE:.*]]
-// ANDROID64-HEADERS-NEXT:  Link: 1
-// ANDROID64-HEADERS-NEXT:  Info: 0
-// ANDROID64-HEADERS-NEXT:  AddressAlignment: 8
-// ANDROID64-HEADERS-NEXT:  EntrySize: 1
+// RUN: llvm-readelf -S -d -r %t3.a64 | FileCheck --check-prefix=ANDROID64 %s
+
+// ANDROID64:      Name              Type            Address          Off    Size   ES Flg Lk Inf Al
+// ANDROID64:      .dynstr           STRTAB          {{.*}}                         00   A  0   0  1
+// ANDROID64-NEXT: .rela.dyn         ANDROID_RELA    {{0*}}[[#%x,ANDROID:]] {{.*}}  01   A  1   0  8
+// ANDROID64-NEXT: .text             PROGBITS        {{.*}}                         00  AX  0   0  4
+
+// ANDROID64:      (DEBUG)           0x0
+// ANDROID64-NEXT: (ANDROID_RELA)    0x[[#ANDROID]]
+// ANDROID64-NEXT: (ANDROID_RELASZ)  122 (bytes)
+// ANDROID64-NEXT: (RELAENT)         24 (bytes)
 
 // ANDROID64-HEADERS: 0x0000000060000011 ANDROID_RELA          [[ADDR]]
 // ANDROID64-HEADERS: 0x0000000060000012 ANDROID_RELASZ        [[SIZE]]
 
-// ANDROID64:          Section ({{.+}}) .rela.dyn {
-// ANDROID64-NEXT:     0x303E0 R_AARCH64_RELATIVE - 0x1
-// ANDROID64-NEXT:     0x303E8 R_AARCH64_RELATIVE - 0x2
-// ANDROID64-NEXT:     0x303F0 R_AARCH64_RELATIVE - 0x3
-// ANDROID64-NEXT:     0x303F8 R_AARCH64_RELATIVE - 0x4
-// ANDROID64-NEXT:     0x30400 R_AARCH64_RELATIVE - 0x5
-// ANDROID64-NEXT:     0x30408 R_AARCH64_RELATIVE - 0x6
-// ANDROID64-NEXT:     0x30410 R_AARCH64_RELATIVE - 0x7
-// ANDROID64-NEXT:     0x30418 R_AARCH64_RELATIVE - 0x8
-// ANDROID64-NEXT:     0x30470 R_AARCH64_RELATIVE - 0x1
-// ANDROID64-NEXT:     0x30478 R_AARCH64_RELATIVE - 0x2
-// ANDROID64-NEXT:     0x30480 R_AARCH64_RELATIVE - 0x3
-// ANDROID64-NEXT:     0x30488 R_AARCH64_RELATIVE - 0x4
-// ANDROID64-NEXT:     0x30490 R_AARCH64_RELATIVE - 0x5
-// ANDROID64-NEXT:     0x30498 R_AARCH64_RELATIVE - 0x6
-// ANDROID64-NEXT:     0x304A0 R_AARCH64_RELATIVE - 0x7
-// ANDROID64-NEXT:     0x304A8 R_AARCH64_RELATIVE - 0x8
-// ANDROID64-NEXT:     0x304B0 R_AARCH64_RELATIVE - 0x9
-
-// ANDROID64-NEXT:     0x30428 R_AARCH64_RELATIVE - 0x1
-// ANDROID64-NEXT:     0x30430 R_AARCH64_RELATIVE - 0x2
-// ANDROID64-NEXT:     0x30438 R_AARCH64_RELATIVE - 0x3
-// ANDROID64-NEXT:     0x30440 R_AARCH64_RELATIVE - 0x4
-// ANDROID64-NEXT:     0x30448 R_AARCH64_RELATIVE - 0x5
-// ANDROID64-NEXT:     0x30450 R_AARCH64_RELATIVE - 0x6
-// ANDROID64-NEXT:     0x30458 R_AARCH64_RELATIVE - 0x7
-
-// ANDROID64-NEXT:     0x304B9 R_AARCH64_RELATIVE - 0xA
-
-// ANDROID64-NEXT:     0x30468 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x304C1 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x304C9 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x304E1 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x30420 R_AARCH64_ABS64 bar2 0x1
-// ANDROID64-NEXT:     0x30460 R_AARCH64_ABS64 zed2 0x0
-// ANDROID64-NEXT:     0x304D1 R_AARCH64_ABS64 bar2 0x1
-// ANDROID64-NEXT:     0x304D9 R_AARCH64_ABS64 bar2 0x1
-// ANDROID64-NEXT:     }
+// ANDROID64:       Relocation section '.rela.dyn' at offset {{.*}} contains 33 entries:
+// ANDROID64-NEXT:      Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
+// ANDROID64-NEXT:  00000000000303e8  0000000000000403 R_AARCH64_RELATIVE                0
+// ANDROID64-NEXT:  00000000000303f0  0000000000000403 R_AARCH64_RELATIVE                1
+// ANDROID64-NEXT:  00000000000303f8  0000000000000403 R_AARCH64_RELATIVE                2
+// ANDROID64-NEXT:  0000000000030400  0000000000000403 R_AARCH64_RELATIVE                ffffffffffffffff
+// ANDROID64-NEXT:  0000000000030408  0000000000000403 R_AARCH64_RELATIVE                80000000
+// ANDROID64-NEXT:  0000000000030410  0000000000000403 R_AARCH64_RELATIVE                6
+// ANDROID64-NEXT:  0000000000030418  0000000000000403 R_AARCH64_RELATIVE                7
+// ANDROID64-NEXT:  0000000000030420  0000000000000403 R_AARCH64_RELATIVE                8
+// ANDROID64-NEXT:  0000000000030478  0000000000000403 R_AARCH64_RELATIVE                1
+// ANDROID64-NEXT:  0000000000030480  0000000000000403 R_AARCH64_RELATIVE                2
+// ANDROID64-NEXT:  0000000000030488  0000000000000403 R_AARCH64_RELATIVE                3
+// ANDROID64-NEXT:  0000000000030490  0000000000000403 R_AARCH64_RELATIVE                4
+// ANDROID64-NEXT:  0000000000030498  0000000000000403 R_AARCH64_RELATIVE                5
+// ANDROID64-NEXT:  00000000000304a0  0000000000000403 R_AARCH64_RELATIVE                6
+// ANDROID64-NEXT:  00000000000304a8  0000000000000403 R_AARCH64_RELATIVE                7
+// ANDROID64-NEXT:  00000000000304b0  0000000000000403 R_AARCH64_RELATIVE                8
+// ANDROID64-NEXT:  00000000000304b8  0000000000000403 R_AARCH64_RELATIVE                9
+// ANDROID64-NEXT:  0000000000030430  0000000000000403 R_AARCH64_RELATIVE                1
+// ANDROID64-NEXT:  0000000000030438  0000000000000403 R_AARCH64_RELATIVE                2
+// ANDROID64-NEXT:  0000000000030440  0000000000000403 R_AARCH64_RELATIVE                3
+// ANDROID64-NEXT:  0000000000030448  0000000000000403 R_AARCH64_RELATIVE                4
+// ANDROID64-NEXT:  0000000000030450  0000000000000403 R_AARCH64_RELATIVE                5
+// ANDROID64-NEXT:  0000000000030458  0000000000000403 R_AARCH64_RELATIVE                6
+// ANDROID64-NEXT:  0000000000030460  0000000000000403 R_AARCH64_RELATIVE                7
+// ANDROID64-NEXT:  00000000000304c1  0000000000000403 R_AARCH64_RELATIVE                a
+// ANDROID64-NEXT:  0000000000030470  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// ANDROID64-NEXT:  00000000000304c9  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// ANDROID64-NEXT:  00000000000304d1  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// ANDROID64-NEXT:  00000000000304e9  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// ANDROID64-NEXT:  0000000000030428  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// ANDROID64-NEXT:  0000000000030468  0000000200000101 R_AARCH64_ABS64        0000000000000000 zed2 + 0
+// ANDROID64-NEXT:  00000000000304d9  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// ANDROID64-NEXT:  00000000000304e1  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// ANDROID64-EMPTY:
 
 // RUN: ld.lld -pie --pack-dyn-relocs=relr %t.a64.o %t.a64.so -o %t4.a64
-// RUN: llvm-readobj -S --dynamic-table %t4.a64 | FileCheck --check-prefix=RELR64-HEADERS %s
+// RUN: llvm-readelf -Sdr -x .data %t4.a64 | FileCheck --check-prefix=RELR64 %s
 // RUN: llvm-readobj -r --raw-relr %t4.a64 | FileCheck --check-prefix=RAW-RELR64 %s
-// RUN: llvm-readobj -r %t4.a64 | FileCheck --check-prefix=RELR64 %s
-
-// RELR64-HEADERS:       Index: 1
-// RELR64-HEADERS-NEXT:  Name: .dynsym
-
-// RELR64-HEADERS:       Name: .relr.dyn
-// RELR64-HEADERS-NEXT:  Type: SHT_RELR
-// RELR64-HEADERS-NEXT:  Flags [ (0x2)
-// RELR64-HEADERS-NEXT:    SHF_ALLOC (0x2)
-// RELR64-HEADERS-NEXT:  ]
-// RELR64-HEADERS-NEXT:  Address: [[ADDR:.*]]
-// RELR64-HEADERS-NEXT:  Offset: [[ADDR]]
-// RELR64-HEADERS-NEXT:  Size: 16
-// RELR64-HEADERS-NEXT:  Link: 0
-// RELR64-HEADERS-NEXT:  Info: 0
-// RELR64-HEADERS-NEXT:  AddressAlignment: 8
-// RELR64-HEADERS-NEXT:  EntrySize: 8
-
-// RELR64-HEADERS:       0x0000000000000024 RELR                 [[ADDR]]
-// RELR64-HEADERS:       0x0000000000000023 RELRSZ               16 (bytes)
-// RELR64-HEADERS:       0x0000000000000025 RELRENT              8 (bytes)
+
+// RELR64:      Name              Type            Address          Off    Size   ES Flg Lk Inf Al
+// RELR64:      .dynstr           STRTAB          {{.*}}                         00   A  0   0  1
+// RELR64-NEXT: .rela.dyn         RELA            {{.*}}                         18   A  1   0  8
+// RELR64-NEXT: .relr.dyn         RELR            {{0*}}[[#%x,RELR:]] {{.*}}     08   A  0   0  8
+// RELR64-NEXT: .text             PROGBITS        0000000000010380 000380 000000 00  AX  0   0  4
+
+// RELR64:      (RELACOUNT)   1
+// RELR64:      (RELR)        0x[[#RELR]]
+// RELR64-NEXT: (RELRSZ)      16 (bytes)
+// RELR64-NEXT: (RELRENT)     8 (bytes)
 
 /// SHT_RELR section contains address/bitmap entries
 /// encoding the offsets for relative relocation.
@@ -334,51 +315,57 @@
 /// Decoded SHT_RELR section is same as UNPACKED,
 /// but contains only the relative relocations.
 /// Any relative relocations with odd offset stay in SHT_RELA.
-// RELR64:      Section ({{.+}}) .rela.dyn {
-// RELR64-NEXT:   0x30569 R_AARCH64_RELATIVE - 0xA
-// RELR64-NEXT:   0x304D0 R_AARCH64_ABS64 bar2 0x1
-// RELR64-NEXT:   0x30518 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30571 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30579 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30581 R_AARCH64_ABS64 bar2 0x1
-// RELR64-NEXT:   0x30589 R_AARCH64_ABS64 bar2 0x1
-// RELR64-NEXT:   0x30591 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30510 R_AARCH64_ABS64 zed2 0x0
-// RELR64-NEXT: }
-// RELR64-NEXT: Section ({{.+}}) .relr.dyn {
-// RELR64-NEXT:   0x30490 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30498 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304A0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304A8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304B0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304B8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304C0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304C8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304D8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304E0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304E8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304F0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304F8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30500 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30508 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30520 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30528 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30530 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30538 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30540 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30548 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30550 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30558 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30560 R_AARCH64_RELATIVE -
-// RELR64-NEXT: }
+// RELR64:       Relocation section '.rela.dyn' at offset {{.*}} contains 9 entries:
+// RELR64-NEXT:      Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
+// RELR64-NEXT:  0000000000030569  0000000000000403 R_AARCH64_RELATIVE                a
+// RELR64-NEXT:  00000000000304d0  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// RELR64-NEXT:  0000000000030518  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// RELR64-NEXT:  0000000000030571  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// RELR64-NEXT:  0000000000030579  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// RELR64-NEXT:  0000000000030581  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// RELR64-NEXT:  0000000000030589  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// RELR64-NEXT:  0000000000030591  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// RELR64-NEXT:  0000000000030510  0000000200000101 R_AARCH64_ABS64        0000000000000000 zed2 + 0
+// RELR64-EMPTY: 
+// RELR64-NEXT:  Relocation section '.relr.dyn' at offset {{.*}} contains 24 entries:
+// RELR64-NEXT:      Offset             Info             Type               Symbol's Value  Symbol's Name
+// RELR64-NEXT:  0000000000030490  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030498  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304a0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304a8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304b0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304b8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304c0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304c8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304d8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304e0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304e8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304f0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304f8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030500  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030508  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030520  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030528  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030530  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030538  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030540  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030548  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030550  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030558  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030560  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-EMPTY:
+// RELR64-NEXT: Hex dump of section '.data':
+// RELR64-NEXT: 0x00030490 00000000 00000000 01000000 00000000 .
+// RELR64-NEXT: 0x000304a0 02000000 00000000 ffffffff ffffffff .
+// RELR64-NEXT: 0x000304b0 00000080 00000000 06000000 00000000 .
 
 .data
-.align 2
+.balign 2
+.dc.a __ehdr_start
 .dc.a __ehdr_start + 1
 .dc.a __ehdr_start + 2
-.dc.a __ehdr_start + 3
-.dc.a __ehdr_start + 4
-.dc.a __ehdr_start + 5
+.dc.a __ehdr_start - 1
+.dc.a __ehdr_start + 0x80000000
 .dc.a __ehdr_start + 6
 .dc.a __ehdr_start + 7
 .dc.a __ehdr_start + 8
diff --git a/lld/test/ELF/unknown-section.test b/lld/test/ELF/unknown-section.test
new file mode 100644
index 0000000000000..f6ecca29a22ae
--- /dev/null
+++ b/lld/test/ELF/unknown-section.test
@@ -0,0 +1,48 @@
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: yaml2obj %s -o a.o
+# RUN: not ld.lld a.o -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error:
+
+# CHECK:      error: a.o:(relr): unknown section type 0x13
+# CHECK-NEXT: error: a.o:(regular): unknown section type 0x15
+# CHECK-NEXT: error: a.o:(loos_nonconforming): unknown section type 0x60000000
+# CHECK-NEXT: error: a.o:(hios_nonconforming): unknown section type 0x6fffffff
+# CHECK-NEXT: error: a.o:(louser_alloc): unknown section type 0x80000000
+# CHECK-NEXT: error: a.o:(hiuser_alloc): unknown section type 0xffffffff
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:  relr
+    Type:  19
+  - Name:  regular
+    Type:  21
+  - Name:  loos
+    Type:  0x60000000
+  - Name:  hios
+    Type:  0x6fffffff
+  - Name:  loos_nonconforming
+    Type:  0x60000000
+    Flags: [ SHF_OS_NONCONFORMING ]
+  - Name:  hios_nonconforming
+    Type:  0x6fffffff
+    Flags: [ SHF_OS_NONCONFORMING ]
+
+  - Name:  loproc
+    Type:  0x70000000
+  - Name:  hiproc
+    Type:  0x7fffffff
+
+  - Name:  louser
+    Type:  0x80000000
+  - Name:  hiuser
+    Type:  0xffffffff
+  - Name:  louser_alloc
+    Type:  0x80000000
+    Flags: [ SHF_ALLOC ]
+  - Name:  hiuser_alloc
+    Type:  0xffffffff
+    Flags: [ SHF_ALLOC ]
diff --git a/lld/test/MachO/objc-category-merging-complete-test.s b/lld/test/MachO/objc-category-merging-complete-test.s
new file mode 100644
index 0000000000000..3bc3ca26b6ae6
--- /dev/null
+++ b/lld/test/MachO/objc-category-merging-complete-test.s
@@ -0,0 +1,762 @@
+# REQUIRES: aarch64
+# RUN: rm -rf %t; split-file %s %t && cd %t
+
+## Create a dylib to link against(a64_file1.dylib) and merge categories in the main binary (file2_merge_a64.exe)
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_file1.o a64_file1.s
+# RUN: %lld -arch arm64 a64_file1.o -o a64_file1.dylib -dylib
+
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_file2.o a64_file2.s
+# RUN: %lld -arch arm64 -o a64_file2_no_merge.exe a64_file1.dylib a64_file2.o
+# RUN: %lld -arch arm64 -o a64_file2_merge.exe -objc_category_merging a64_file1.dylib a64_file2.o
+
+# RUN: llvm-objdump --objc-meta-data --macho a64_file2_no_merge.exe | FileCheck %s --check-prefixes=NO_MERGE_CATS
+# RUN: llvm-objdump --objc-meta-data --macho a64_file2_merge.exe | FileCheck %s --check-prefixes=MERGE_CATS
+
+
+MERGE_CATS:     __OBJC_$_CATEGORY_MyBaseClass_$_(Category02|Category03)
+MERGE_CATS-NEXT:              name {{.*}} Category02|Category03
+MERGE_CATS:           instanceMethods
+MERGE_CATS-NEXT:           entsize 24
+MERGE_CATS-NEXT:             count 4
+MERGE_CATS-NEXT:              name {{.*}} class02InstanceMethod
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp -[MyBaseClass(Category02) class02InstanceMethod]
+MERGE_CATS-NEXT:              name {{.*}} myProtocol02Method
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp -[MyBaseClass(Category02) myProtocol02Method]
+MERGE_CATS-NEXT:              name {{.*}} class03InstanceMethod
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp -[MyBaseClass(Category03) class03InstanceMethod]
+MERGE_CATS-NEXT:              name {{.*}} myProtocol03Method
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp -[MyBaseClass(Category03) myProtocol03Method]
+MERGE_CATS-NEXT:      classMethods {{.*}}
+MERGE_CATS-NEXT:           entsize 24
+MERGE_CATS-NEXT:             count 4
+MERGE_CATS-NEXT:              name {{.*}} class02ClassMethod
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp +[MyBaseClass(Category02) class02ClassMethod]
+MERGE_CATS-NEXT:              name {{.*}} MyProtocol02Prop
+MERGE_CATS-NEXT:             types {{.*}} i16@0:8
+MERGE_CATS-NEXT:               imp +[MyBaseClass(Category02) MyProtocol02Prop]
+MERGE_CATS-NEXT:              name {{.*}} class03ClassMethod
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp +[MyBaseClass(Category03) class03ClassMethod]
+MERGE_CATS-NEXT:              name {{.*}} MyProtocol03Prop
+MERGE_CATS-NEXT:             types {{.*}} i16@0:8
+MERGE_CATS-NEXT:               imp +[MyBaseClass(Category03) MyProtocol03Prop]
+MERGE_CATS-NEXT:         protocols
+MERGE_CATS-NEXT:                      count 2
+MERGE_CATS-NEXT:              list[0] {{.*}} (struct protocol_t *)
+MERGE_CATS-NEXT:                  isa 0x0
+MERGE_CATS-NEXT:                 name {{.*}} MyProtocol02
+MERGE_CATS-NEXT:            protocols 0x0
+MERGE_CATS-NEXT:          instanceMethods
+MERGE_CATS-NEXT:               entsize 24
+MERGE_CATS-NEXT:                 count 2
+MERGE_CATS-NEXT:                  name {{.*}} myProtocol02Method
+MERGE_CATS-NEXT:                 types {{.*}} v16@0:8
+MERGE_CATS-NEXT:                   imp 0x0
+MERGE_CATS-NEXT:                  name {{.*}} MyProtocol02Prop
+MERGE_CATS-NEXT:                 types {{.*}} i16@0:8
+MERGE_CATS-NEXT:                   imp 0x0
+MERGE_CATS-NEXT:             classMethods
+MERGE_CATS-NEXT:      optionalInstanceMethods 0x0
+MERGE_CATS-NEXT:         optionalClassMethods 0x0
+MERGE_CATS-NEXT:           instanceProperties {{.*}}
+MERGE_CATS-NEXT:              list[1] {{.*}}
+MERGE_CATS-NEXT:                  isa 0x0
+MERGE_CATS-NEXT:                 name {{.*}} MyProtocol03
+MERGE_CATS-NEXT:            protocols 0x0
+MERGE_CATS-NEXT:          instanceMethods
+MERGE_CATS-NEXT:               entsize 24
+MERGE_CATS-NEXT:                 count 2
+MERGE_CATS-NEXT:                  name {{.*}} myProtocol03Method
+MERGE_CATS-NEXT:                 types {{.*}} v16@0:8
+MERGE_CATS-NEXT:                   imp 0x0
+MERGE_CATS-NEXT:                  name {{.*}} MyProtocol03Prop
+MERGE_CATS-NEXT:                 types {{.*}} i16@0:8
+MERGE_CATS-NEXT:                   imp 0x0
+MERGE_CATS-NEXT:             classMethods 0x0
+MERGE_CATS-NEXT:      optionalInstanceMethods 0x0
+MERGE_CATS-NEXT:         optionalClassMethods 0x0
+MERGE_CATS-NEXT:           instanceProperties {{.*}}
+MERGE_CATS-NEXT:      instanceProperties
+MERGE_CATS-NEXT:                    entsize 16
+MERGE_CATS-NEXT:                      count 2
+MERGE_CATS-NEXT:                 name {{.*}} MyProtocol02Prop
+MERGE_CATS-NEXT:            attributes {{.*}} Ti,R,D
+MERGE_CATS-NEXT:                 name {{.*}} MyProtocol03Prop
+MERGE_CATS-NEXT:            attributes {{.*}} Ti,R,D
+
+
+NO_MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_(Category02|Category03)
+NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
+NO_MERGE_CATS: instanceMethods
+NO_MERGE_CATS-NEXT: 24
+NO_MERGE_CATS-NEXT: 2
+NO_MERGE_CATS: classMethods
+NO_MERGE_CATS-NEXT: 24
+NO_MERGE_CATS-NEXT: 2
+
+
+#--- a64_file1.s
+
+## @protocol MyProtocol01
+## - (void)myProtocol01Method;
+## @property (nonatomic) int MyProtocol01Prop;
+## @end
+##
+## __attribute__((objc_root_class))
+## @interface MyBaseClass<MyProtocol01>
+## - (void)baseInstanceMethod;
+## - (void)myProtocol01Method;
+## + (void)baseClassMethod;
+## @end
+##
+## @implementation MyBaseClass
+## @synthesize MyProtocol01Prop;
+## - (void)baseInstanceMethod {}
+## - (void)myProtocol01Method {}
+## + (void)baseClassMethod {}
+## @end
+##
+## void *_objc_empty_cache;
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.p2align	2                               ; -- Begin function -[MyBaseClass baseInstanceMethod]
+"-[MyBaseClass baseInstanceMethod]":    ; @"\01-[MyBaseClass baseInstanceMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass myProtocol01Method]
+"-[MyBaseClass myProtocol01Method]":    ; @"\01-[MyBaseClass myProtocol01Method]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass baseClassMethod]
+"+[MyBaseClass baseClassMethod]":       ; @"\01+[MyBaseClass baseClassMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass MyProtocol01Prop]
+"-[MyBaseClass MyProtocol01Prop]":      ; @"\01-[MyBaseClass MyProtocol01Prop]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+Lloh0:
+	adrp	x8, _OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop@PAGE
+Lloh1:
+	ldrsw	x8, [x8, _OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop@PAGEOFF]
+	ldr	w0, [x0, x8]
+	ret
+	.loh AdrpLdr	Lloh0, Lloh1
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass setMyProtocol01Prop:]
+"-[MyBaseClass setMyProtocol01Prop:]":  ; @"\01-[MyBaseClass setMyProtocol01Prop:]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+Lloh2:
+	adrp	x8, _OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop@PAGE
+Lloh3:
+	ldrsw	x8, [x8, _OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop@PAGEOFF]
+	str	w2, [x0, x8]
+	ret
+	.loh AdrpLdr	Lloh2, Lloh3
+	.cfi_endproc
+                                        ; -- End function
+	.private_extern	_OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop ; @"OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop"
+	.section	__DATA,__objc_ivar
+	.globl	_OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop
+	.p2align	2, 0x0
+_OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop:
+	.long	0                               ; 0x0
+	.section	__DATA,__objc_data
+	.globl	_OBJC_CLASS_$_MyBaseClass       ; @"OBJC_CLASS_$_MyBaseClass"
+	.p2align	3, 0x0
+_OBJC_CLASS_$_MyBaseClass:
+	.quad	_OBJC_METACLASS_$_MyBaseClass
+	.quad	0
+	.quad	__objc_empty_cache
+	.quad	0
+	.quad	__OBJC_CLASS_RO_$_MyBaseClass
+	.globl	_OBJC_METACLASS_$_MyBaseClass   ; @"OBJC_METACLASS_$_MyBaseClass"
+	.p2align	3, 0x0
+_OBJC_METACLASS_$_MyBaseClass:
+	.quad	_OBJC_METACLASS_$_MyBaseClass
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__objc_empty_cache
+	.quad	0
+	.quad	__OBJC_METACLASS_RO_$_MyBaseClass
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_:                     ; @OBJC_CLASS_NAME_
+	.asciz	"MyBaseClass"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_:                  ; @OBJC_METH_VAR_NAME_
+	.asciz	"baseClassMethod"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_:                  ; @OBJC_METH_VAR_TYPE_
+	.asciz	"v16@0:8"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CLASS_METHODS_MyBaseClass"
+__OBJC_$_CLASS_METHODS_MyBaseClass:
+	.long	24                              ; 0x18
+	.long	1                               ; 0x1
+	.quad	l_OBJC_METH_VAR_NAME_
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"+[MyBaseClass baseClassMethod]"
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.1:                   ; @OBJC_CLASS_NAME_.1
+	.asciz	"MyProtocol01"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.2:                ; @OBJC_METH_VAR_NAME_.2
+	.asciz	"myProtocol01Method"
+l_OBJC_METH_VAR_NAME_.3:                ; @OBJC_METH_VAR_NAME_.3
+	.asciz	"MyProtocol01Prop"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_.4:                ; @OBJC_METH_VAR_TYPE_.4
+	.asciz	"i16@0:8"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.5:                ; @OBJC_METH_VAR_NAME_.5
+	.asciz	"setMyProtocol01Prop:"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_.6:                ; @OBJC_METH_VAR_TYPE_.6
+	.asciz	"v20@0:8i16"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol01"
+__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol01:
+	.long	24                              ; 0x18
+	.long	3                               ; 0x3
+	.quad	l_OBJC_METH_VAR_NAME_.2
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	0
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	0
+	.quad	l_OBJC_METH_VAR_NAME_.5
+	.quad	l_OBJC_METH_VAR_TYPE_.6
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_:                 ; @OBJC_PROP_NAME_ATTR_
+	.asciz	"MyProtocol01Prop"
+l_OBJC_PROP_NAME_ATTR_.7:               ; @OBJC_PROP_NAME_ATTR_.7
+	.asciz	"Ti,N"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyProtocol01"
+__OBJC_$_PROP_LIST_MyProtocol01:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_
+	.quad	l_OBJC_PROP_NAME_ATTR_.7
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol01"
+__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol01:
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	l_OBJC_METH_VAR_TYPE_.6
+	.private_extern	__OBJC_PROTOCOL_$_MyProtocol01 ; @"_OBJC_PROTOCOL_$_MyProtocol01"
+	.section	__DATA,__data
+	.globl	__OBJC_PROTOCOL_$_MyProtocol01
+	.weak_definition	__OBJC_PROTOCOL_$_MyProtocol01
+	.p2align	3, 0x0
+__OBJC_PROTOCOL_$_MyProtocol01:
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_.1
+	.quad	0
+	.quad	__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol01
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	__OBJC_$_PROP_LIST_MyProtocol01
+	.long	96                              ; 0x60
+	.long	0                               ; 0x0
+	.quad	__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol01
+	.quad	0
+	.quad	0
+	.private_extern	__OBJC_LABEL_PROTOCOL_$_MyProtocol01 ; @"_OBJC_LABEL_PROTOCOL_$_MyProtocol01"
+	.section	__DATA,__objc_protolist,coalesced,no_dead_strip
+	.globl	__OBJC_LABEL_PROTOCOL_$_MyProtocol01
+	.weak_definition	__OBJC_LABEL_PROTOCOL_$_MyProtocol01
+	.p2align	3, 0x0
+__OBJC_LABEL_PROTOCOL_$_MyProtocol01:
+	.quad	__OBJC_PROTOCOL_$_MyProtocol01
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_CLASS_PROTOCOLS_$_MyBaseClass"
+__OBJC_CLASS_PROTOCOLS_$_MyBaseClass:
+	.quad	1                               ; 0x1
+	.quad	__OBJC_PROTOCOL_$_MyProtocol01
+	.quad	0
+	.p2align	3, 0x0                          ; @"_OBJC_METACLASS_RO_$_MyBaseClass"
+__OBJC_METACLASS_RO_$_MyBaseClass:
+	.long	3                               ; 0x3
+	.long	40                              ; 0x28
+	.long	40                              ; 0x28
+	.space	4
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	__OBJC_$_CLASS_METHODS_MyBaseClass
+	.quad	__OBJC_CLASS_PROTOCOLS_$_MyBaseClass
+	.quad	0
+	.quad	0
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.8:                ; @OBJC_METH_VAR_NAME_.8
+	.asciz	"baseInstanceMethod"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_INSTANCE_METHODS_MyBaseClass"
+__OBJC_$_INSTANCE_METHODS_MyBaseClass:
+	.long	24                              ; 0x18
+	.long	4                               ; 0x4
+	.quad	l_OBJC_METH_VAR_NAME_.8
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass baseInstanceMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.2
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass myProtocol01Method]"
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	"-[MyBaseClass MyProtocol01Prop]"
+	.quad	l_OBJC_METH_VAR_NAME_.5
+	.quad	l_OBJC_METH_VAR_TYPE_.6
+	.quad	"-[MyBaseClass setMyProtocol01Prop:]"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_.9:                ; @OBJC_METH_VAR_TYPE_.9
+	.asciz	"i"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_INSTANCE_VARIABLES_MyBaseClass"
+__OBJC_$_INSTANCE_VARIABLES_MyBaseClass:
+	.long	32                              ; 0x20
+	.long	1                               ; 0x1
+	.quad	_OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.9
+	.long	2                               ; 0x2
+	.long	4                               ; 0x4
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_.10:              ; @OBJC_PROP_NAME_ATTR_.10
+	.asciz	"Ti,N,VMyProtocol01Prop"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyBaseClass"
+__OBJC_$_PROP_LIST_MyBaseClass:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_
+	.quad	l_OBJC_PROP_NAME_ATTR_.10
+	.p2align	3, 0x0                          ; @"_OBJC_CLASS_RO_$_MyBaseClass"
+__OBJC_CLASS_RO_$_MyBaseClass:
+	.long	2                               ; 0x2
+	.long	0                               ; 0x0
+	.long	4                               ; 0x4
+	.space	4
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	__OBJC_$_INSTANCE_METHODS_MyBaseClass
+	.quad	__OBJC_CLASS_PROTOCOLS_$_MyBaseClass
+	.quad	__OBJC_$_INSTANCE_VARIABLES_MyBaseClass
+	.quad	0
+	.quad	__OBJC_$_PROP_LIST_MyBaseClass
+	.globl	__objc_empty_cache              ; @_objc_empty_cache
+.zerofill __DATA,__common,__objc_empty_cache,8,3
+	.section	__DATA,__objc_classlist,regular,no_dead_strip
+	.p2align	3, 0x0                          ; @"OBJC_LABEL_CLASS_$"
+l_OBJC_LABEL_CLASS_$:
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.no_dead_strip	__OBJC_LABEL_PROTOCOL_$_MyProtocol01
+	.no_dead_strip	__OBJC_PROTOCOL_$_MyProtocol01
+	.section	__DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+	.long	0
+	.long	96
+.subsections_via_symbols
+
+
+#--- a64_file2.s
+
+## @protocol MyProtocol01
+## - (void)myProtocol01Method;
+## @end
+##
+## @protocol MyProtocol02
+## - (void)myProtocol02Method;
+## @property(readonly) int MyProtocol02Prop;
+## @end
+##
+## @protocol MyProtocol03
+## - (void)myProtocol03Method;
+## @property(readonly) int MyProtocol03Prop;
+## @end
+##
+##
+## __attribute__((objc_root_class))
+## @interface MyBaseClass<MyProtocol01>
+## - (void)baseInstanceMethod;
+## - (void)myProtocol01Method;
+## + (void)baseClassMethod;
+## @end
+##
+##
+##
+## @interface MyBaseClass(Category02)<MyProtocol02>
+## - (void)class02InstanceMethod;
+## - (void)myProtocol02Method;
+## + (void)class02ClassMethod;
+## + (int)MyProtocol02Prop;
+## @end
+##
+## @implementation MyBaseClass(Category02)
+## - (void)class02InstanceMethod {}
+## - (void)myProtocol02Method {}
+## + (void)class02ClassMethod {}
+## + (int)MyProtocol02Prop { return 0;}
+## @dynamic MyProtocol02Prop;
+## @end
+##
+## @interface MyBaseClass(Category03)<MyProtocol03>
+## - (void)class03InstanceMethod;
+## - (void)myProtocol03Method;
+## + (void)class03ClassMethod;
+## + (int)MyProtocol03Prop;
+## @end
+##
+## @implementation MyBaseClass(Category03)
+## - (void)class03InstanceMethod {}
+## - (void)myProtocol03Method {}
+## + (void)class03ClassMethod {}
+## + (int)MyProtocol03Prop { return 0;}
+## @dynamic MyProtocol03Prop;
+## @end
+##
+## int main() {
+##     return 0;
+## }
+
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category02) class02InstanceMethod]
+"-[MyBaseClass(Category02) class02InstanceMethod]": ; @"\01-[MyBaseClass(Category02) class02InstanceMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category02) myProtocol02Method]
+"-[MyBaseClass(Category02) myProtocol02Method]": ; @"\01-[MyBaseClass(Category02) myProtocol02Method]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass(Category02) class02ClassMethod]
+"+[MyBaseClass(Category02) class02ClassMethod]": ; @"\01+[MyBaseClass(Category02) class02ClassMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass(Category02) MyProtocol02Prop]
+"+[MyBaseClass(Category02) MyProtocol02Prop]": ; @"\01+[MyBaseClass(Category02) MyProtocol02Prop]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	b	_OUTLINED_FUNCTION_0
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category03) class03InstanceMethod]
+"-[MyBaseClass(Category03) class03InstanceMethod]": ; @"\01-[MyBaseClass(Category03) class03InstanceMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category03) myProtocol03Method]
+"-[MyBaseClass(Category03) myProtocol03Method]": ; @"\01-[MyBaseClass(Category03) myProtocol03Method]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass(Category03) class03ClassMethod]
+"+[MyBaseClass(Category03) class03ClassMethod]": ; @"\01+[MyBaseClass(Category03) class03ClassMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass(Category03) MyProtocol03Prop]
+"+[MyBaseClass(Category03) MyProtocol03Prop]": ; @"\01+[MyBaseClass(Category03) MyProtocol03Prop]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	b	_OUTLINED_FUNCTION_0
+	.cfi_endproc
+                                        ; -- End function
+	.globl	_main                           ; -- Begin function main
+	.p2align	2
+_main:                                  ; @main
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	b	_OUTLINED_FUNCTION_0
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function OUTLINED_FUNCTION_0
+_OUTLINED_FUNCTION_0:                   ; @OUTLINED_FUNCTION_0 Tail Call
+	.cfi_startproc
+; %bb.0:
+	mov	w0, #0
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_:                     ; @OBJC_CLASS_NAME_
+	.asciz	"Category02"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_:                  ; @OBJC_METH_VAR_NAME_
+	.asciz	"class02InstanceMethod"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_:                  ; @OBJC_METH_VAR_TYPE_
+	.asciz	"v16@0:8"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.1:                ; @OBJC_METH_VAR_NAME_.1
+	.asciz	"myProtocol02Method"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category02) class02InstanceMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.1
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category02) myProtocol02Method]"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.2:                ; @OBJC_METH_VAR_NAME_.2
+	.asciz	"class02ClassMethod"
+l_OBJC_METH_VAR_NAME_.3:                ; @OBJC_METH_VAR_NAME_.3
+	.asciz	"MyProtocol02Prop"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_.4:                ; @OBJC_METH_VAR_TYPE_.4
+	.asciz	"i16@0:8"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category02:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.2
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"+[MyBaseClass(Category02) class02ClassMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	"+[MyBaseClass(Category02) MyProtocol02Prop]"
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.5:                   ; @OBJC_CLASS_NAME_.5
+	.asciz	"MyProtocol02"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol02"
+__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol02:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.1
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	0
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_:                 ; @OBJC_PROP_NAME_ATTR_
+	.asciz	"MyProtocol02Prop"
+l_OBJC_PROP_NAME_ATTR_.6:               ; @OBJC_PROP_NAME_ATTR_.6
+	.asciz	"Ti,R"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyProtocol02"
+__OBJC_$_PROP_LIST_MyProtocol02:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_
+	.quad	l_OBJC_PROP_NAME_ATTR_.6
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol02"
+__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol02:
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.private_extern	__OBJC_PROTOCOL_$_MyProtocol02 ; @"_OBJC_PROTOCOL_$_MyProtocol02"
+	.section	__DATA,__data
+	.globl	__OBJC_PROTOCOL_$_MyProtocol02
+	.weak_definition	__OBJC_PROTOCOL_$_MyProtocol02
+	.p2align	3, 0x0
+__OBJC_PROTOCOL_$_MyProtocol02:
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_.5
+	.quad	0
+	.quad	__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol02
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	__OBJC_$_PROP_LIST_MyProtocol02
+	.long	96                              ; 0x60
+	.long	0                               ; 0x0
+	.quad	__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol02
+	.quad	0
+	.quad	0
+	.private_extern	__OBJC_LABEL_PROTOCOL_$_MyProtocol02 ; @"_OBJC_LABEL_PROTOCOL_$_MyProtocol02"
+	.section	__DATA,__objc_protolist,coalesced,no_dead_strip
+	.globl	__OBJC_LABEL_PROTOCOL_$_MyProtocol02
+	.weak_definition	__OBJC_LABEL_PROTOCOL_$_MyProtocol02
+	.p2align	3, 0x0
+__OBJC_LABEL_PROTOCOL_$_MyProtocol02:
+	.quad	__OBJC_PROTOCOL_$_MyProtocol02
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category02"
+__OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category02:
+	.quad	1                               ; 0x1
+	.quad	__OBJC_PROTOCOL_$_MyProtocol02
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_.7:               ; @OBJC_PROP_NAME_ATTR_.7
+	.asciz	"Ti,R,D"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyBaseClass_$_Category02"
+__OBJC_$_PROP_LIST_MyBaseClass_$_Category02:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_
+	.quad	l_OBJC_PROP_NAME_ATTR_.7
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_MyBaseClass_$_Category02:
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02
+	.quad	__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category02
+	.quad	__OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category02
+	.quad	__OBJC_$_PROP_LIST_MyBaseClass_$_Category02
+	.quad	0
+	.long	64                              ; 0x40
+	.space	4
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.8:                   ; @OBJC_CLASS_NAME_.8
+	.asciz	"Category03"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.9:                ; @OBJC_METH_VAR_NAME_.9
+	.asciz	"class03InstanceMethod"
+l_OBJC_METH_VAR_NAME_.10:               ; @OBJC_METH_VAR_NAME_.10
+	.asciz	"myProtocol03Method"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category03"
+__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category03:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.9
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category03) class03InstanceMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.10
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category03) myProtocol03Method]"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.11:               ; @OBJC_METH_VAR_NAME_.11
+	.asciz	"class03ClassMethod"
+l_OBJC_METH_VAR_NAME_.12:               ; @OBJC_METH_VAR_NAME_.12
+	.asciz	"MyProtocol03Prop"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category03"
+__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category03:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.11
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"+[MyBaseClass(Category03) class03ClassMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.12
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	"+[MyBaseClass(Category03) MyProtocol03Prop]"
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.13:                  ; @OBJC_CLASS_NAME_.13
+	.asciz	"MyProtocol03"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol03"
+__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol03:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.10
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	0
+	.quad	l_OBJC_METH_VAR_NAME_.12
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_.14:              ; @OBJC_PROP_NAME_ATTR_.14
+	.asciz	"MyProtocol03Prop"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyProtocol03"
+__OBJC_$_PROP_LIST_MyProtocol03:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_.14
+	.quad	l_OBJC_PROP_NAME_ATTR_.6
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol03"
+__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol03:
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.private_extern	__OBJC_PROTOCOL_$_MyProtocol03 ; @"_OBJC_PROTOCOL_$_MyProtocol03"
+	.section	__DATA,__data
+	.globl	__OBJC_PROTOCOL_$_MyProtocol03
+	.weak_definition	__OBJC_PROTOCOL_$_MyProtocol03
+	.p2align	3, 0x0
+__OBJC_PROTOCOL_$_MyProtocol03:
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_.13
+	.quad	0
+	.quad	__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol03
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	__OBJC_$_PROP_LIST_MyProtocol03
+	.long	96                              ; 0x60
+	.long	0                               ; 0x0
+	.quad	__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol03
+	.quad	0
+	.quad	0
+	.private_extern	__OBJC_LABEL_PROTOCOL_$_MyProtocol03 ; @"_OBJC_LABEL_PROTOCOL_$_MyProtocol03"
+	.section	__DATA,__objc_protolist,coalesced,no_dead_strip
+	.globl	__OBJC_LABEL_PROTOCOL_$_MyProtocol03
+	.weak_definition	__OBJC_LABEL_PROTOCOL_$_MyProtocol03
+	.p2align	3, 0x0
+__OBJC_LABEL_PROTOCOL_$_MyProtocol03:
+	.quad	__OBJC_PROTOCOL_$_MyProtocol03
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category03"
+__OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category03:
+	.quad	1                               ; 0x1
+	.quad	__OBJC_PROTOCOL_$_MyProtocol03
+	.quad	0
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyBaseClass_$_Category03"
+__OBJC_$_PROP_LIST_MyBaseClass_$_Category03:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_.14
+	.quad	l_OBJC_PROP_NAME_ATTR_.7
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_MyBaseClass_$_Category03"
+__OBJC_$_CATEGORY_MyBaseClass_$_Category03:
+	.quad	l_OBJC_CLASS_NAME_.8
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category03
+	.quad	__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category03
+	.quad	__OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category03
+	.quad	__OBJC_$_PROP_LIST_MyBaseClass_$_Category03
+	.quad	0
+	.long	64                              ; 0x40
+	.space	4
+	.section	__DATA,__objc_catlist,regular,no_dead_strip
+	.p2align	3, 0x0                          ; @"OBJC_LABEL_CATEGORY_$"
+l_OBJC_LABEL_CATEGORY_$:
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category02
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category03
+	.no_dead_strip	__OBJC_LABEL_PROTOCOL_$_MyProtocol02
+	.no_dead_strip	__OBJC_LABEL_PROTOCOL_$_MyProtocol03
+	.no_dead_strip	__OBJC_PROTOCOL_$_MyProtocol02
+	.no_dead_strip	__OBJC_PROTOCOL_$_MyProtocol03
+	.section	__DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+	.long	0
+	.long	96
+.subsections_via_symbols
diff --git a/lld/test/MachO/objc-category-merging-extern-class-minimal.s b/lld/test/MachO/objc-category-merging-extern-class-minimal.s
new file mode 100644
index 0000000000000..ede7ef5d9c32d
--- /dev/null
+++ b/lld/test/MachO/objc-category-merging-extern-class-minimal.s
@@ -0,0 +1,155 @@
+# REQUIRES: aarch64
+# RUN: rm -rf %t; split-file %s %t && cd %t
+
+## Create a dylib with a fake base class to link against
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_fakedylib.o a64_fakedylib.s
+# RUN: %lld -arch arm64 a64_fakedylib.o -o a64_fakedylib.dylib -dylib
+
+## Create our main testing dylib - linking against the fake dylib above
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_cat_minimal.o merge_cat_minimal.s
+# RUN: %lld -arch arm64 -dylib -o merge_cat_minimal_no_merge.dylib a64_fakedylib.dylib merge_cat_minimal.o
+# RUN: %lld -arch arm64 -dylib -o merge_cat_minimal_merge.dylib -objc_category_merging a64_fakedylib.dylib merge_cat_minimal.o
+
+## Now verify that the flag caused category merging to happen appropriatelly
+# RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_no_merge.dylib | FileCheck %s --check-prefixes=NO_MERGE_CATS
+# RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_merge.dylib | FileCheck %s --check-prefixes=MERGE_CATS
+
+#### Check merge categories enabled ###
+# Check that the original categories are not there
+MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
+MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
+
+# Check that the merged cateogry is there, in the correct format
+MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_(Category01|Category02)
+MERGE_CATS-NEXT:   name {{.*}} Category01|Category02
+MERGE_CATS:       instanceMethods
+MERGE_CATS-NEXT:  24
+MERGE_CATS-NEXT:  2
+MERGE_CATS-NEXT:   name {{.*}} cat01_InstanceMethod
+MERGE_CATS-NEXT:  types {{.*}} v16@0:8
+MERGE_CATS-NEXT:    imp -[MyBaseClass(Category01) cat01_InstanceMethod]
+MERGE_CATS-NEXT:   name {{.*}} cat02_InstanceMethod
+MERGE_CATS-NEXT:  types {{.*}} v16@0:8
+MERGE_CATS-NEXT:    imp -[MyBaseClass(Category02) cat02_InstanceMethod]
+MERGE_CATS-NEXT:         classMethods 0x0
+MERGE_CATS-NEXT:            protocols 0x0
+MERGE_CATS-NEXT:   instanceProperties 0x0
+
+#### Check merge categories disabled ###
+# Check that the merged category is not there
+NO_MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_(Category01|Category02)
+
+# Check that the original categories are there
+NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
+NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
+
+
+
+#--- a64_fakedylib.s
+
+    .section    __DATA,__objc_data
+    .globl    _OBJC_CLASS_$_MyBaseClass
+_OBJC_CLASS_$_MyBaseClass:
+    .quad    0
+
+#--- merge_cat_minimal.s
+
+;  ================== Generated from ObjC: ==================
+; __attribute__((objc_root_class))
+; @interface MyBaseClass
+; - (void)baseInstanceMethod;
+; @end
+;
+; @interface MyBaseClass(Category01)
+; - (void)cat01_InstanceMethod;
+; @end
+;
+; @implementation MyBaseClass(Category01)
+; - (void)cat01_InstanceMethod {}
+; @end
+;
+; @interface MyBaseClass(Category02)
+; - (void)cat02_InstanceMethod;
+; @end
+;
+; @implementation MyBaseClass(Category02)
+; - (void)cat02_InstanceMethod {}
+; @end
+;  ================== Generated from ObjC: ==================
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category01) cat01_InstanceMethod]
+"-[MyBaseClass(Category01) cat01_InstanceMethod]": ; @"\01-[MyBaseClass(Category01) cat01_InstanceMethod]"
+	.cfi_startproc
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category02) cat02_InstanceMethod]
+"-[MyBaseClass(Category02) cat02_InstanceMethod]": ; @"\01-[MyBaseClass(Category02) cat02_InstanceMethod]"
+	.cfi_startproc
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_:                     ; @OBJC_CLASS_NAME_
+	.asciz	"Category01"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_:                  ; @OBJC_METH_VAR_NAME_
+	.asciz	"cat01_InstanceMethod"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_:                  ; @OBJC_METH_VAR_TYPE_
+	.asciz	"v16@0:8"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category01"
+__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category01:
+	.long	24                              ; 0x18
+	.long	1                               ; 0x1
+	.quad	l_OBJC_METH_VAR_NAME_
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category01) cat01_InstanceMethod]"
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_MyBaseClass_$_Category01"
+__OBJC_$_CATEGORY_MyBaseClass_$_Category01:
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category01
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+	.long	64                              ; 0x40
+	.space	4
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.1:                   ; @OBJC_CLASS_NAME_.1
+	.asciz	"Category02"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.2:                ; @OBJC_METH_VAR_NAME_.2
+	.asciz	"cat02_InstanceMethod"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02:
+	.long	24                              ; 0x18
+	.long	1                               ; 0x1
+	.quad	l_OBJC_METH_VAR_NAME_.2
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category02) cat02_InstanceMethod]"
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_MyBaseClass_$_Category02:
+	.quad	l_OBJC_CLASS_NAME_.1
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+	.long	64                              ; 0x40
+	.space	4
+	.section	__DATA,__objc_catlist,regular,no_dead_strip
+	.p2align	3, 0x0                          ; @"OBJC_LABEL_CATEGORY_$"
+l_OBJC_LABEL_CATEGORY_$:
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category01
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category02
+	.section	__DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+	.long	0
+	.long	96
+.subsections_via_symbols
diff --git a/lldb/cmake/caches/Apple-lldb-Linux.cmake b/lldb/cmake/caches/Apple-lldb-Linux.cmake
index 13d3839f852f6..b2d3cf595fe18 100644
--- a/lldb/cmake/caches/Apple-lldb-Linux.cmake
+++ b/lldb/cmake/caches/Apple-lldb-Linux.cmake
@@ -5,4 +5,5 @@ set(LLVM_DISTRIBUTION_COMPONENTS
   liblldb
   lldb-argdumper
   lldb-server
+  lldb-python-scripts
   CACHE STRING "")
diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h
index f7bdd3093d202..3644ac056da3d 100644
--- a/lldb/include/lldb/API/SBTarget.h
+++ b/lldb/include/lldb/API/SBTarget.h
@@ -42,7 +42,8 @@ class LLDB_API SBTarget {
     eBroadcastBitModulesLoaded = (1 << 1),
     eBroadcastBitModulesUnloaded = (1 << 2),
     eBroadcastBitWatchpointChanged = (1 << 3),
-    eBroadcastBitSymbolsLoaded = (1 << 4)
+    eBroadcastBitSymbolsLoaded = (1 << 4),
+    eBroadcastBitSymbolsChanged = (1 << 5),
   };
 
   // Constructors
diff --git a/lldb/include/lldb/Host/Alarm.h b/lldb/include/lldb/Host/Alarm.h
new file mode 100644
index 0000000000000..23b1ff1af5689
--- /dev/null
+++ b/lldb/include/lldb/Host/Alarm.h
@@ -0,0 +1,115 @@
+//===-- Alarm.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_HOST_ALARM_H
+#define LLDB_HOST_ALARM_H
+
+#include "lldb/Host/HostThread.h"
+#include "lldb/lldb-types.h"
+#include "llvm/Support/Chrono.h"
+
+#include <condition_variable>
+#include <mutex>
+
+namespace lldb_private {
+
+/// \class Alarm abstraction that enables scheduling a callback function after a
+/// specified timeout. Creating an alarm for a callback returns a Handle that
+/// can be used to restart or cancel the alarm.
+class Alarm {
+public:
+  using Handle = uint64_t;
+  using Callback = std::function<void()>;
+  using TimePoint = llvm::sys::TimePoint<>;
+  using Duration = std::chrono::milliseconds;
+
+  Alarm(Duration timeout, bool run_callback_on_exit = false);
+  ~Alarm();
+
+  /// Create an alarm for the given callback. The alarm will expire and the
+  /// callback will be called after the timeout.
+  ///
+  /// \returns
+  ///   Handle which can be used to restart or cancel the alarm.
+  Handle Create(Callback callback);
+
+  /// Restart the alarm for the given Handle. The alarm will expire and the
+  /// callback will be called after the timeout.
+  ///
+  /// \returns
+  ///   True if the alarm was successfully restarted. False if there is no alarm
+  ///   for the given Handle or the alarm already expired.
+  bool Restart(Handle handle);
+
+  /// Cancel the alarm for the given Handle. The alarm and its handle will be
+  /// removed.
+  ///
+  /// \returns
+  ///   True if the alarm was successfully canceled and the Handle removed.
+  ///   False if there is no alarm for the given Handle or the alarm already
+  ///   expired.
+  bool Cancel(Handle handle);
+
+  static constexpr Handle INVALID_HANDLE = 0;
+
+private:
+  /// Helper functions to start, stop and check the status of the alarm thread.
+  /// @{
+  void StartAlarmThread();
+  void StopAlarmThread();
+  bool AlarmThreadRunning();
+  /// @}
+
+  /// Return an unique, monotonically increasing handle.
+  static Handle GetNextUniqueHandle();
+
+  /// Helper to compute the next time the alarm thread needs to wake up.
+  TimePoint GetNextExpiration() const;
+
+  /// Alarm entry.
+  struct Entry {
+    Handle handle;
+    Callback callback;
+    TimePoint expiration;
+
+    Entry(Callback callback, TimePoint expiration);
+    bool operator==(const Entry &rhs) { return handle == rhs.handle; }
+  };
+
+  /// List of alarm entries.
+  std::vector<Entry> m_entries;
+
+  /// Timeout between when an alarm is created and when it fires.
+  Duration m_timeout;
+
+  /// The alarm thread.
+  /// @{
+  HostThread m_alarm_thread;
+  lldb::thread_result_t AlarmThread();
+  /// @}
+
+  /// Synchronize access between the alarm thread and the main thread.
+  std::mutex m_alarm_mutex;
+
+  /// Condition variable used to wake up the alarm thread.
+  std::condition_variable m_alarm_cv;
+
+  /// Flag to signal the alarm thread that something changed and we need to
+  /// recompute the next alarm.
+  bool m_recompute_next_alarm = false;
+
+  /// Flag to signal the alarm thread to exit.
+  bool m_exit = false;
+
+  /// Flag to signal we should run all callbacks on exit.
+  bool m_run_callbacks_on_exit = false;
+};
+
+} // namespace lldb_private
+
+#endif // LLDB_HOST_ALARM_H
diff --git a/lldb/include/lldb/Symbol/LineEntry.h b/lldb/include/lldb/Symbol/LineEntry.h
index c2daba916e3f9..31e1cd0b36f96 100644
--- a/lldb/include/lldb/Symbol/LineEntry.h
+++ b/lldb/include/lldb/Symbol/LineEntry.h
@@ -130,31 +130,40 @@ struct LineEntry {
   ///     Shared pointer to the target this LineEntry belongs to.
   void ApplyFileMappings(lldb::TargetSP target_sp);
 
-  // Member variables.
-  AddressRange range; ///< The section offset address range for this line entry.
-  FileSpec file; ///< The source file, possibly mapped by the target.source-map
-                 ///setting
-  lldb::SupportFileSP
-      original_file_sp; ///< The original source file, from debug info.
-  uint32_t line = LLDB_INVALID_LINE_NUMBER; ///< The source line number, or zero
-                                            ///< if there is no line number
-                                            /// information.
-  uint16_t column =
-      0; ///< The column number of the source line, or zero if there
-         /// is no column information.
-  uint16_t is_start_of_statement : 1, ///< Indicates this entry is the beginning
-                                      ///of a statement.
-      is_start_of_basic_block : 1, ///< Indicates this entry is the beginning of
-                                   ///a basic block.
-      is_prologue_end : 1,   ///< Indicates this entry is one (of possibly many)
-                             ///where execution should be suspended for an entry
-                             ///breakpoint of a function.
-      is_epilogue_begin : 1, ///< Indicates this entry is one (of possibly many)
-                             ///where execution should be suspended for an exit
-                             ///breakpoint of a function.
-      is_terminal_entry : 1; ///< Indicates this entry is that of the first byte
-                             ///after the end of a sequence of target machine
-                             ///instructions.
+  /// The section offset address range for this line entry.
+  AddressRange range;
+
+  /// The source file, possibly mapped by the target.source-map setting.
+  FileSpec file;
+
+  /// The original source file, from debug info.
+  lldb::SupportFileSP original_file_sp;
+
+  /// The source line number, or LLDB_INVALID_LINE_NUMBER if there is no line
+  /// number information.
+  uint32_t line = LLDB_INVALID_LINE_NUMBER;
+
+  /// The column number of the source line, or zero if there is no column
+  /// information.
+  uint16_t column = 0;
+
+  /// Indicates this entry is the beginning of a statement.
+  uint16_t is_start_of_statement : 1;
+
+  /// Indicates this entry is the beginning of a basic block.
+  uint16_t is_start_of_basic_block : 1;
+
+  /// Indicates this entry is one (of possibly many) where execution should be
+  /// suspended for an entry breakpoint of a function.
+  uint16_t is_prologue_end : 1;
+
+  /// Indicates this entry is one (of possibly many) where execution should be
+  /// suspended for an exit breakpoint of a function.
+  uint16_t is_epilogue_begin : 1;
+
+  /// Indicates this entry is that of the first byte after the end of a sequence
+  /// of target machine instructions.
+  uint16_t is_terminal_entry : 1;
 };
 
 /// Less than operator.
diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h
index 0cbd8a32dccd5..67714e6fdf942 100644
--- a/lldb/include/lldb/Target/Language.h
+++ b/lldb/include/lldb/Target/Language.h
@@ -26,6 +26,15 @@
 
 namespace lldb_private {
 
+class LanguageProperties : public Properties {
+public:
+  LanguageProperties();
+
+  static llvm::StringRef GetSettingName();
+
+  bool GetEnableFilterForLineBreakpoints() const;
+};
+
 class Language : public PluginInterface {
 public:
   class TypeScavenger {
@@ -324,6 +333,8 @@ class Language : public PluginInterface {
   static LanguageSet GetLanguagesSupportingTypeSystemsForExpressions();
   static LanguageSet GetLanguagesSupportingREPLs();
 
+  static LanguageProperties &GetGlobalLanguageProperties();
+
   // Given a mangled function name, calculates some alternative manglings since
   // the compiler mangling may not line up with the symbol we are expecting.
   virtual std::vector<ConstString>
@@ -339,6 +350,15 @@ class Language : public PluginInterface {
 
   virtual llvm::StringRef GetInstanceVariableName() { return {}; }
 
+  /// Returns true if this SymbolContext should be ignored when setting
+  /// breakpoints by line (number or regex). Helpful for languages that create
+  /// artificial functions without meaningful user code associated with them
+  /// (e.g. code that gets expanded in late compilation stages, like by
+  /// CoroSplitter).
+  virtual bool IgnoreForLineBreakpoints(const SymbolContext &) const {
+    return false;
+  }
+
 protected:
   // Classes that inherit from Language can see and modify these
 
diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index b691f82b90652..8e13aa6a13882 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -409,10 +409,6 @@ def add_test_categories(cat):
     cat = test_categories.validate(cat, True)
 
     def impl(func):
-        if isinstance(func, type) and issubclass(func, unittest.TestCase):
-            raise Exception(
-                "@add_test_categories can only be used to decorate a test method"
-            )
         try:
             if hasattr(func, "categories"):
                 cat.extend(func.categories)
diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py
index 291d7bad5c089..8c29145ecc527 100644
--- a/lldb/packages/Python/lldbsuite/test/dotest.py
+++ b/lldb/packages/Python/lldbsuite/test/dotest.py
@@ -914,6 +914,18 @@ def checkForkVForkSupport():
         configuration.skip_categories.append("fork")
 
 
+def checkPexpectSupport():
+    from lldbsuite.test import lldbplatformutil
+
+    platform = lldbplatformutil.getPlatform()
+
+    # llvm.org/pr22274: need a pexpect replacement for windows
+    if platform in ["windows"]:
+        if configuration.verbose:
+            print("pexpect tests will be skipped because of unsupported platform")
+        configuration.skip_categories.append("pexpect")
+
+
 def run_suite():
     # On MacOS X, check to make sure that domain for com.apple.DebugSymbols defaults
     # does not exist before proceeding to running the test suite.
@@ -1013,6 +1025,7 @@ def run_suite():
     checkDebugServerSupport()
     checkObjcSupport()
     checkForkVForkSupport()
+    checkPexpectSupport()
 
     skipped_categories_list = ", ".join(configuration.skip_categories)
     print(
diff --git a/lldb/packages/Python/lldbsuite/test/lldbpexpect.py b/lldb/packages/Python/lldbsuite/test/lldbpexpect.py
index 9d216d9030747..998a080565b6b 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbpexpect.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbpexpect.py
@@ -10,7 +10,7 @@
 
 
 @skipIfRemote
-@skipIfWindows  # llvm.org/pr22274: need a pexpect replacement for windows
+@add_test_categories(["pexpect"])
 class PExpectTest(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
     PROMPT = "(lldb) "
diff --git a/lldb/packages/Python/lldbsuite/test/test_categories.py b/lldb/packages/Python/lldbsuite/test/test_categories.py
index 3f8de175e29df..036bda9c957d1 100644
--- a/lldb/packages/Python/lldbsuite/test/test_categories.py
+++ b/lldb/packages/Python/lldbsuite/test/test_categories.py
@@ -33,6 +33,7 @@
     "lldb-server": "Tests related to lldb-server",
     "lldb-dap": "Tests for the Debug Adaptor Protocol with lldb-dap",
     "llgs": "Tests for the gdb-server functionality of lldb-server",
+    "pexpect": "Tests requiring the pexpect library to be available",
     "objc": "Tests related to the Objective-C programming language support",
     "pyapi": "Tests related to the Python API",
     "std-module": "Tests related to importing the std module",
diff --git a/lldb/packages/Python/lldbsuite/test/test_result.py b/lldb/packages/Python/lldbsuite/test/test_result.py
index 20365f53a6754..2d574b343b413 100644
--- a/lldb/packages/Python/lldbsuite/test/test_result.py
+++ b/lldb/packages/Python/lldbsuite/test/test_result.py
@@ -148,9 +148,11 @@ def getCategoriesForTest(self, test):
         Gets all the categories for the currently running test method in test case
         """
         test_categories = []
+        test_categories.extend(getattr(test, "categories", []))
+
         test_method = getattr(test, test._testMethodName)
-        if test_method is not None and hasattr(test_method, "categories"):
-            test_categories.extend(test_method.categories)
+        if test_method is not None:
+            test_categories.extend(getattr(test_method, "categories", []))
 
         test_categories.extend(self._getFileBasedCategories(test))
 
diff --git a/lldb/source/Breakpoint/BreakpointResolver.cpp b/lldb/source/Breakpoint/BreakpointResolver.cpp
index bc6348716ef41..1861a0fe7c4fe 100644
--- a/lldb/source/Breakpoint/BreakpointResolver.cpp
+++ b/lldb/source/Breakpoint/BreakpointResolver.cpp
@@ -23,6 +23,7 @@
 #include "lldb/Symbol/CompileUnit.h"
 #include "lldb/Symbol/Function.h"
 #include "lldb/Symbol/SymbolContext.h"
+#include "lldb/Target/Language.h"
 #include "lldb/Target/Target.h"
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
@@ -203,8 +204,15 @@ void BreakpointResolver::SetSCMatchesByLine(
     SearchFilter &filter, SymbolContextList &sc_list, bool skip_prologue,
     llvm::StringRef log_ident, uint32_t line, std::optional<uint16_t> column) {
   llvm::SmallVector<SymbolContext, 16> all_scs;
-  for (uint32_t i = 0; i < sc_list.GetSize(); ++i)
-    all_scs.push_back(sc_list[i]);
+
+  for (const auto &sc : sc_list) {
+    if (Language::GetGlobalLanguageProperties()
+            .GetEnableFilterForLineBreakpoints())
+      if (Language *lang = Language::FindPlugin(sc.GetLanguage());
+          lang && lang->IgnoreForLineBreakpoints(sc))
+        continue;
+    all_scs.push_back(sc);
+  }
 
   while (all_scs.size()) {
     uint32_t closest_line = UINT32_MAX;
diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
index b183cb423111f..e1255f37d9bc5 100644
--- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp
+++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
@@ -23,7 +23,6 @@
 #include "lldb/lldb-enumerations.h"
 #include "lldb/lldb-forward.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FormatVariadic.h"
 
 #include <regex>
 
@@ -130,6 +129,19 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
     }
   };
 
+  // Dump `valobj` according to whether `po` was requested or not.
+  auto dump_val_object = [&](ValueObject &valobj) {
+    if (is_po) {
+      StreamString temp_result_stream;
+      valobj.Dump(temp_result_stream, dump_options);
+      llvm::StringRef output = temp_result_stream.GetString();
+      maybe_add_hint(output);
+      result.GetOutputStream() << output;
+    } else {
+      valobj.Dump(result.GetOutputStream(), dump_options);
+    }
+  };
+
   // First, try `expr` as the name of a frame variable.
   if (frame) {
     auto valobj_sp = frame->FindVariable(ConstString(expr));
@@ -147,21 +159,23 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
                                         flags, expr);
       }
 
-      if (is_po) {
-        StreamString temp_result_stream;
-        valobj_sp->Dump(temp_result_stream, dump_options);
-        llvm::StringRef output = temp_result_stream.GetString();
-        maybe_add_hint(output);
-        result.GetOutputStream() << output;
-      } else {
-        valobj_sp->Dump(result.GetOutputStream(), dump_options);
-      }
+      dump_val_object(*valobj_sp);
       result.SetStatus(eReturnStatusSuccessFinishResult);
       return;
     }
   }
 
-  // Second, also lastly, try `expr` as a source expression to evaluate.
+  // Second, try `expr` as a persistent variable.
+  if (expr.starts_with("$"))
+    if (auto *state = target.GetPersistentExpressionStateForLanguage(language))
+      if (auto var_sp = state->GetVariable(expr))
+        if (auto valobj_sp = var_sp->GetValueObject()) {
+          dump_val_object(*valobj_sp);
+          result.SetStatus(eReturnStatusSuccessFinishResult);
+          return;
+        }
+
+  // Third, and lastly, try `expr` as a source expression to evaluate.
   {
     auto *exe_scope = m_exe_ctx.GetBestExecutionContextScope();
     ValueObjectSP valobj_sp;
@@ -187,17 +201,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
                                         expr);
       }
 
-      if (valobj_sp->GetError().GetError() != UserExpression::kNoResult) {
-        if (is_po) {
-          StreamString temp_result_stream;
-          valobj_sp->Dump(temp_result_stream, dump_options);
-          llvm::StringRef output = temp_result_stream.GetString();
-          maybe_add_hint(output);
-          result.GetOutputStream() << output;
-        } else {
-          valobj_sp->Dump(result.GetOutputStream(), dump_options);
-        }
-      }
+      if (valobj_sp->GetError().GetError() != UserExpression::kNoResult)
+        dump_val_object(*valobj_sp);
 
       if (suppress_result)
         if (auto result_var_sp =
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index b2346c2402a81..ae6c6d5479a19 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -3377,7 +3377,7 @@ class CommandObjectTargetModulesList : public CommandObjectParsed {
       case 'r': {
         size_t ref_count = 0;
         char in_shared_cache = 'Y';
-        
+
         ModuleSP module_sp(module->shared_from_this());
         if (!ModuleList::ModuleIsInCache(module))
           in_shared_cache = 'N';
@@ -4508,11 +4508,8 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed {
 
     ModuleSpec module_spec;
     module_spec.GetUUID() = frame_module_sp->GetUUID();
-
-    if (FileSystem::Instance().Exists(frame_module_sp->GetPlatformFileSpec())) {
-      module_spec.GetArchitecture() = frame_module_sp->GetArchitecture();
-      module_spec.GetFileSpec() = frame_module_sp->GetPlatformFileSpec();
-    }
+    module_spec.GetArchitecture() = frame_module_sp->GetArchitecture();
+    module_spec.GetFileSpec() = frame_module_sp->GetPlatformFileSpec();
 
     if (!DownloadObjectAndSymbolFile(module_spec, result, flush)) {
       result.AppendError("unable to find debug symbols for the current frame");
@@ -4557,12 +4554,8 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed {
 
       ModuleSpec module_spec;
       module_spec.GetUUID() = frame_module_sp->GetUUID();
-
-      if (FileSystem::Instance().Exists(
-              frame_module_sp->GetPlatformFileSpec())) {
-        module_spec.GetArchitecture() = frame_module_sp->GetArchitecture();
-        module_spec.GetFileSpec() = frame_module_sp->GetPlatformFileSpec();
-      }
+      module_spec.GetFileSpec() = frame_module_sp->GetPlatformFileSpec();
+      module_spec.GetArchitecture() = frame_module_sp->GetArchitecture();
 
       bool current_frame_flush = false;
       if (DownloadObjectAndSymbolFile(module_spec, result, current_frame_flush))
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index 90aabde2b764f..ebd112110e5f2 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -860,6 +860,9 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton)
   m_collection_sp->AppendProperty(
       "symbols", "Symbol lookup and cache settings.", true,
       ModuleList::GetGlobalModuleListProperties().GetValueProperties());
+  m_collection_sp->AppendProperty(
+      LanguageProperties::GetSettingName(), "Language settings.", true,
+      Language::GetGlobalLanguageProperties().GetValueProperties());
   if (m_command_interpreter_up) {
     m_collection_sp->AppendProperty(
         "interpreter",
diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt
index fe6e539f758fd..c2e091ee8555b 100644
--- a/lldb/source/Host/CMakeLists.txt
+++ b/lldb/source/Host/CMakeLists.txt
@@ -13,6 +13,7 @@ macro(add_host_subdirectory group)
 endmacro()
 
 add_host_subdirectory(common
+  common/Alarm.cpp
   common/FileAction.cpp
   common/FileCache.cpp
   common/File.cpp
diff --git a/lldb/source/Host/common/Alarm.cpp b/lldb/source/Host/common/Alarm.cpp
new file mode 100644
index 0000000000000..245cdc7ae5c2d
--- /dev/null
+++ b/lldb/source/Host/common/Alarm.cpp
@@ -0,0 +1,216 @@
+//===-- Alarm.cpp ---------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/Alarm.h"
+#include "lldb/Host/ThreadLauncher.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+Alarm::Alarm(Duration timeout, bool run_callback_on_exit)
+    : m_timeout(timeout), m_run_callbacks_on_exit(run_callback_on_exit) {
+  StartAlarmThread();
+}
+
+Alarm::~Alarm() { StopAlarmThread(); }
+
+Alarm::Handle Alarm::Create(std::function<void()> callback) {
+  // Gracefully deal with the unlikely event that the alarm thread failed to
+  // launch.
+  if (!AlarmThreadRunning())
+    return INVALID_HANDLE;
+
+  // Compute the next expiration before we take the lock. This ensures that
+  // waiting on the lock doesn't eat into the timeout.
+  const TimePoint expiration = GetNextExpiration();
+
+  Handle handle = INVALID_HANDLE;
+
+  {
+    std::lock_guard<std::mutex> alarm_guard(m_alarm_mutex);
+
+    // Create a new unique entry and remember its handle.
+    m_entries.emplace_back(callback, expiration);
+    handle = m_entries.back().handle;
+
+    // Tell the alarm thread we need to recompute the next alarm.
+    m_recompute_next_alarm = true;
+  }
+
+  m_alarm_cv.notify_one();
+  return handle;
+}
+
+bool Alarm::Restart(Handle handle) {
+  // Gracefully deal with the unlikely event that the alarm thread failed to
+  // launch.
+  if (!AlarmThreadRunning())
+    return false;
+
+  // Compute the next expiration before we take the lock. This ensures that
+  // waiting on the lock doesn't eat into the timeout.
+  const TimePoint expiration = GetNextExpiration();
+
+  {
+    std::lock_guard<std::mutex> alarm_guard(m_alarm_mutex);
+
+    // Find the entry corresponding to the given handle.
+    const auto it =
+        std::find_if(m_entries.begin(), m_entries.end(),
+                     [handle](Entry &entry) { return entry.handle == handle; });
+    if (it == m_entries.end())
+      return false;
+
+    // Update the expiration.
+    it->expiration = expiration;
+
+    // Tell the alarm thread we need to recompute the next alarm.
+    m_recompute_next_alarm = true;
+  }
+
+  m_alarm_cv.notify_one();
+  return true;
+}
+
+bool Alarm::Cancel(Handle handle) {
+  // Gracefully deal with the unlikely event that the alarm thread failed to
+  // launch.
+  if (!AlarmThreadRunning())
+    return false;
+
+  {
+    std::lock_guard<std::mutex> alarm_guard(m_alarm_mutex);
+
+    const auto it =
+        std::find_if(m_entries.begin(), m_entries.end(),
+                     [handle](Entry &entry) { return entry.handle == handle; });
+
+    if (it == m_entries.end())
+      return false;
+
+    m_entries.erase(it);
+  }
+
+  // No need to notify the alarm thread. This only affects the alarm thread if
+  // we removed the entry that corresponds to the next alarm. If that's the
+  // case, the thread will wake up as scheduled, find no expired events, and
+  // recompute the next alarm time.
+  return true;
+}
+
+Alarm::Entry::Entry(Alarm::Callback callback, Alarm::TimePoint expiration)
+    : handle(Alarm::GetNextUniqueHandle()), callback(std::move(callback)),
+      expiration(std::move(expiration)) {}
+
+void Alarm::StartAlarmThread() {
+  if (!m_alarm_thread.IsJoinable()) {
+    llvm::Expected<HostThread> alarm_thread = ThreadLauncher::LaunchThread(
+        "lldb.debugger.alarm-thread", [this] { return AlarmThread(); },
+        8 * 1024 * 1024); // Use larger 8MB stack for this thread
+    if (alarm_thread) {
+      m_alarm_thread = *alarm_thread;
+    } else {
+      LLDB_LOG_ERROR(GetLog(LLDBLog::Host), alarm_thread.takeError(),
+                     "failed to launch host thread: {0}");
+    }
+  }
+}
+
+void Alarm::StopAlarmThread() {
+  if (m_alarm_thread.IsJoinable()) {
+    {
+      std::lock_guard<std::mutex> alarm_guard(m_alarm_mutex);
+      m_exit = true;
+    }
+    m_alarm_cv.notify_one();
+    m_alarm_thread.Join(nullptr);
+  }
+}
+
+bool Alarm::AlarmThreadRunning() { return m_alarm_thread.IsJoinable(); }
+
+lldb::thread_result_t Alarm::AlarmThread() {
+  bool exit = false;
+  std::optional<TimePoint> next_alarm;
+
+  const auto predicate = [this] { return m_exit || m_recompute_next_alarm; };
+
+  while (!exit) {
+    // Synchronization between the main thread and the alarm thread using a
+    // mutex and condition variable. There are 2 reasons the thread can wake up:
+    //
+    // 1. The timeout for the next alarm expired.
+    //
+    // 2. The condition variable is notified that one of our shared variables
+    //    (see predicate) was modified. Either the thread is asked to shut down
+    //    or a new alarm came in and we need to recompute the next timeout.
+    //
+    // Below we only deal with the timeout expiring and fall through for dealing
+    // with the rest.
+    std::unique_lock<std::mutex> alarm_lock(m_alarm_mutex);
+    if (next_alarm) {
+      if (!m_alarm_cv.wait_until(alarm_lock, *next_alarm, predicate)) {
+        // The timeout for the next alarm expired.
+
+        // Clear the next timeout to signal that we need to recompute the next
+        // timeout.
+        next_alarm.reset();
+
+        // Iterate over all the callbacks. Call the ones that have expired
+        // and remove them from the list.
+        const TimePoint now = std::chrono::system_clock::now();
+        auto it = m_entries.begin();
+        while (it != m_entries.end()) {
+          if (it->expiration <= now) {
+            it->callback();
+            it = m_entries.erase(it);
+          } else {
+            it++;
+          }
+        }
+      }
+    } else {
+      m_alarm_cv.wait(alarm_lock, predicate);
+    }
+
+    // Fall through after waiting on the condition variable. At this point
+    // either the predicate is true or we woke up because an alarm expired.
+
+    // The alarm thread is shutting down.
+    if (m_exit) {
+      exit = true;
+      if (m_run_callbacks_on_exit) {
+        for (Entry &entry : m_entries)
+          entry.callback();
+      }
+      continue;
+    }
+
+    // A new alarm was added or an alarm expired. Either way we need to
+    // recompute when this thread should wake up for the next alarm.
+    if (m_recompute_next_alarm || !next_alarm) {
+      for (Entry &entry : m_entries) {
+        if (!next_alarm || entry.expiration < *next_alarm)
+          next_alarm = entry.expiration;
+      }
+      m_recompute_next_alarm = false;
+    }
+  }
+  return {};
+}
+
+Alarm::TimePoint Alarm::GetNextExpiration() const {
+  return std::chrono::system_clock::now() + m_timeout;
+}
+
+Alarm::Handle Alarm::GetNextUniqueHandle() {
+  static std::atomic<Handle> g_next_handle = 1;
+  return g_next_handle++;
+}
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt
index 97fa894ea7376..0c6fdb2b95731 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt
+++ b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt
@@ -13,6 +13,7 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN
   LibCxxMap.cpp
   LibCxxQueue.cpp
   LibCxxRangesRefView.cpp
+  LibCxxSliceArray.cpp
   LibCxxSpan.cpp
   LibCxxTuple.cpp
   LibCxxUnorderedMap.cpp
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
index 675ca38518610..4a536096a066f 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
@@ -755,6 +755,11 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
       lldb_private::formatters::LibcxxStdValarraySyntheticFrontEndCreator,
       "libc++ std::valarray synthetic children",
       "^std::__[[:alnum:]]+::valarray<.+>$", stl_deref_flags, true);
+  AddCXXSynthetic(
+      cpp_category_sp,
+      lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEndCreator,
+      "libc++ std::slice_array synthetic children",
+      "^std::__[[:alnum:]]+::slice_array<.+>$", stl_deref_flags, true);
   AddCXXSynthetic(
       cpp_category_sp,
       lldb_private::formatters::LibcxxStdForwardListSyntheticFrontEndCreator,
@@ -880,6 +885,11 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
                 lldb_private::formatters::LibcxxContainerSummaryProvider,
                 "libc++ std::valarray summary provider",
                 "^std::__[[:alnum:]]+::valarray<.+>$", stl_summary_flags, true);
+  AddCXXSummary(cpp_category_sp,
+                lldb_private::formatters::LibcxxStdSliceArraySummaryProvider,
+                "libc++ std::slice_array summary provider",
+                "^std::__[[:alnum:]]+::slice_array<.+>$", stl_summary_flags,
+                true);
   AddCXXSummary(
       cpp_category_sp, lldb_private::formatters::LibcxxContainerSummaryProvider,
       "libc++ std::list summary provider",
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
index a59f21841ec89..d8b807d180e06 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
@@ -59,6 +59,10 @@ bool LibcxxWStringViewSummaryProvider(
     ValueObject &valobj, Stream &stream,
     const TypeSummaryOptions &options); // libc++ std::wstring_view
 
+bool LibcxxStdSliceArraySummaryProvider(
+    ValueObject &valobj, Stream &stream,
+    const TypeSummaryOptions &options); // libc++ std::slice_array
+
 bool LibcxxSmartPointerSummaryProvider(
     ValueObject &valobj, Stream &stream,
     const TypeSummaryOptions
@@ -223,6 +227,10 @@ SyntheticChildrenFrontEnd *
 LibcxxStdValarraySyntheticFrontEndCreator(CXXSyntheticChildren *,
                                           lldb::ValueObjectSP);
 
+SyntheticChildrenFrontEnd *
+LibcxxStdSliceArraySyntheticFrontEndCreator(CXXSyntheticChildren *,
+                                            lldb::ValueObjectSP);
+
 SyntheticChildrenFrontEnd *
 LibcxxStdListSyntheticFrontEndCreator(CXXSyntheticChildren *,
                                       lldb::ValueObjectSP);
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp
new file mode 100644
index 0000000000000..32e67d2e38c5d
--- /dev/null
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp
@@ -0,0 +1,166 @@
+//===-- LibCxxSliceArray.cpp-----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LibCxx.h"
+
+#include "lldb/Core/ValueObject.h"
+#include "lldb/DataFormatters/FormattersHelpers.h"
+#include <optional>
+
+using namespace lldb;
+using namespace lldb_private;
+using namespace lldb_private::formatters;
+
+namespace lldb_private {
+namespace formatters {
+
+bool LibcxxStdSliceArraySummaryProvider(ValueObject &valobj, Stream &stream,
+                                        const TypeSummaryOptions &options) {
+  ValueObjectSP obj = valobj.GetNonSyntheticValue();
+  if (!obj)
+    return false;
+
+  ValueObjectSP ptr_sp = obj->GetChildMemberWithName("__size_");
+  if (!ptr_sp)
+    return false;
+  const size_t size = ptr_sp->GetValueAsUnsigned(0);
+
+  ptr_sp = obj->GetChildMemberWithName("__stride_");
+  if (!ptr_sp)
+    return false;
+  const size_t stride = ptr_sp->GetValueAsUnsigned(0);
+
+  stream.Printf("stride=%zu size=%zu", stride, size);
+
+  return true;
+}
+
+/// Data formatter for libc++'s std::slice_array.
+///
+/// A slice_array is created by using:
+///   operator[](std::slice slicearr);
+/// and std::slice is created by:
+///   slice(std::size_t start, std::size_t size, std::size_t stride);
+/// The std::slice_array has the following members:
+/// - __vp_ points to std::valarray::__begin_ + @a start
+/// - __size_ is @a size
+/// - __stride_is @a stride
+class LibcxxStdSliceArraySyntheticFrontEnd : public SyntheticChildrenFrontEnd {
+public:
+  LibcxxStdSliceArraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp);
+
+  ~LibcxxStdSliceArraySyntheticFrontEnd() override;
+
+  llvm::Expected<uint32_t> CalculateNumChildren() override;
+
+  lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override;
+
+  lldb::ChildCacheState Update() override;
+
+  bool MightHaveChildren() override;
+
+  size_t GetIndexOfChildWithName(ConstString name) override;
+
+private:
+  /// A non-owning pointer to slice_array.__vp_.
+  ValueObject *m_start = nullptr;
+  /// slice_array.__size_.
+  size_t m_size = 0;
+  /// slice_array.__stride_.
+  size_t m_stride = 0;
+  /// The type of slize_array's template argument T.
+  CompilerType m_element_type;
+  /// The sizeof slize_array's template argument T.
+  uint32_t m_element_size = 0;
+};
+
+} // namespace formatters
+} // namespace lldb_private
+
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::
+    LibcxxStdSliceArraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp)
+    : SyntheticChildrenFrontEnd(*valobj_sp), m_element_type() {
+  if (valobj_sp)
+    Update();
+}
+
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::
+    ~LibcxxStdSliceArraySyntheticFrontEnd() {
+  // these need to stay around because they are child objects who will follow
+  // their parent's life cycle
+  // delete m_start;
+}
+
+llvm::Expected<uint32_t> lldb_private::formatters::
+    LibcxxStdSliceArraySyntheticFrontEnd::CalculateNumChildren() {
+  return m_size;
+}
+
+lldb::ValueObjectSP
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::GetChildAtIndex(
+    uint32_t idx) {
+  if (!m_start)
+    return lldb::ValueObjectSP();
+
+  uint64_t offset = idx * m_stride * m_element_size;
+  offset = offset + m_start->GetValueAsUnsigned(0);
+  StreamString name;
+  name.Printf("[%" PRIu64 "]", (uint64_t)idx);
+  return CreateValueObjectFromAddress(name.GetString(), offset,
+                                      m_backend.GetExecutionContextRef(),
+                                      m_element_type);
+}
+
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::Update() {
+  m_start = nullptr;
+
+  CompilerType type = m_backend.GetCompilerType();
+  if (type.GetNumTemplateArguments() == 0)
+    return ChildCacheState::eRefetch;
+
+  m_element_type = type.GetTypeTemplateArgument(0);
+  if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr))
+    m_element_size = *size;
+
+  if (m_element_size == 0)
+    return ChildCacheState::eRefetch;
+
+  ValueObjectSP start = m_backend.GetChildMemberWithName("__vp_");
+  ValueObjectSP size = m_backend.GetChildMemberWithName("__size_");
+  ValueObjectSP stride = m_backend.GetChildMemberWithName("__stride_");
+
+  if (!start || !size || !stride)
+    return ChildCacheState::eRefetch;
+
+  m_start = start.get();
+  m_size = size->GetValueAsUnsigned(0);
+  m_stride = stride->GetValueAsUnsigned(0);
+
+  return ChildCacheState::eRefetch;
+}
+
+bool lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::
+    MightHaveChildren() {
+  return true;
+}
+
+size_t lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::
+    GetIndexOfChildWithName(ConstString name) {
+  if (!m_start)
+    return std::numeric_limits<size_t>::max();
+  return ExtractIndexFromString(name.GetCString());
+}
+
+lldb_private::SyntheticChildrenFrontEnd *
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEndCreator(
+    CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) {
+  if (!valobj_sp)
+    return nullptr;
+  return new LibcxxStdSliceArraySyntheticFrontEnd(valobj_sp);
+}
diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp
index 3961dcf0fbcc0..7b9938d4f0202 100644
--- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp
+++ b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp
@@ -652,7 +652,7 @@ size_t ProcessMachCore::ReadMemory(addr_t addr, void *buf, size_t size,
                                    Status &error) {
   // Don't allow the caching that lldb_private::Process::ReadMemory does since
   // in core files we have it all cached our our core file anyway.
-  return DoReadMemory(addr, buf, size, error);
+  return DoReadMemory(FixAnyAddress(addr), buf, size, error);
 }
 
 size_t ProcessMachCore::DoReadMemory(addr_t addr, void *buf, size_t size,
diff --git a/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp b/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp
index f7df4650941a8..69595c6c14e4c 100644
--- a/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp
+++ b/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp
@@ -1066,11 +1066,21 @@ bool SymbolLocatorDebugSymbols::DownloadObjectAndSymbolFile(
   command << lookup_arg;
 
   // Log and report progress.
+  std::string lookup_desc;
+  if (uuid_ptr && file_spec_ptr)
+    lookup_desc =
+        llvm::formatv("{0} ({1})", file_spec_ptr->GetFilename().GetString(),
+                      uuid_ptr->GetAsString());
+  else if (uuid_ptr)
+    lookup_desc = uuid_ptr->GetAsString();
+  else if (file_spec_ptr)
+    lookup_desc = file_spec_ptr->GetFilename().GetString();
+
   Log *log = GetLog(LLDBLog::Host);
-  LLDB_LOG(log, "Calling {0} with {1} to find dSYM: {2}", dsymForUUID_exe_path,
-           lookup_arg, command.GetString());
+  LLDB_LOG(log, "Calling {0} for {1} to find dSYM: {2}", dsymForUUID_exe_path,
+           lookup_desc, command.GetString());
 
-  Progress progress("Downloading symbol file", lookup_arg);
+  Progress progress("Downloading symbol file for", lookup_desc);
 
   // Invoke dsymForUUID.
   int exit_status = -1;
diff --git a/lldb/source/Target/Language.cpp b/lldb/source/Target/Language.cpp
index caf3e6636c1de..1542c8cb68ce6 100644
--- a/lldb/source/Target/Language.cpp
+++ b/lldb/source/Target/Language.cpp
@@ -13,6 +13,7 @@
 #include "lldb/Target/Language.h"
 
 #include "lldb/Core/PluginManager.h"
+#include "lldb/Interpreter/OptionValueProperties.h"
 #include "lldb/Symbol/SymbolFile.h"
 #include "lldb/Symbol/TypeList.h"
 #include "lldb/Target/Target.h"
@@ -27,6 +28,35 @@ using namespace lldb_private::formatters;
 typedef std::unique_ptr<Language> LanguageUP;
 typedef std::map<lldb::LanguageType, LanguageUP> LanguagesMap;
 
+#define LLDB_PROPERTIES_language
+#include "TargetProperties.inc"
+
+enum {
+#define LLDB_PROPERTIES_language
+#include "TargetPropertiesEnum.inc"
+};
+
+LanguageProperties &Language::GetGlobalLanguageProperties() {
+  static LanguageProperties g_settings;
+  return g_settings;
+}
+
+llvm::StringRef LanguageProperties::GetSettingName() {
+  static constexpr llvm::StringLiteral g_setting_name("language");
+  return g_setting_name;
+}
+
+LanguageProperties::LanguageProperties() {
+  m_collection_sp = std::make_shared<OptionValueProperties>(GetSettingName());
+  m_collection_sp->Initialize(g_language_properties);
+}
+
+bool LanguageProperties::GetEnableFilterForLineBreakpoints() const {
+  const uint32_t idx = ePropertyEnableFilterForLineBreakpoints;
+  return GetPropertyAtIndexAs<bool>(
+      idx, g_language_properties[idx].default_uint_value != 0);
+}
+
 static LanguagesMap &GetLanguagesMap() {
   static LanguagesMap *g_map = nullptr;
   static llvm::once_flag g_initialize;
diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td
index d2fccdb7b9b39..7f79218e0a6a4 100644
--- a/lldb/source/Target/TargetProperties.td
+++ b/lldb/source/Target/TargetProperties.td
@@ -311,3 +311,9 @@ let Definition = "thread" in {
     DefaultUnsignedValue<600000>,
     Desc<"Maximum number of frames to backtrace.">;
 }
+
+let Definition = "language" in {
+  def EnableFilterForLineBreakpoints: Property<"enable-filter-for-line-breakpoints", "Boolean">,
+    DefaultTrue,
+    Desc<"If true, allow Language plugins to filter locations when setting breakpoints by line number or regex.">;
+}
diff --git a/lldb/test/API/benchmarks/expression/TestExpressionCmd.py b/lldb/test/API/benchmarks/expression/TestExpressionCmd.py
index 9b512305d626b..8261b1b25da9f 100644
--- a/lldb/test/API/benchmarks/expression/TestExpressionCmd.py
+++ b/lldb/test/API/benchmarks/expression/TestExpressionCmd.py
@@ -17,10 +17,7 @@ def setUp(self):
         self.count = 25
 
     @benchmarks_test
-    @expectedFailureAll(
-        oslist=["windows"],
-        bugnumber="llvm.org/pr22274: need a pexpect replacement for windows",
-    )
+    @add_test_categories(["pexpect"])
     def test_expr_cmd(self):
         """Test lldb's expression commands and collect statistics."""
         self.build()
diff --git a/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py b/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py
index 104e69b384237..acc6b74c17b7e 100644
--- a/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py
+++ b/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py
@@ -19,10 +19,7 @@ def setUp(self):
         self.count = 100
 
     @benchmarks_test
-    @expectedFailureAll(
-        oslist=["windows"],
-        bugnumber="llvm.org/pr22274: need a pexpect replacement for windows",
-    )
+    @add_test_categories(["pexpect"])
     def test_compare_lldb_to_gdb(self):
         """Test repeated expressions with lldb vs. gdb."""
         self.build()
diff --git a/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py b/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py
index f3989fc0ff483..e364fb8ce7782 100644
--- a/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py
+++ b/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py
@@ -17,10 +17,7 @@ def setUp(self):
 
     @benchmarks_test
     @no_debug_info_test
-    @expectedFailureAll(
-        oslist=["windows"],
-        bugnumber="llvm.org/pr22274: need a pexpect replacement for windows",
-    )
+    @add_test_categories(["pexpect"])
     def test_startup_delay(self):
         """Test response time for the 'frame variable' command."""
         print()
diff --git a/lldb/test/API/benchmarks/startup/TestStartupDelays.py b/lldb/test/API/benchmarks/startup/TestStartupDelays.py
index f31a10507ed7e..faec21e95e5d2 100644
--- a/lldb/test/API/benchmarks/startup/TestStartupDelays.py
+++ b/lldb/test/API/benchmarks/startup/TestStartupDelays.py
@@ -22,10 +22,7 @@ def setUp(self):
 
     @benchmarks_test
     @no_debug_info_test
-    @expectedFailureAll(
-        oslist=["windows"],
-        bugnumber="llvm.org/pr22274: need a pexpect replacement for windows",
-    )
+    @add_test_categories(["pexpect"])
     def test_startup_delay(self):
         """Test start up delays creating a target, setting a breakpoint, and run to breakpoint stop."""
         print()
diff --git a/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py b/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py
index a3264e3f3253b..d0f9b0d61d17e 100644
--- a/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py
+++ b/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py
@@ -22,10 +22,7 @@ def setUp(self):
 
     @benchmarks_test
     @no_debug_info_test
-    @expectedFailureAll(
-        oslist=["windows"],
-        bugnumber="llvm.org/pr22274: need a pexpect replacement for windows",
-    )
+    @add_test_categories(["pexpect"])
     def test_run_lldb_steppings(self):
         """Test lldb steppings on a large executable."""
         print()
diff --git a/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py b/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py
index 98a2ec9ebf231..91527cd114534 100644
--- a/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py
+++ b/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py
@@ -21,10 +21,7 @@ def setUp(self):
 
     @benchmarks_test
     @no_debug_info_test
-    @expectedFailureAll(
-        oslist=["windows"],
-        bugnumber="llvm.org/pr22274: need a pexpect replacement for windows",
-    )
+    @add_test_categories(["pexpect"])
     def test_run_lldb_then_gdb(self):
         """Benchmark turnaround time with lldb vs. gdb."""
         print()
diff --git a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py
index 040632096c70e..c650b1e3533e0 100644
--- a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py
+++ b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py
@@ -146,3 +146,15 @@ def test_void_result(self):
             self, "// break here", lldb.SBFileSpec("main.c")
         )
         self.expect("dwim-print (void)15", matching=False, patterns=["(?i)error"])
+
+    def test_preserves_persistent_variables(self):
+        """Test dwim-print does not delete persistent variables."""
+        self.build()
+        lldbutil.run_to_source_breakpoint(
+            self, "// break here", lldb.SBFileSpec("main.c")
+        )
+        self.expect("dwim-print int $i = 15")
+        # Run the same expression twice and verify success. This ensures the
+        # first command does not delete the persistent variable.
+        for _ in range(2):
+            self.expect("dwim-print $i", startstr="(int) 15")
diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_options/TestBreakpointOptions.py b/lldb/test/API/functionalities/breakpoint/breakpoint_options/TestBreakpointOptions.py
index 5179ffe730b9a..d262b627195bc 100644
--- a/lldb/test/API/functionalities/breakpoint/breakpoint_options/TestBreakpointOptions.py
+++ b/lldb/test/API/functionalities/breakpoint/breakpoint_options/TestBreakpointOptions.py
@@ -95,8 +95,10 @@ def breakpoint_options_language_test(self):
         self.expect(
             "breakpoint list -v",
             "Verbose breakpoint list contains mangled names",
+            # The demangled function name is system-dependent, e.g.
+            # 'int ns::func(void)' on Windows and 'ns::func()' on Linux.
             substrs=[
-                "function = ns::func",
+                f"function = {function.GetName()}",
                 f"mangled function = {function.GetMangledName()}",
             ],
         )
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
index 7b54b3485d04d..b59b770ed6790 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
@@ -18,6 +18,10 @@ def test_with_run_command(self):
             self, "break here", lldb.SBFileSpec("main.cpp", False)
         )
 
+        #
+        # std::valarray
+        #
+
         self.expect(
             "frame variable va_int",
             substrs=[
@@ -76,3 +80,30 @@ def test_with_run_command(self):
             error=True,
             substrs=['array index 4 is not valid for "(valarray<double>) va_double"'],
         )
+
+        #
+        # std::slice_array
+        #
+
+        self.expect(
+            "frame variable sa",
+            substrs=[
+                "sa = stride=2 size=4",
+                "[0] = 1",
+                "[1] = 3",
+                "[2] = 5",
+                "[3] = 7",
+                "}",
+            ],
+        )
+
+        # check access-by-index
+        self.expect("frame variable sa[0]", substrs=["1"])
+        self.expect("frame variable sa[1]", substrs=["3"])
+        self.expect("frame variable sa[2]", substrs=["5"])
+        self.expect("frame variable sa[3]", substrs=["7"])
+        self.expect(
+            "frame variable sa[4]",
+            error=True,
+            substrs=['array index 4 is not valid for "(slice_array<int>) sa"'],
+        )
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
index f32921e16fa10..1481d8b403292 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
@@ -13,5 +13,8 @@ int main() {
 
   std::valarray<double> va_double({1.0, 0.5, 0.25, 0.125});
 
+  std::valarray<int> va({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
+  std::slice_array<int> sa = va[std::slice(1, 4, 2)];
+
   std::cout << "break here\n";
 }
diff --git a/lldb/test/API/macosx/indirect_symbol/TestIndirectSymbols.py b/lldb/test/API/macosx/indirect_symbol/TestIndirectSymbols.py
index fbe6db9f892d5..c4bbedc928913 100644
--- a/lldb/test/API/macosx/indirect_symbol/TestIndirectSymbols.py
+++ b/lldb/test/API/macosx/indirect_symbol/TestIndirectSymbols.py
@@ -16,6 +16,7 @@ def setUp(self):
 
     @skipUnlessDarwin
     @add_test_categories(["pyapi"])
+    @skipIf(bugnumber="rdar://120796553")
     def test_with_python_api(self):
         """Test stepping and setting breakpoints in indirect and re-exported symbols."""
         self.build()
diff --git a/lldb/test/API/macosx/tbi-honored/Makefile b/lldb/test/API/macosx/tbi-honored/Makefile
new file mode 100644
index 0000000000000..10495940055b6
--- /dev/null
+++ b/lldb/test/API/macosx/tbi-honored/Makefile
@@ -0,0 +1,3 @@
+C_SOURCES := main.c
+
+include Makefile.rules
diff --git a/lldb/test/API/macosx/tbi-honored/TestTBIHonored.py b/lldb/test/API/macosx/tbi-honored/TestTBIHonored.py
new file mode 100644
index 0000000000000..a5c0abd70e5a9
--- /dev/null
+++ b/lldb/test/API/macosx/tbi-honored/TestTBIHonored.py
@@ -0,0 +1,57 @@
+"""Test that lldb on Darwin ignores metadata in the top byte of addresses, both corefile and live."""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class TestTBIHonored(TestBase):
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def do_variable_access_tests(self, frame):
+        self.assertEqual(
+            frame.variables["pb"][0]
+            .GetChildMemberWithName("p")
+            .Dereference()
+            .GetValueAsUnsigned(),
+            15,
+        )
+        addr = frame.variables["pb"][0].GetChildMemberWithName("p").GetValueAsUnsigned()
+        # Confirm that there is metadata in the top byte of our pointer
+        self.assertEqual((addr >> 56) & 0xFF, 0xFE)
+        self.expect("expr -- *pb.p", substrs=["15"])
+        self.expect("frame variable *pb.p", substrs=["15"])
+        self.expect("expr -- *(int*)0x%x" % addr, substrs=["15"])
+
+    # This test is valid on AArch64 systems with TBI mode enabled,
+    # and an address mask that clears the top byte before reading
+    # from memory.
+    @skipUnlessDarwin
+    @skipIf(archs=no_match(["arm64", "arm64e"]))
+    @skipIfRemote
+    def test(self):
+        corefile = self.getBuildArtifact("process.core")
+        self.build()
+        (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+            self, "// break here", lldb.SBFileSpec("main.c")
+        )
+
+        # Test that we can dereference a pointer with TBI data
+        # in a live process.
+        self.do_variable_access_tests(thread.GetFrameAtIndex(0))
+
+        # Create a corefile, delete this process
+        self.runCmd("process save-core -s stack " + corefile)
+        process.Destroy()
+        self.dbg.DeleteTarget(target)
+
+        # Now load the corefile
+        target = self.dbg.CreateTarget("")
+        process = target.LoadCore(corefile)
+        thread = process.GetSelectedThread()
+        self.assertTrue(process.GetSelectedThread().IsValid())
+
+        # Test that we can dereference a pointer with TBI data
+        # in a corefile process.
+        self.do_variable_access_tests(thread.GetFrameAtIndex(0))
diff --git a/lldb/test/API/macosx/tbi-honored/main.c b/lldb/test/API/macosx/tbi-honored/main.c
new file mode 100644
index 0000000000000..3d7ad0b04cd66
--- /dev/null
+++ b/lldb/test/API/macosx/tbi-honored/main.c
@@ -0,0 +1,13 @@
+#include <stdint.h>
+#include <stdio.h>
+union ptrbytes {
+  int *p;
+  uint8_t bytes[8];
+};
+int main() {
+  int c = 15;
+  union ptrbytes pb;
+  pb.p = &c;
+  pb.bytes[7] = 0xfe;
+  printf("%d\n", *pb.p); // break here
+}
diff --git a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
index e5663c50c736e..21aca5fc85d5f 100644
--- a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
+++ b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
@@ -19,7 +19,7 @@ def classCleanup(cls):
         cls.RemoveTempFile("child_send2.txt")
         cls.RemoveTempFile("child_read2.txt")
 
-    @skipIfWindows  # llvm.org/pr22274: need a pexpect replacement for windows
+    @add_test_categories(["pexpect"])
     @no_debug_info_test
     def test_stty_dash_a_before_and_afetr_invoking_lldb_command(self):
         """Test that 'stty -a' displays the same output before and after running the lldb command."""
diff --git a/lldb/unittests/Host/AlarmTest.cpp b/lldb/unittests/Host/AlarmTest.cpp
new file mode 100644
index 0000000000000..9f6ad189dee97
--- /dev/null
+++ b/lldb/unittests/Host/AlarmTest.cpp
@@ -0,0 +1,159 @@
+//===-- AlarmTest.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/Alarm.h"
+#include "gtest/gtest.h"
+
+#include <chrono>
+#include <thread>
+
+using namespace lldb_private;
+using namespace std::chrono_literals;
+
+// Increase the timeout tenfold when running under ASan as it can have about the
+// same performance overhead.
+#if __has_feature(address_sanitizer)
+static constexpr auto TEST_TIMEOUT = 10000ms;
+#else
+static constexpr auto TEST_TIMEOUT = 1000ms;
+#endif
+
+// The time between scheduling a callback and it getting executed. This should
+// NOT be increased under ASan.
+static constexpr auto ALARM_TIMEOUT = 500ms;
+
+// If there are any pending callbacks, make sure they run before the Alarm
+// object is destroyed.
+static constexpr bool RUN_CALLBACKS_ON_EXIT = true;
+
+TEST(AlarmTest, Create) {
+  std::mutex m;
+
+  std::vector<Alarm::TimePoint> callbacks_actual;
+  std::vector<Alarm::TimePoint> callbacks_expected;
+
+  Alarm alarm(ALARM_TIMEOUT, RUN_CALLBACKS_ON_EXIT);
+
+  // Create 5 alarms some time apart.
+  for (size_t i = 0; i < 5; ++i) {
+    callbacks_actual.emplace_back();
+    callbacks_expected.emplace_back(std::chrono::system_clock::now() +
+                                    ALARM_TIMEOUT);
+
+    alarm.Create([&callbacks_actual, &m, i]() {
+      std::lock_guard<std::mutex> guard(m);
+      callbacks_actual[i] = std::chrono::system_clock::now();
+    });
+
+    std::this_thread::sleep_for(ALARM_TIMEOUT / 5);
+  }
+
+  // Leave plenty of time for all the alarms to fire.
+  std::this_thread::sleep_for(TEST_TIMEOUT);
+
+  // Make sure all the alarms fired around the expected time.
+  for (size_t i = 0; i < 5; ++i)
+    EXPECT_GE(callbacks_actual[i], callbacks_expected[i]);
+}
+
+TEST(AlarmTest, Exit) {
+  std::mutex m;
+
+  std::vector<Alarm::Handle> handles;
+  std::vector<bool> callbacks;
+
+  {
+    Alarm alarm(ALARM_TIMEOUT, RUN_CALLBACKS_ON_EXIT);
+
+    // Create 5 alarms.
+    for (size_t i = 0; i < 5; ++i) {
+      callbacks.emplace_back(false);
+
+      handles.push_back(alarm.Create([&callbacks, &m, i]() {
+        std::lock_guard<std::mutex> guard(m);
+        callbacks[i] = true;
+      }));
+    }
+
+    // Let the alarm go out of scope before any alarm had a chance to fire.
+  }
+
+  // Make sure none of the alarms fired.
+  for (bool callback : callbacks)
+    EXPECT_TRUE(callback);
+}
+
+TEST(AlarmTest, Cancel) {
+  std::mutex m;
+
+  std::vector<Alarm::Handle> handles;
+  std::vector<bool> callbacks;
+
+  Alarm alarm(ALARM_TIMEOUT, RUN_CALLBACKS_ON_EXIT);
+
+  // Create 5 alarms.
+  for (size_t i = 0; i < 5; ++i) {
+    callbacks.emplace_back(false);
+
+    handles.push_back(alarm.Create([&callbacks, &m, i]() {
+      std::lock_guard<std::mutex> guard(m);
+      callbacks[i] = true;
+    }));
+  }
+
+  // Make sure we can cancel the first 4 alarms.
+  for (size_t i = 0; i < 4; ++i)
+    EXPECT_TRUE(alarm.Cancel(handles[i]));
+
+  // Leave plenty of time for all the alarms to fire.
+  std::this_thread::sleep_for(TEST_TIMEOUT);
+
+  // Make sure none of the first 4 alarms fired.
+  for (size_t i = 0; i < 4; ++i)
+    EXPECT_FALSE(callbacks[i]);
+
+  // Make sure the fifth alarm still fired.
+  EXPECT_TRUE(callbacks[4]);
+}
+
+TEST(AlarmTest, Restart) {
+  std::mutex m;
+
+  std::vector<Alarm::Handle> handles;
+  std::vector<Alarm::TimePoint> callbacks_actual;
+  std::vector<Alarm::TimePoint> callbacks_expected;
+
+  Alarm alarm(ALARM_TIMEOUT, RUN_CALLBACKS_ON_EXIT);
+
+  // Create 5 alarms some time apart.
+  for (size_t i = 0; i < 5; ++i) {
+    callbacks_actual.emplace_back();
+    callbacks_expected.emplace_back(std::chrono::system_clock::now() +
+                                    ALARM_TIMEOUT);
+
+    handles.push_back(alarm.Create([&callbacks_actual, &m, i]() {
+      std::lock_guard<std::mutex> guard(m);
+      callbacks_actual[i] = std::chrono::system_clock::now();
+    }));
+
+    std::this_thread::sleep_for(ALARM_TIMEOUT / 5);
+  }
+
+  // Update the last 2 alarms.
+  for (size_t i = 3; i < 5; ++i) {
+    callbacks_expected[i] = std::chrono::system_clock::now() + ALARM_TIMEOUT;
+    EXPECT_TRUE(alarm.Restart(handles[i]));
+  }
+
+  // Leave plenty of time for all the alarms to fire.
+  std::this_thread::sleep_for(TEST_TIMEOUT);
+
+  // Make sure all the alarms around the expected time.
+  for (size_t i = 0; i < 5; ++i)
+    EXPECT_GE(callbacks_actual[i], callbacks_expected[i]);
+}
diff --git a/lldb/unittests/Host/CMakeLists.txt b/lldb/unittests/Host/CMakeLists.txt
index c959478970d18..7c09932b39c2a 100644
--- a/lldb/unittests/Host/CMakeLists.txt
+++ b/lldb/unittests/Host/CMakeLists.txt
@@ -1,4 +1,5 @@
 set (FILES
+  AlarmTest.cpp
   ConnectionFileDescriptorTest.cpp
   FileActionTest.cpp
   FileSystemTest.cpp
diff --git a/llvm-spirv/.github/workflows/check-code-style.yml b/llvm-spirv/.github/workflows/check-code-style.yml
index e55c400b3985e..f282adebcc9d6 100644
--- a/llvm-spirv/.github/workflows/check-code-style.yml
+++ b/llvm-spirv/.github/workflows/check-code-style.yml
@@ -32,7 +32,7 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
     - name: Checkout sources
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
       with:
         # In order to gather diff from PR we need to fetch not only the latest
         # commit. Depth of 2 is enough, because GitHub Actions supply us with
diff --git a/llvm-spirv/.github/workflows/check-in-tree-build.yml b/llvm-spirv/.github/workflows/check-in-tree-build.yml
index 3de5748ebd4f6..c938b497f92e4 100644
--- a/llvm-spirv/.github/workflows/check-in-tree-build.yml
+++ b/llvm-spirv/.github/workflows/check-in-tree-build.yml
@@ -61,13 +61,13 @@ jobs:
           # pre-installed. Make sure to override these with the relevant version.
           sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ env.LLVM_VERSION }} 1000
       - name: Checkout LLVM sources
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: llvm/llvm-project
           ref: main
           path: llvm-project
       - name:  Checkout the translator sources
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           path: llvm-project/llvm/projects/SPIRV-LLVM-Translator
       - name:  Get tag for SPIR-V Headers
@@ -75,7 +75,7 @@ jobs:
         run: |
           echo "spirv_headers_tag=$(cat llvm-project/llvm/projects/SPIRV-LLVM-Translator/spirv-headers-tag.conf)" >> $GITHUB_ENV
       - name:  Checkout SPIR-V Headers
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: KhronosGroup/SPIRV-Headers
           ref: ${{ env.spirv_headers_tag }}
@@ -116,13 +116,13 @@ jobs:
     runs-on: windows-latest
     steps:
       - name: Checkout LLVM sources
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: llvm/llvm-project
           ref: main
           path: llvm-project
       - name:  Checkout the translator sources
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           path: llvm-project\\llvm\\projects\\SPIRV-LLVM-Translator
       - name:  Get tag for SPIR-V Headers
@@ -130,7 +130,7 @@ jobs:
         run: |
           echo "spirv_headers_tag=$(type llvm-project\\llvm\\projects\\SPIRV-LLVM-Translator\\spirv-headers-tag.conf)" >> $GITHUB_ENV
       - name:  Checkout SPIR-V Headers
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: KhronosGroup/SPIRV-Headers
           ref: ${{ env.spirv_headers_tag }}
@@ -168,13 +168,13 @@ jobs:
     continue-on-error: true
     steps:
       - name: Checkout LLVM sources
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: llvm/llvm-project
           ref: main
           path: llvm-project
       - name:  Checkout the translator sources
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           path: llvm-project/llvm/projects/SPIRV-LLVM-Translator
       - name:  Get tag for SPIR-V Headers
@@ -182,7 +182,7 @@ jobs:
         run: |
           echo "spirv_headers_tag=$(cat llvm-project/llvm/projects/SPIRV-LLVM-Translator/spirv-headers-tag.conf)" >> $GITHUB_ENV
       - name:  Checkout SPIR-V Headers
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: KhronosGroup/SPIRV-Headers
           ref: ${{ env.spirv_headers_tag }}
diff --git a/llvm-spirv/.github/workflows/check-out-of-tree-build.yml b/llvm-spirv/.github/workflows/check-out-of-tree-build.yml
index 3bea94c1903a3..4652195fe4f07 100644
--- a/llvm-spirv/.github/workflows/check-out-of-tree-build.yml
+++ b/llvm-spirv/.github/workflows/check-out-of-tree-build.yml
@@ -58,7 +58,7 @@ jobs:
           # pre-installed. Make sure to override these with the relevant version.
           sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ env.LLVM_VERSION }} 1000
       - name:  Checkout the translator sources
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           path: SPIRV-LLVM-Translator
       - name:  Get tag for SPIR-V Headers
@@ -66,7 +66,7 @@ jobs:
         run: |
           echo "spirv_headers_tag=$(cat SPIRV-LLVM-Translator/spirv-headers-tag.conf)" >> $GITHUB_ENV
       - name:  Checkout SPIR-V Headers
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: KhronosGroup/SPIRV-Headers
           ref: ${{ env.spirv_headers_tag }}
diff --git a/llvm-spirv/lib/SPIRV/OCLToSPIRV.cpp b/llvm-spirv/lib/SPIRV/OCLToSPIRV.cpp
index 342c3a1e95d7a..6c08c9e017506 100644
--- a/llvm-spirv/lib/SPIRV/OCLToSPIRV.cpp
+++ b/llvm-spirv/lib/SPIRV/OCLToSPIRV.cpp
@@ -52,6 +52,7 @@
 #include "llvm/Support/Debug.h"
 
 #include <algorithm>
+#include <regex>
 #include <set>
 
 using namespace llvm;
@@ -724,6 +725,13 @@ void OCLToSPIRVBase::visitCallBarrier(CallInst *CI) {
 
 void OCLToSPIRVBase::visitCallConvert(CallInst *CI, StringRef MangledName,
                                       StringRef DemangledName) {
+  // OpenCL Explicit Conversions (6.4.3) formed as below for scalars:
+  // destType convert_destType<_sat><_roundingMode>(sourceType)
+  // and for vector type:
+  // destTypeN convert_destTypeN<_sat><_roundingMode>(sourceTypeN)
+  // If the demangled name is not matching the suggested pattern and does not
+  // meet allowed destination type restrictions - this is not an OpenCL builtin,
+  // return from the function and translate such CallInst as a function call.
   if (eraseUselessConvert(CI, MangledName, DemangledName))
     return;
   Op OC = OpNop;
@@ -734,16 +742,56 @@ void OCLToSPIRVBase::visitCallConvert(CallInst *CI, StringRef MangledName,
   if (auto *VecTy = dyn_cast<VectorType>(SrcTy))
     SrcTy = VecTy->getElementType();
   auto IsTargetInt = isa<IntegerType>(TargetTy);
+  auto TargetSigned = DemangledName[8] != 'u';
 
   std::string TargetTyName(
       DemangledName.substr(strlen(kOCLBuiltinName::ConvertPrefix)));
   auto FirstUnderscoreLoc = TargetTyName.find('_');
   if (FirstUnderscoreLoc != std::string::npos)
     TargetTyName = TargetTyName.substr(0, FirstUnderscoreLoc);
+
+  // Validate target type name
+  std::regex Expr("([a-z]+)([0-9]*)$");
+  std::smatch DestTyMatch;
+  if (!std::regex_match(TargetTyName, DestTyMatch, Expr))
+    return;
+
+  // The first sub_match is the whole string; the next
+  // sub_match is the first parenthesized expression.
+  std::string DestTy = DestTyMatch[1].str();
+
+  // check it's valid type name
+  static std::unordered_set<std::string> ValidTypes = {
+      "float",  "double", "half", "char", "uchar", "short",
+      "ushort", "int",    "uint", "long", "ulong"};
+
+  if (ValidTypes.find(DestTy) == ValidTypes.end())
+    return;
+
+  // check that it's allowed vector size
+  std::string VecSize = DestTyMatch[2].str();
+  if (!VecSize.empty()) {
+    int Size = stoi(VecSize);
+    switch (Size) {
+    case 2:
+    case 3:
+    case 4:
+    case 8:
+    case 16:
+      break;
+    default:
+      return;
+    }
+  }
+  DemangledName = DemangledName.drop_front(
+      strlen(kOCLBuiltinName::ConvertPrefix) + TargetTyName.size());
   TargetTyName = std::string("_R") + TargetTyName;
 
+  if (!DemangledName.empty() && !DemangledName.starts_with("_sat") &&
+      !DemangledName.starts_with("_rt"))
+    return;
+
   std::string Sat = DemangledName.find("_sat") != StringRef::npos ? "_sat" : "";
-  auto TargetSigned = DemangledName[8] != 'u';
   if (isa<IntegerType>(SrcTy)) {
     bool Signed = isLastFuncParamSigned(MangledName);
     if (IsTargetInt) {
diff --git a/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp b/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp
index 14f1cf3198a5e..8a977513ceaba 100644
--- a/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp
+++ b/llvm-spirv/lib/SPIRV/PreprocessMetadata.cpp
@@ -168,9 +168,9 @@ void PreprocessMetadataBase::visit(Module *M) {
     // !{void (i32 addrspace(1)*)* @kernel, i32 35, i32 size}
     if (MDNode *ReqdSubgroupSize = Kernel.getMetadata(kSPIR2MD::SubgroupSize)) {
       // A primary named subgroup size is encoded as
-      // the metadata intel_reqd_sub_group_size with value 0.
+      // the metadata intel_reqd_sub_group_size with value -1.
       auto Val = getMDOperandAsInt(ReqdSubgroupSize, 0);
-      if (Val == 0)
+      if (Val == -1U)
         EM.addOp()
             .add(&Kernel)
             .add(spv::internal::ExecutionModeNamedSubgroupSizeINTEL)
diff --git a/llvm-spirv/lib/SPIRV/SPIRVLowerMemmove.cpp b/llvm-spirv/lib/SPIRV/SPIRVLowerMemmove.cpp
index 98e8e81f287e5..54238ebf160b9 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVLowerMemmove.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVLowerMemmove.cpp
@@ -72,9 +72,14 @@ void SPIRVLowerMemmoveBase::LowerMemMoveInst(MemMoveInst &I) {
       ArrayType::get(IntegerType::getInt8Ty(*Context), Length->getZExtValue());
   MaybeAlign SrcAlign = I.getSourceAlign();
 
-  auto *Alloca = Builder.CreateAlloca(AllocaTy);
-  if (SrcAlign.has_value())
-    Alloca->setAlignment(SrcAlign.value());
+  AllocaInst *Alloca;
+  {
+    IRBuilderBase::InsertPointGuard IG(Builder);
+    Builder.SetInsertPointPastAllocas(I.getParent()->getParent());
+    Alloca = Builder.CreateAlloca(AllocaTy);
+    if (SrcAlign.has_value())
+      Alloca->setAlignment(SrcAlign.value());
+  }
 
   // FIXME: Do we need to pass the size of alloca here? From LangRef:
   // > The first argument is a constant integer representing the size of the
diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
index 9cca39e021035..dc14ebe684de4 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
@@ -4184,8 +4184,8 @@ bool SPIRVToLLVM::transMetadata() {
                      ->getLiterals()[0] == 0 &&
              "Invalid named sub group size");
       // On the LLVM IR side, this is represented as the metadata
-      // intel_reqd_sub_group_size with value 0.
-      auto *SizeMD = ConstantAsMetadata::get(getUInt32(M, 0));
+      // intel_reqd_sub_group_size with value -1.
+      auto *SizeMD = ConstantAsMetadata::get(getInt32(M, -1));
       F->setMetadata(kSPIR2MD::SubgroupSize, MDNode::get(*Context, SizeMD));
     }
     // Generate metadata for max_work_group_size
diff --git a/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp b/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp
index a703eb4dfec36..f28159876cc36 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVRegularizeLLVM.cpp
@@ -391,6 +391,70 @@ static void simplifyBuiltinVarAccesses(GlobalValue *GV) {
   }
 }
 
+namespace {
+void regularizeWithOverflowInstrinsics(StringRef MangledName, CallInst *Call,
+                                       Module *M,
+                                       std::vector<Instruction *> &ToErase) {
+  IRBuilder Builder(Call);
+  Function *Builtin = Call->getModule()->getFunction(MangledName);
+  AllocaInst *A;
+  StructType *StructBuiltinTy;
+  if (Builtin) {
+    StructBuiltinTy = cast<StructType>(Builtin->getParamStructRetType(0));
+    {
+      IRBuilderBase::InsertPointGuard Guard(Builder);
+      Builder.SetInsertPointPastAllocas(Call->getParent()->getParent());
+      A = Builder.CreateAlloca(StructBuiltinTy);
+    }
+    CallInst *C = Builder.CreateCall(
+        Builtin, {A, Call->getArgOperand(0), Call->getArgOperand(1)});
+    auto SretAttr = Attribute::get(
+        Builder.getContext(), Attribute::AttrKind::StructRet, StructBuiltinTy);
+    C->addParamAttr(0, SretAttr);
+  } else {
+    StructBuiltinTy = StructType::create(
+        Call->getContext(),
+        {Call->getArgOperand(0)->getType(), Call->getArgOperand(1)->getType()});
+    {
+      IRBuilderBase::InsertPointGuard Guard(Builder);
+      Builder.SetInsertPointPastAllocas(Call->getParent()->getParent());
+      A = Builder.CreateAlloca(StructBuiltinTy);
+    }
+    FunctionType *FT =
+        FunctionType::get(Builder.getVoidTy(),
+                          {A->getType(), Call->getArgOperand(0)->getType(),
+                           Call->getArgOperand(1)->getType()},
+                          false);
+    Builtin =
+        Function::Create(FT, GlobalValue::ExternalLinkage, MangledName, M);
+    Builtin->setCallingConv(CallingConv::SPIR_FUNC);
+    Builtin->addFnAttr(Attribute::NoUnwind);
+    auto SretAttr = Attribute::get(
+        Builder.getContext(), Attribute::AttrKind::StructRet, StructBuiltinTy);
+    Builtin->addParamAttr(0, SretAttr);
+    CallInst *C = Builder.CreateCall(
+        Builtin, {A, Call->getArgOperand(0), Call->getArgOperand(1)});
+    C->addParamAttr(0, SretAttr);
+  }
+  Type *RetTy = Call->getArgOperand(0)->getType();
+  Constant *ConstZero = ConstantInt::get(RetTy, 0);
+  Value *L = Builder.CreateLoad(StructBuiltinTy, A);
+  Value *V0 = Builder.CreateExtractValue(L, {0});
+  Value *V1 = Builder.CreateExtractValue(L, {1});
+  Value *V2 = Builder.CreateICmpNE(V1, ConstZero);
+  Type *StructI32I1Ty =
+      StructType::create(Call->getContext(), {RetTy, V2->getType()});
+  Value *Undef = UndefValue::get(StructI32I1Ty);
+  Value *V3 = Builder.CreateInsertValue(Undef, V0, {0});
+  Value *V4 = Builder.CreateInsertValue(V3, V2, {1});
+  SmallVector<User *> Users(Call->users());
+  for (User *U : Users) {
+    U->replaceUsesOfWith(Call, V4);
+  }
+  ToErase.push_back(Call);
+}
+} // namespace
+
 /// Remove entities not representable by SPIR-V
 bool SPIRVRegularizeLLVMBase::regularize() {
   eraseUselessFunctions(M);
@@ -430,6 +494,23 @@ bool SPIRVRegularizeLLVMBase::regularize() {
               lowerFunnelShift(II);
             else if (II->getIntrinsicID() == Intrinsic::umul_with_overflow)
               lowerUMulWithOverflow(II);
+            else if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow) {
+              BuiltinFuncMangleInfo Info;
+              std::string MangledName =
+                  mangleBuiltin("__spirv_IAddCarry",
+                                {Call->getArgOperand(0)->getType(),
+                                 Call->getArgOperand(1)->getType()},
+                                &Info);
+              regularizeWithOverflowInstrinsics(MangledName, Call, M, ToErase);
+            } else if (II->getIntrinsicID() == Intrinsic::usub_with_overflow) {
+              BuiltinFuncMangleInfo Info;
+              std::string MangledName =
+                  mangleBuiltin("__spirv_ISubBorrow",
+                                {Call->getArgOperand(0)->getType(),
+                                 Call->getArgOperand(1)->getType()},
+                                &Info);
+              regularizeWithOverflowInstrinsics(MangledName, Call, M, ToErase);
+            }
           }
         }
 
diff --git a/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.cpp b/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.cpp
index 508ad75301830..13e0c5521fb79 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVToLLVMDbgTran.cpp
@@ -1569,11 +1569,12 @@ SPIRVToLLVMDbgTran::transDebugIntrinsic(const SPIRVExtInst *DebugInst,
           AI, LocalVar.first, GetExpression(Ops[ExpressionIdx]),
           LocalVar.second, BB);
       AI->eraseFromParent();
-      return cast<Instruction *>(DbgDeclare);
+      return DbgDeclare.get<Instruction *>();
     }
-    return cast<Instruction *>(getDIBuilder(DebugInst).insertDeclare(
-        GetValue(Ops[VariableIdx]), LocalVar.first,
-        GetExpression(Ops[ExpressionIdx]), LocalVar.second, BB));
+    return getDIBuilder(DebugInst)
+        .insertDeclare(GetValue(Ops[VariableIdx]), LocalVar.first,
+                       GetExpression(Ops[ExpressionIdx]), LocalVar.second, BB)
+        .get<Instruction *>();
   }
   case SPIRVDebug::Value: {
     using namespace SPIRVDebug::Operand::DebugValue;
@@ -1589,10 +1590,10 @@ SPIRVToLLVMDbgTran::transDebugIntrinsic(const SPIRVExtInst *DebugInst,
     }
     if (!MDs.empty()) {
       DIArgList *AL = DIArgList::get(M->getContext(), MDs);
-      cast<DbgVariableIntrinsic>(cast<Instruction *>(DbgValIntr))
-                                               ->setRawLocation(AL);
+      cast<DbgVariableIntrinsic>(DbgValIntr.get<Instruction *>())
+          ->setRawLocation(AL);
     }
-    return cast<Instruction *>(DbgValIntr);
+    return DbgValIntr.get<Instruction *>();
   }
   default:
     llvm_unreachable("Unknown debug intrinsic!");
diff --git a/llvm-spirv/lib/SPIRV/SPIRVTypeScavenger.cpp b/llvm-spirv/lib/SPIRV/SPIRVTypeScavenger.cpp
index cf75eb8d1d0be..6116658036520 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVTypeScavenger.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVTypeScavenger.cpp
@@ -617,6 +617,12 @@ void SPIRVTypeScavenger::typeGlobalValue(GlobalValue &GV, Constant *Init) {
       auto It = DeducedTypes.find(C);
       if (It != DeducedTypes.end())
         return It->second;
+    } else if (auto *GEP = dyn_cast<GEPOperator>(C)) {
+      auto *ResultTy =
+          TypedPointerType::get(GEP->getResultElementType(),
+                                GEP->getType()->getPointerAddressSpace());
+      DeducedTypes[C] = ResultTy;
+      return ResultTy;
     }
 
     return getUnknownTyped(C->getType());
diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
index a71e4aef89847..c8222d7985760 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
@@ -1421,6 +1421,15 @@ SPIRVValue *LLVMToSPIRVBase::transConstant(Value *V) {
   }
 
   if (auto *ConstUE = dyn_cast<ConstantExpr>(V)) {
+    if (auto *GEP = dyn_cast<GEPOperator>(ConstUE)) {
+      std::vector<SPIRVValue *> Indices;
+      for (unsigned I = 0, E = GEP->getNumIndices(); I != E; ++I)
+        Indices.push_back(transValue(GEP->getOperand(I + 1), nullptr));
+      auto *TransPointerOperand = transValue(GEP->getPointerOperand(), nullptr);
+      SPIRVType *TranslatedTy = transScavengedType(GEP);
+      return BM->addPtrAccessChainInst(TranslatedTy, TransPointerOperand,
+                                       Indices, nullptr, GEP->isInBounds());
+    }
     auto *Inst = ConstUE->getAsInstruction();
     SPIRVDBG(dbgs() << "ConstantExpr: " << *ConstUE << '\n';
              dbgs() << "Instruction: " << *Inst << '\n';)
@@ -2271,6 +2280,29 @@ LLVMToSPIRVBase::transValueWithoutDecoration(Value *V, SPIRVBasicBlock *BB,
   if (AllocaInst *Alc = dyn_cast<AllocaInst>(V)) {
     SPIRVType *TranslatedTy = transScavengedType(V);
     if (Alc->isArrayAllocation()) {
+      SPIRVValue *Length = transValue(Alc->getArraySize(), BB);
+      assert(Length && "Couldn't translate array size!");
+
+      if (isSpecConstantOpCode(Length->getOpCode())) {
+        // SPIR-V arrays length can be expressed using a specialization
+        // constant.
+        //
+        // Spec Constant Length Arrays need special treatment, as the allocation
+        // type will be 'OpTypePointer(Function, OpTypeArray(ElementType,
+        // Length))', we need to bitcast the obtained pointer to the expected
+        // type: 'OpTypePointer(Function, ElementType).
+        SPIRVType *AllocationType = BM->addPointerType(
+            StorageClassFunction,
+            BM->addArrayType(transType(Alc->getAllocatedType()), Length));
+        SPIRVValue *Arr = BM->addVariable(
+            AllocationType, false, spv::internal::LinkageTypeInternal, nullptr,
+            Alc->getName().str() + "_alloca", StorageClassFunction, BB);
+        // Manually set alignment. OpBitcast created below will be decorated as
+        // that's the SPIR-V value mapped to the original LLVM one.
+        transAlign(Alc, Arr);
+        return mapValue(V, BM->addUnaryInst(OpBitcast, TranslatedTy, Arr, BB));
+      }
+
       if (!BM->checkExtension(ExtensionID::SPV_INTEL_variable_length_array,
                               SPIRVEC_InvalidInstruction,
                               toString(Alc) +
@@ -2278,8 +2310,6 @@ LLVMToSPIRVBase::transValueWithoutDecoration(Value *V, SPIRVBasicBlock *BB,
                                   "SPV_INTEL_variable_length_array extension."))
         return nullptr;
 
-      SPIRVValue *Length = transValue(Alc->getArraySize(), BB);
-      assert(Length && "Couldn't translate array size!");
       return mapValue(V,
                       BM->addInstTemplate(OpVariableLengthArrayINTEL,
                                           {Length->getId()}, BB, TranslatedTy));
@@ -6116,7 +6146,7 @@ LLVMToSPIRVBase::transBuiltinToInstWithoutDecoration(Op OC, CallInst *CI,
     OC = OpAtomicCompareExchange;
   if (isGroupOpCode(OC))
     BM->addCapability(CapabilityGroups);
-  switch (OC) {
+  switch (static_cast<size_t>(OC)) {
   case OpControlBarrier: {
     auto BArgs = transValue(getArguments(CI), BB);
     return BM->addControlBarrierInst(BArgs[0], BArgs[1], BArgs[2], BB);
@@ -6421,6 +6451,25 @@ LLVMToSPIRVBase::transBuiltinToInstWithoutDecoration(Op OC, CallInst *CI,
     return BM->addStoreInst(transValue(CI->getArgOperand(0), BB), APIntInst, {},
                             BB);
   }
+  case internal::OpTaskSequenceGetINTEL: {
+    Type *ResTy = nullptr;
+    auto OpItr = CI->value_op_begin();
+
+    if (CI->hasStructRetAttr()) {
+      assert(CI->getType()->isVoidTy() && "Return type is not void");
+      ResTy = CI->getParamStructRetType(0);
+      OpItr++;
+    }
+
+    SPIRVType *RetTy = ResTy ? transType(ResTy) : transScavengedType(CI);
+    auto *TaskSeqGet =
+        BM->addTaskSequenceGetINTELInst(RetTy, transValue(*OpItr++, BB), BB);
+
+    if (!CI->hasStructRetAttr())
+      return TaskSeqGet;
+    return BM->addStoreInst(transValue(CI->getArgOperand(0), BB), TaskSeqGet,
+                            {}, BB);
+  }
   case OpLoad: {
     std::vector<SPIRVWord> MemoryAccess;
     assert(CI->arg_size() > 0 && "Expected at least 1 operand for OpLoad call");
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVBasicBlock.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVBasicBlock.cpp
index 1f0a59fab207e..d976025a356a7 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVBasicBlock.cpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVBasicBlock.cpp
@@ -87,6 +87,20 @@ void SPIRVBasicBlock::encodeChildren(spv_ostream &O) const {
 
 _SPIRV_IMP_ENCDEC1(SPIRVBasicBlock, Id)
 
+SPIRVInstruction *SPIRVBasicBlock::getVariableInsertionPoint() const {
+  auto IP =
+      std::find_if(InstVec.begin(), InstVec.end(), [](SPIRVInstruction *Inst) {
+        return !(isa<OpVariable>(Inst) || isa<OpLine>(Inst) ||
+                 isa<OpNoLine>(Inst) ||
+                 // Note: OpVariable and OpPhi instructions do not belong to the
+                 // same block in a valid SPIR-V module.
+                 isa<OpPhi>(Inst));
+      });
+  if (IP == InstVec.end())
+    return nullptr;
+  return *IP;
+}
+
 void SPIRVBasicBlock::setScope(SPIRVEntry *Scope) {
   assert(Scope && Scope->getOpCode() == OpFunction && "Invalid scope");
   setParent(static_cast<SPIRVFunction *>(Scope));
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVBasicBlock.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVBasicBlock.h
index 0eb0b673c27eb..3cf00c7373974 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVBasicBlock.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVBasicBlock.h
@@ -77,6 +77,10 @@ class SPIRVBasicBlock : public SPIRVValue {
   const SPIRVInstruction *getTerminateInstr() const {
     return InstVec.empty() ? nullptr : InstVec.back();
   }
+  // OpVariable instructions must be the first instructions in the block,
+  // intermixed with OpLine and OpNoLine instructions. Return first instruction
+  // not being an OpVariable, OpLine or OpNoLine.
+  SPIRVInstruction *getVariableInsertionPoint() const;
 
   void setScope(SPIRVEntry *Scope) override;
   void setParent(SPIRVFunction *F) { ParentF = F; }
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index 3a2fec68606ae..68221868bab05 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -443,8 +443,8 @@ class SPIRVVariable : public SPIRVInstruction {
                 const std::string &TheName,
                 SPIRVStorageClassKind TheStorageClass, SPIRVBasicBlock *TheBB,
                 SPIRVModule *TheM)
-      : SPIRVInstruction(TheInitializer ? 5 : 4, OpVariable, TheType, TheId,
-                         TheBB, TheM),
+      : SPIRVInstruction(TheInitializer && !TheInitializer->isUndef() ? 5 : 4,
+                         OpVariable, TheType, TheId, TheBB, TheM),
         StorageClass(TheStorageClass) {
     if (TheInitializer && !TheInitializer->isUndef())
       Initializer.push_back(TheInitializer->getId());
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
index 8d17b14922db9..0d0101e8119fc 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
@@ -245,7 +245,7 @@ class SPIRVModuleImpl : public SPIRVModule {
 
   // Type creation functions
   template <class T> T *addType(T *Ty);
-  SPIRVTypeArray *addArrayType(SPIRVType *, SPIRVConstant *) override;
+  SPIRVTypeArray *addArrayType(SPIRVType *, SPIRVValue *) override;
   SPIRVTypeBool *addBoolType() override;
   SPIRVTypeFloat *addFloatType(unsigned BitWidth) override;
   SPIRVTypeFunction *addFunctionType(SPIRVType *,
@@ -269,6 +269,8 @@ class SPIRVModuleImpl : public SPIRVModule {
   SPIRVTypeCooperativeMatrixKHR *
   addCooperativeMatrixKHRType(SPIRVType *, std::vector<SPIRVValue *>) override;
   SPIRVTypeTaskSequenceINTEL *addTaskSequenceINTELType() override;
+  SPIRVInstruction *addTaskSequenceGetINTELInst(SPIRVType *, SPIRVValue *,
+                                                SPIRVBasicBlock *) override;
   SPIRVType *addOpaqueGenericType(Op) override;
   SPIRVTypeDeviceEvent *addDeviceEventType() override;
   SPIRVTypeQueue *addQueueType() override;
@@ -968,7 +970,7 @@ SPIRVTypeVoid *SPIRVModuleImpl::addVoidType() {
 }
 
 SPIRVTypeArray *SPIRVModuleImpl::addArrayType(SPIRVType *ElementType,
-                                              SPIRVConstant *Length) {
+                                              SPIRVValue *Length) {
   return addType(new SPIRVTypeArray(this, getId(), ElementType, Length));
 }
 
@@ -1053,6 +1055,14 @@ SPIRVTypeTaskSequenceINTEL *SPIRVModuleImpl::addTaskSequenceINTELType() {
   return addType(new SPIRVTypeTaskSequenceINTEL(this, getId()));
 }
 
+SPIRVInstruction *SPIRVModuleImpl::addTaskSequenceGetINTELInst(
+    SPIRVType *RetTy, SPIRVValue *ObjPtr, SPIRVBasicBlock *BB) {
+  return addInstruction(
+      SPIRVInstTemplateBase::create(internal::OpTaskSequenceGetINTEL, RetTy,
+                                    getId(), getVec(ObjPtr->getId()), BB, this),
+      BB);
+}
+
 SPIRVType *SPIRVModuleImpl::addOpaqueGenericType(Op TheOpCode) {
   return addType(new SPIRVTypeOpaqueGeneric(TheOpCode, this, getId()));
 }
@@ -1828,7 +1838,7 @@ SPIRVInstruction *SPIRVModuleImpl::addVariable(
   SPIRVVariable *Variable = new SPIRVVariable(Type, getId(), Initializer, Name,
                                               StorageClass, BB, this);
   if (BB)
-    return addInstruction(Variable, BB);
+    return addInstruction(Variable, BB, BB->getVariableInsertionPoint());
 
   add(Variable);
   if (LinkageTy != internal::LinkageTypeInternal)
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h
index 4ec78631fce91..52083a263c514 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h
@@ -240,7 +240,7 @@ class SPIRVModule {
   virtual void eraseInstruction(SPIRVInstruction *, SPIRVBasicBlock *) = 0;
 
   // Type creation functions
-  virtual SPIRVTypeArray *addArrayType(SPIRVType *, SPIRVConstant *) = 0;
+  virtual SPIRVTypeArray *addArrayType(SPIRVType *, SPIRVValue *) = 0;
   virtual SPIRVTypeBool *addBoolType() = 0;
   virtual SPIRVTypeFloat *addFloatType(unsigned) = 0;
   virtual SPIRVTypeFunction *
@@ -266,6 +266,8 @@ class SPIRVModule {
   virtual SPIRVTypeCooperativeMatrixKHR *
   addCooperativeMatrixKHRType(SPIRVType *, std::vector<SPIRVValue *>) = 0;
   virtual SPIRVTypeTaskSequenceINTEL *addTaskSequenceINTELType() = 0;
+  virtual SPIRVInstruction *
+  addTaskSequenceGetINTELInst(SPIRVType *, SPIRVValue *, SPIRVBasicBlock *) = 0;
   virtual SPIRVTypeVoid *addVoidType() = 0;
   virtual SPIRVType *addOpaqueGenericType(Op) = 0;
   virtual SPIRVTypeDeviceEvent *addDeviceEventType() = 0;
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.cpp
index 674003b0d748f..6f634d284c5c6 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.cpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.cpp
@@ -56,7 +56,7 @@ uint64_t SPIRVType::getArrayLength() const {
   const SPIRVTypeArray *AsArray = static_cast<const SPIRVTypeArray *>(this);
   assert(AsArray->getLength()->getOpCode() == OpConstant &&
          "getArrayLength can only be called with constant array lengths");
-  return AsArray->getLength()->getZExtIntValue();
+  return static_cast<SPIRVConstant *>(AsArray->getLength())->getZExtIntValue();
 }
 
 SPIRVWord SPIRVType::getBitWidth() const {
@@ -263,7 +263,7 @@ void SPIRVTypeStruct::setPacked(bool Packed) {
 }
 
 SPIRVTypeArray::SPIRVTypeArray(SPIRVModule *M, SPIRVId TheId,
-                               SPIRVType *TheElemType, SPIRVConstant *TheLength)
+                               SPIRVType *TheElemType, SPIRVValue *TheLength)
     : SPIRVType(M, 4, OpTypeArray, TheId), ElemType(TheElemType),
       Length(TheLength->getId()) {
   validate();
@@ -273,11 +273,10 @@ void SPIRVTypeArray::validate() const {
   SPIRVEntry::validate();
   ElemType->validate();
   assert(getValue(Length)->getType()->isTypeInt());
+  assert(isConstantOpCode(getValue(Length)->getOpCode()));
 }
 
-SPIRVConstant *SPIRVTypeArray::getLength() const {
-  return get<SPIRVConstant>(Length);
-}
+SPIRVValue *SPIRVTypeArray::getLength() const { return getValue(Length); }
 
 _SPIRV_IMP_ENCDEC3(SPIRVTypeArray, Id, ElemType, Length)
 
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h
index 894d7b339f993..086fe1a6756ae 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h
@@ -390,13 +390,13 @@ class SPIRVTypeArray : public SPIRVType {
 public:
   // Complete constructor
   SPIRVTypeArray(SPIRVModule *M, SPIRVId TheId, SPIRVType *TheElemType,
-                 SPIRVConstant *TheLength);
+                 SPIRVValue *TheLength);
   // Incomplete constructor
   SPIRVTypeArray()
       : SPIRVType(OpTypeArray), ElemType(nullptr), Length(SPIRVID_INVALID) {}
 
   SPIRVType *getElementType() const { return ElemType; }
-  SPIRVConstant *getLength() const;
+  SPIRVValue *getLength() const;
   SPIRVCapVec getRequiredCapability() const override {
     return getElementType()->getRequiredCapability();
   }
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVValue.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVValue.h
index fb80bd51708dc..bd5336567ce9a 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVValue.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVValue.h
@@ -111,8 +111,10 @@ class SPIRVValue : public SPIRVEntry {
 
   void setType(SPIRVType *Ty) {
     Type = Ty;
-    assert(!Ty || !Ty->isTypeVoid() || OpCode == OpFunction);
-    if (Ty && (!Ty->isTypeVoid() || OpCode == OpFunction))
+    assert(!Ty || !Ty->isTypeVoid() || OpCode == OpFunction ||
+           OpCode == internal::OpTaskSequenceGetINTEL);
+    if (Ty && (!Ty->isTypeVoid() || OpCode == OpFunction ||
+               OpCode == internal::OpTaskSequenceGetINTEL))
       setHasType();
     else
       setHasNoType();
diff --git a/llvm-spirv/test/SpecConstants/spec-constant-length-array.ll b/llvm-spirv/test/SpecConstants/spec-constant-length-array.ll
new file mode 100644
index 0000000000000..a88b2b8cb0d92
--- /dev/null
+++ b/llvm-spirv/test/SpecConstants/spec-constant-length-array.ll
@@ -0,0 +1,77 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv -spirv-text -o - %t.bc | FileCheck %s --check-prefix CHECK-SPV
+; RUN: llvm-spirv -o %t.spv %t.bc
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv -r -o - %t.spv | llvm-dis | FileCheck %s --check-prefix CHECK-LLVM
+
+; CHECK-SPV-DAG: Decorate [[#I64_CONST:]] SpecId [[#]]
+; CHECK-SPV-DAG: Decorate [[#I32_CONST:]] SpecId [[#]]
+; CHECK-SPV-DAG: Decorate [[#I8_CONST:]] SpecId [[#]]
+; CHECK-SPV-DAG: Decorate [[#SCLA_0:]] Alignment 4
+; CHECK-SPV-DAG: Decorate [[#SCLA_1:]] Alignment 2
+; CHECK-SPV-DAG: Decorate [[#SCLA_2:]] Alignment 16
+
+; CHECK-SPV-DAG: TypeInt [[#I64_TY:]] 64
+; CHECK-SPV-DAG: TypeInt [[#I32_TY:]] 32
+; CHECK-SPV-DAG: TypeInt [[#I8_TY:]] 8
+
+; CHECK-SPV-DAG: SpecConstant [[#I64_TY]] [[#LENGTH_0:]]
+; CHECK-SPV-DAG: SpecConstant [[#I32_TY]] [[#LENGTH_1:]]
+; CHECK-SPV-DAG: SpecConstant [[#I8_TY]] [[#LENGTH_2:]]
+
+; CHECK-SPV-DAG: TypeFloat [[#FLOAT_TY:]] 32
+; CHECK-SPV-DAG: TypePointer [[#FLOAT_PTR_TY:]] [[#FUNCTION_SC:]] [[#FLOAT_TY]]
+; CHECK-SPV-DAG: TypeArray [[#ARR_TY_0:]] [[#FLOAT_TY]] [[#LENGTH_0]]
+; CHECK-SPV-DAG: TypePointer [[#ARR_PTR_TY_0:]] [[#FUNCTION_SC]] [[#ARR_TY_0]]
+; CHECK-SPV-DAG: TypePointer [[#I8_PTR_TY:]] [[#FUNCTION_SC]] [[#I8_TY]]
+; CHECK-SPV-DAG: TypeArray [[#ARR_TY_1:]] [[#I8_TY]] [[#LENGTH_1]]
+; CHECK-SPV-DAG: TypePointer [[#ARR_PTR_TY_1:]] [[#FUNCTION_SC]] [[#ARR_TY_1]]
+; CHECK-SPV-DAG: TypeFloat [[#DOUBLE_TY:]] 64
+; CHECK-SPV-DAG: TypeStruct [[#STR_TY:]] [[#DOUBLE_TY]] [[#DOUBLE_TY]]
+; CHECK-SPV-DAG: TypePointer [[#STR_PTR_TY:]] [[#FUNCTION_SC]] [[#STR_TY]]
+; CHECK-SPV-DAG: TypeArray [[#ARR_TY_2:]] [[#STR_TY]] [[#LENGTH_2]]
+; CHECK-SPV-DAG: TypePointer [[#ARR_PTR_TY_2:]] [[#FUNCTION_SC]] [[#ARR_TY_2:]]
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+%struct_type = type { double, double }
+
+define spir_kernel void @test() {
+ entry:
+  %length0 = call i64 @_Z20__spirv_SpecConstantix(i32 0, i64 1), !SYCL_SPEC_CONST_SYM_ID !0
+  %length1 = call i32 @_Z20__spirv_SpecConstantii(i32 1, i32 2), !SYCL_SPEC_CONST_SYM_ID !1
+  %length2 = call i8 @_Z20__spirv_SpecConstantic(i32 2, i8 4), !SYCL_SPEC_CONST_SYM_ID !2
+
+  ; CHECK-SPV: Variable [[#ARR_PTR_TY_0]] [[#SCLA_0]] [[#FUNCTION_SC]]
+  ; CHECK-SPV: Variable [[#ARR_PTR_TY_1]] [[#SCLA_1]] [[#FUNCTION_SC]]
+  ; CHECK-SPV: Variable [[#ARR_PTR_TY_2]] [[#SCLA_2]] [[#FUNCTION_SC]]
+
+  ; CHECK-LLVM: %[[ALLOCA0:.*]] = alloca [1 x float], align 4
+  ; CHECK-LLVM: %[[ALLOCA1:.*]] = alloca [2 x i8], align 2
+  ; CHECK-LLVM: %[[ALLOCA2:.*]] = alloca [4 x %struct_type], align 16
+
+  ; CHECK-SPV: Bitcast [[#FLOAT_PTR_TY]] [[#]] [[#SCLA_0]]
+
+  ; CHECK-LLVM: %[[VAR0:.*]] = bitcast ptr %[[ALLOCA0]] to ptr
+  %scla0 = alloca float, i64 %length0, align 4
+
+  ; CHECK-SPV: Bitcast [[#I8_PTR_TY]] [[#]] [[#SCLA_1]]
+
+  ; CHECK-LLVM: %[[VAR1:.*]] = bitcast ptr %[[ALLOCA1]] to ptr
+  %scla1 = alloca i8, i32 %length1, align 2
+
+  ; CHECK-SPV: Bitcast [[#STR_PTR_TY]] [[#]] [[#SCLA_2]]
+
+  ; CHECK-LLVM: %[[VAR2:.*]] = bitcast ptr %[[ALLOCA2]] to ptr
+  %scla2 = alloca %struct_type, i8 %length2, align 16
+  ret void
+}
+
+declare i8 @_Z20__spirv_SpecConstantic(i32, i8)
+declare i32 @_Z20__spirv_SpecConstantii(i32, i32)
+declare i64 @_Z20__spirv_SpecConstantix(i32, i64)
+
+!0 = !{!"i64_spec_const", i32 0}
+!1 = !{!"i32_spec_const", i32 1}
+!2 = !{!"i8_spec_const", i32 2}
diff --git a/llvm-spirv/test/constexpr_vector.ll b/llvm-spirv/test/constexpr_vector.ll
index 706bbbfb86af7..f918f3f05fc73 100644
--- a/llvm-spirv/test/constexpr_vector.ll
+++ b/llvm-spirv/test/constexpr_vector.ll
@@ -73,8 +73,8 @@ entry:
 
 ; CHECK-LLVM: define spir_func void @vadd()
 ; CHECK-LLVM: %Funcs = alloca <16 x i8>, align 16
-; CHECK-LLVM: store <16 x i8> <i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 0), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 1), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 2), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 3), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 4), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 5), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 6), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 7), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 0), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 1), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 2), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 3), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 4), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 5), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 6), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 7)>, ptr %Funcs, align 16
 ; CHECK-LLVM: %Funcs1 = alloca <2 x i64>, align 16
+; CHECK-LLVM: store <16 x i8> <i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 0), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 1), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 2), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 3), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 4), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 5), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 6), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 7), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 0), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 1), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 2), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 3), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 4), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 5), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 6), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 7)>, ptr %Funcs, align 16
 ; CHECK-LLVM: store <2 x i64> <i64 ptrtoint (ptr @_Z2f1u2CMvb32_j to i64), i64 ptrtoint (ptr @_Z2f2u2CMvb32_j to i64)>, ptr %Funcs1, align 16
 ; Function Attrs: noinline nounwind
 define dllexport void @vadd() {
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_subgroup_requirements/named_sub_group_sizes.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_subgroup_requirements/named_sub_group_sizes.ll
index 159271012d0b5..74cad6c95a00a 100644
--- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_subgroup_requirements/named_sub_group_sizes.ll
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_subgroup_requirements/named_sub_group_sizes.ll
@@ -19,12 +19,15 @@
 ; CHECK-SPIRV: ExecutionMode [[kernel]] 6446 0
 
 ; CHECK-LLVM: spir_kernel void @_ZTSZ4mainE7Kernel1() {{.*}} !intel_reqd_sub_group_size ![[MD:[0-9]+]]
-; CHECK-LLVM: ![[MD]] = !{i32 0}
+; CHECK-LLVM: ![[MD]] = !{i32 -1}
 
+; Check that with no SPV_INTEL_subgroup_requirements,
+; (1) No SubgroupRequirementsINTEL Capability nor SPV_INTEL_subgroup_requirements extension is attached
+; (2) the SubgroupSize (=35) ExecutionMode is attached with the value original value from the LLVM IR 
 ; CHECK-SPIRV-2-NOT: Capability SubgroupRequirementsINTEL
 ; CHECK-SPIRV-2-NOT: Extension "SPV_INTEL_subgroup_requirements"
 ; CHECK-SPIRV-2: EntryPoint 6 [[kernel:[0-9]+]] "_ZTSZ4mainE7Kernel1"
-; CHECK-SPIRV-2: ExecutionMode [[kernel]] 35 0
+; CHECK-SPIRV-2: ExecutionMode [[kernel]] 35 4294967295
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
 target triple = "spir64-unknown-unknown"
@@ -51,4 +54,4 @@ attributes #0 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trap
 !4 = !{!"clang version 18.0.0git (/ws/llvm/clang 8fd29b3c2aa9f9ce163be557b51de39c95aaf230)"}
 !5 = !{i32 358}
 !6 = !{}
-!7 = !{i32 0}
+!7 = !{i32 -1}
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_task_sequence/sret-get.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_task_sequence/sret-get.ll
new file mode 100644
index 0000000000000..af966d72b4584
--- /dev/null
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_task_sequence/sret-get.ll
@@ -0,0 +1,90 @@
+; Test translation of __spirv_TaskSequenceGetINTEL with a sret (structure return) parameter.
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_task_sequence -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+
+; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+; RUN: llvm-dis %t.rev.bc
+; RUN: FileCheck < %t.rev.ll %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV: TypeTaskSequenceINTEL [[#TypeTS:]]
+; CHECK-SPIRV: TypeStruct [[#TypeStruct:]]
+
+; CHECK-SPIRV: TaskSequenceCreateINTEL [[#TypeTS]] [[#TSCreateId:]]
+; CHECK-SPIRV: TaskSequenceGetINTEL [[#TypeStruct]] [[#Get:]] [[#TSCreateId]]
+; CHECK-SPIRV: Store [[#]] [[#Get]]
+
+; CHECK-LLVM: %struct.FunctionPacket = type <{ float, i8, [3 x i8] }>
+
+; CHECK-LLVM: %[[TSCreate:[a-z0-9.]+]] = call spir_func target("spirv.TaskSequenceINTEL") @_Z66__spirv_TaskSequenceCreateINTEL_RPU3AS125__spirv_TaskSequenceINTELPiiiii(ptr @_Z4multii, i32 -1, i32 -1, i32 1, i32 32)
+; CHECK-LLVM: %[[Var:[a-z0-9.]+]] = alloca %struct.FunctionPacket
+; CHECK-LLVM: %[[Ptr:[a-z0-9.]+]] = addrspacecast ptr %[[Var]] to ptr addrspace(4)
+; CHECK-LLVM: call spir_func void @_Z28__spirv_TaskSequenceGetINTEL{{.*}}(ptr addrspace(4) sret(%struct.FunctionPacket) %[[Ptr]], target("spirv.TaskSequenceINTEL") %[[TSCreate]])
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+$_ZTS8MyKernel = comdat any
+
+%struct.FunctionPacket = type <{ float, i8, [3 x i8] }>
+
+; Function Attrs: convergent mustprogress norecurse nounwind
+define weak_odr dso_local spir_kernel void @_ZTS8MyKernel(ptr addrspace(1) noundef align 4 %_arg_in, ptr addrspace(1) noundef align 4 %_arg_res) local_unnamed_addr #0 comdat !srcloc !5 !kernel_arg_buffer_location !6 !sycl_fixed_targets !7 !sycl_kernel_omit_args !8 !stall_enable !9 {
+entry:
+  %call.i1 = call spir_func noundef target("spirv.TaskSequenceINTEL") @_Z31__spirv_TaskSequenceCreateINTELIN4sycl3_V13ext5intel12experimental13task_sequenceIL_Z4multiiENS2_6oneapi12experimental10propertiesISt5tupleIJNS7_14property_valueINS4_13pipelined_keyEJSt17integral_constantIiLin1EEEEENSA_INS4_16fpga_cluster_keyEJSC_INS4_25fpga_cluster_options_enumELSG_1EEEEENSA_INS4_12balanced_keyEJEEENSA_INS4_23invocation_capacity_keyEJSC_IjLj10EEEEENSA_INS4_21response_capacity_keyEJSC_IjLj17EEEEEEEEEEEiJiiEEmPT_PFT0_DpT1_Ejjit(ptr noundef nonnull @_Z4multii, i32 noundef -1, i32 noundef -1, i32 noundef 1, i32 noundef 32) #3
+  br label %for.body10.i
+
+for.body10.i:                                     ; preds = %entry, %for.body10.i
+  %i5.0.i9 = phi i32 [ 0, %entry ], [ %inc14.i, %for.body10.i ]
+  %ref.tmp.i = alloca %struct.FunctionPacket, align 4
+  %ref.tmp.ascast.i = addrspacecast ptr %ref.tmp.i to ptr addrspace(4)
+  call spir_func void @_Z28__spirv_TaskSequenceGetINTELI14FunctionPacketET_PN5__spv25__spirv_TaskSequenceINTELE(ptr addrspace(4) dead_on_unwind writable sret(%struct.FunctionPacket) align 4 %ref.tmp.ascast.i, target("spirv.TaskSequenceINTEL") noundef %call.i1) #8
+  %inc14.i = add nuw nsw i32 %i5.0.i9, 1
+  %exitcond.not = icmp eq i32 %inc14.i, 16384
+  br i1 %exitcond.not, label %_ZZ4mainENKUlvE_clEv.exit, label %for.body10.i, !llvm.loop !10
+
+_ZZ4mainENKUlvE_clEv.exit:                        ; preds = %for.body10.i
+  ret void
+}
+
+; Function Attrs: mustprogress norecurse nounwind
+define linkonce_odr dso_local spir_func noundef i32 @_Z4multii(i32 noundef %a, i32 noundef %b) #1 !srcloc !12 {
+entry:
+  %mul = mul nsw i32 %b, %a
+  ret i32 %mul
+}
+
+; Function Attrs: convergent nounwind
+declare dso_local spir_func noundef target("spirv.TaskSequenceINTEL") @_Z31__spirv_TaskSequenceCreateINTELIN4sycl3_V13ext5intel12experimental13task_sequenceIL_Z4multiiENS2_6oneapi12experimental10propertiesISt5tupleIJNS7_14property_valueINS4_13pipelined_keyEJSt17integral_constantIiLin1EEEEENSA_INS4_16fpga_cluster_keyEJSC_INS4_25fpga_cluster_options_enumELSG_1EEEEENSA_INS4_12balanced_keyEJEEENSA_INS4_23invocation_capacity_keyEJSC_IjLj10EEEEENSA_INS4_21response_capacity_keyEJSC_IjLj17EEEEEEEEEEEiJiiEEmPT_PFT0_DpT1_Ejjit(ptr noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef zeroext) local_unnamed_addr #2
+
+; Function Attrs: convergent nounwind
+declare dso_local spir_func void @_Z28__spirv_TaskSequenceGetINTELI14FunctionPacketET_PN5__spv25__spirv_TaskSequenceINTELE(ptr addrspace(4) dead_on_unwind writable sret(%struct.FunctionPacket) align 4, target("spirv.TaskSequenceINTEL") noundef) local_unnamed_addr #2
+
+attributes #0 = { convergent mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-fpga-cluster"="1" "sycl-module-id"="test.cpp" "sycl-optlevel"="2" "sycl-single-task" "uniform-work-group-size"="true" }
+attributes #1 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-optlevel"="2" }
+attributes #2 = { convergent nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #3 = { convergent nounwind }
+
+
+!opencl.spir.version = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0}
+!spirv.Source = !{!1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1}
+!llvm.ident = !{!2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2}
+!llvm.module.flags = !{!3, !4}
+!sycl.specialization-constants = !{}
+!sycl.specialization-constants-default-values = !{}
+
+!0 = !{i32 1, i32 2}
+!1 = !{i32 4, i32 100000}
+!2 = !{!"clang version 18.0.0git (https://github.com/bowenxue-intel/llvm.git bb1121cb47589e94ab65b81971a298b9d2c21ff6)"}
+!3 = !{i32 1, !"wchar_size", i32 4}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{i32 5445863}
+!6 = !{i32 -1, i32 -1}
+!7 = !{}
+!8 = !{i1 false, i1 false}
+!9 = !{i1 true}
+!10 = distinct !{!10, !11}
+!11 = !{!"llvm.loop.mustprogress"}
+!12 = !{i32 5445350}
diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_task_sequence/void-get.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_task_sequence/void-get.ll
new file mode 100644
index 0000000000000..bf3e884dd8cac
--- /dev/null
+++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_task_sequence/void-get.ll
@@ -0,0 +1,80 @@
+; Test translation of __spirv_TaskSequenceGetINTEL with void return type.
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_task_sequence -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+
+; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+; RUN: llvm-dis %t.rev.bc
+; RUN: FileCheck < %t.rev.ll %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV: TypeVoid [[#VoidTy:]]
+; CHECK-SPIRV: TypeTaskSequenceINTEL [[#TypeTS:]]
+
+; CHECK-SPIRV: TaskSequenceCreateINTEL [[#TypeTS]] [[#TSCreateId:]]
+; CHECK-SPIRV: TaskSequenceGetINTEL [[#VoidTy]] [[#]] [[#TSCreateId]]
+
+; CHECK-LLVM: %[[TSCreate:[a-z0-9.]+]] = call spir_func target("spirv.TaskSequenceINTEL") @_Z66__spirv_TaskSequenceCreateINTEL_RPU3AS125__spirv_TaskSequenceINTELPiiiii(ptr @_Z4multii, i32 -1, i32 -1, i32 1, i32 32)
+; CHECK-LLVM: call spir_func void @_Z28__spirv_TaskSequenceGetINTELPU3AS125__spirv_TaskSequenceINTEL(target("spirv.TaskSequenceINTEL") %[[TSCreate]])
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+$_ZTS8MyKernel = comdat any
+
+; Function Attrs: convergent mustprogress norecurse nounwind
+define weak_odr dso_local spir_kernel void @_ZTS8MyKernel(ptr addrspace(1) noundef align 4 %_arg_in, ptr addrspace(1) noundef align 4 %_arg_res) local_unnamed_addr #0 comdat !srcloc !5 !kernel_arg_buffer_location !6 !sycl_fixed_targets !7 !sycl_kernel_omit_args !8 !stall_enable !9 {
+entry:
+  %call.i1 = call spir_func noundef target("spirv.TaskSequenceINTEL") @_Z31__spirv_TaskSequenceCreateINTELIN4sycl3_V13ext5intel12experimental13task_sequenceIL_Z4multiiENS2_6oneapi12experimental10propertiesISt5tupleIJNS7_14property_valueINS4_13pipelined_keyEJSt17integral_constantIiLin1EEEEENSA_INS4_16fpga_cluster_keyEJSC_INS4_25fpga_cluster_options_enumELSG_1EEEEENSA_INS4_12balanced_keyEJEEENSA_INS4_23invocation_capacity_keyEJSC_IjLj10EEEEENSA_INS4_21response_capacity_keyEJSC_IjLj17EEEEEEEEEEEiJiiEEmPT_PFT0_DpT1_Ejjit(ptr noundef nonnull @_Z4multii, i32 noundef -1, i32 noundef -1, i32 noundef 1, i32 noundef 32) #3
+  br label %for.body10.i
+
+for.body10.i:                                     ; preds = %entry, %for.body10.i
+  %i5.0.i9 = phi i32 [ 0, %entry ], [ %inc14.i, %for.body10.i ]
+  tail call spir_func void @_Z28__spirv_TaskSequenceGetINTELIiET_m(target("spirv.TaskSequenceINTEL") noundef %call.i1) #3
+  %inc14.i = add nuw nsw i32 %i5.0.i9, 1
+  %exitcond.not = icmp eq i32 %inc14.i, 16384
+  br i1 %exitcond.not, label %_ZZ4mainENKUlvE_clEv.exit, label %for.body10.i, !llvm.loop !10
+
+_ZZ4mainENKUlvE_clEv.exit:                        ; preds = %for.body10.i
+  ret void
+}
+
+; Function Attrs: mustprogress norecurse nounwind
+define linkonce_odr dso_local spir_func noundef i32 @_Z4multii(i32 noundef %a, i32 noundef %b) #1 !srcloc !12 {
+entry:
+  %mul = mul nsw i32 %b, %a
+  ret i32 %mul
+}
+
+; Function Attrs: convergent nounwind
+declare dso_local spir_func noundef target("spirv.TaskSequenceINTEL") @_Z31__spirv_TaskSequenceCreateINTELIN4sycl3_V13ext5intel12experimental13task_sequenceIL_Z4multiiENS2_6oneapi12experimental10propertiesISt5tupleIJNS7_14property_valueINS4_13pipelined_keyEJSt17integral_constantIiLin1EEEEENSA_INS4_16fpga_cluster_keyEJSC_INS4_25fpga_cluster_options_enumELSG_1EEEEENSA_INS4_12balanced_keyEJEEENSA_INS4_23invocation_capacity_keyEJSC_IjLj10EEEEENSA_INS4_21response_capacity_keyEJSC_IjLj17EEEEEEEEEEEiJiiEEmPT_PFT0_DpT1_Ejjit(ptr noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef zeroext) local_unnamed_addr #2
+
+; Function Attrs: convergent nounwind
+declare dso_local spir_func void @_Z28__spirv_TaskSequenceGetINTELIiET_m(target("spirv.TaskSequenceINTEL") noundef) local_unnamed_addr #2
+
+attributes #0 = { convergent mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-fpga-cluster"="1" "sycl-module-id"="test.cpp" "sycl-optlevel"="2" "sycl-single-task" "uniform-work-group-size"="true" }
+attributes #1 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-optlevel"="2" }
+attributes #2 = { convergent nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #3 = { convergent nounwind }
+
+!opencl.spir.version = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0}
+!spirv.Source = !{!1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1}
+!llvm.ident = !{!2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2}
+!llvm.module.flags = !{!3, !4}
+!sycl.specialization-constants = !{}
+!sycl.specialization-constants-default-values = !{}
+
+!0 = !{i32 1, i32 2}
+!1 = !{i32 4, i32 100000}
+!2 = !{!"clang version 18.0.0git (https://github.com/bowenxue-intel/llvm.git bb1121cb47589e94ab65b81971a298b9d2c21ff6)"}
+!3 = !{i32 1, !"wchar_size", i32 4}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{i32 5445863}
+!6 = !{i32 -1, i32 -1}
+!7 = !{}
+!8 = !{i1 false, i1 false}
+!9 = !{i1 true}
+!10 = distinct !{!10, !11}
+!11 = !{!"llvm.loop.mustprogress"}
+!12 = !{i32 5445350}
diff --git a/llvm-spirv/test/layout.ll b/llvm-spirv/test/layout.ll
index f20b4e2265d1d..dc59ed639d42d 100644
--- a/llvm-spirv/test/layout.ll
+++ b/llvm-spirv/test/layout.ll
@@ -43,22 +43,27 @@
 ; CHECK-NOT: {{[0-9]*}} Variable
 ; CHECK-NOT: {{[0-9]*}} Function
 
-; CHECK: {{[0-9]*}} TypeInt [[TypeInt:[0-9]+]] 32
-; CHECK: {{[0-9]*}} TypeInt [[TypeI8:[0-9]+]] 8
-; CHECK: {{[0-9]*}} Constant [[TypeInt]] [[Two:[0-9]+]] 2
-; CHECK: {{[0-9]*}} TypePointer [[TPointer:[0-9]+]]
-; CHECK: {{[0-9]*}} TypePointer [[SConstOpType:[0-9]+]]
+; CHECK: {{[0-9]*}} TypeInt [[TypeInt32:[0-9]+]] 32
+; CHECK: {{[0-9]*}} TypeInt [[TypeInt8:[0-9]+]] 8
+; CHECK: {{[0-9]*}} Constant [[TypeInt32]] [[Int32Two:[0-9]+]] 2
+; CHECK: {{[0-9]*}} TypeArray [[TypeArrayInt32:[0-9]+]] [[TypeInt32]] [[Int32Two]]
+; CHECK: {{[0-9]*}} TypePointer [[TypePtr5ArrayInt32:[0-9]+]] 5 [[TypeArrayInt32]]
+; CHECK: {{[0-9]*}} TypePointer [[TypePtr5Int32:[0-9]+]] 5 [[TypeInt32]]
+; CHECK: {{[0-9]*}} TypePointer [[TypePtr5Ptr5Int32:[0-9]+]] 5 [[TypePtr5Int32]]
 ; CHECK: {{[0-9]*}} TypeFloat [[TypeFloat:[0-9]+]]
-; CHECK: {{[0-9]*}} TypeArray [[TypeArray:[0-9]+]] [[TypeFloat]] [[Two]]
-; CHECK: {{[0-9]*}} TypeVector [[TypeVectorInt3:[0-9]+]] [[TypeInt]] 3
-; CHECK: {{[0-9]*}} TypePointer [[TPointerI8:[0-9]+]] 8 [[TypeI8]]
-; CHECK: {{[0-9]*}} TypeStruct [[BID:[0-9]+]] {{[0-9]+}} [[TPointerI8]]
+; CHECK: {{[0-9]*}} TypeArray [[TypeArrayFloat:[0-9]+]] [[TypeFloat]] [[Int32Two]]
+; CHECK: {{[0-9]*}} TypePointer [[TypePtr8ArrayFloat:[0-9]+]] 0 [[TypeArrayFloat]]
+; CHECK: {{[0-9]*}} TypeVector [[TypeVectorInt32:[0-9]+]] [[TypeInt32]] 3
+; CHECK: {{[0-9]*}} TypePointer [[TypePtr8VectorInt32:[0-9]+]] 0 [[TypeVectorInt32]]
+; CHECK: {{[0-9]*}} TypePointer [[TypePtr0Int8:[0-9]+]] 8 [[TypeInt8]]
+; CHECK: {{[0-9]*}} TypeStruct [[BID:[0-9]+]] {{[0-9]+}} [[TypePtr0Int8]]
 ; CHECK: {{[0-9]*}} TypeStruct [[CID:[0-9]+]] {{[0-9]+}} [[BID]]
 ; CHECK: {{[0-9]*}} TypeStruct [[AID:[0-9]+]] {{[0-9]+}} [[CID]]
 ; CHECK: {{[0-9]*}} TypeVoid [[Void:[0-9]+]]
-; CHECK: {{[0-9]*}} TypeFunction [[TypeBar1:[0-9]+]] [[Void]] [[SConstOpType]]
-; CHECK: {{[0-9]*}} Variable [[TPointer]] [[Var:[0-9]+]]
-; CHECK: {{[0-9]*}} SpecConstantOp [[SConstOpType]] [[SConstOp:[0-9]+]] 70 [[Var]]
+; CHECK: {{[0-9]*}} TypePointer [[TypePtr5Int8:[0-9]+]] 5 [[TypeInt8]]
+; CHECK: {{[0-9]*}} TypeFunction [[TypeBar1:[0-9]+]] [[Void]] [[TypePtr5Int8]]
+; CHECK: {{[0-9]*}} Variable [[TypePtr5ArrayInt32]] [[Var:[0-9]+]]
+; CHECK: {{[0-9]*}} SpecConstantOp [[TypePtr5Int32]] [[SConstOp:[0-9]+]] 70 [[Var]]
 ; CHECK: {{[0-9]*}} Variable {{[0-9]+}} {{[0-9]+}} 5 [[SConstOp]]
 
 ; CHECK-NOT: {{[0-9]*}} Capability
diff --git a/llvm-spirv/test/llvm-intrinsics/frexp.ll b/llvm-spirv/test/llvm-intrinsics/frexp.ll
index 9c62f9c0fd35e..6a82293971af7 100644
--- a/llvm-spirv/test/llvm-intrinsics/frexp.ll
+++ b/llvm-spirv/test/llvm-intrinsics/frexp.ll
@@ -105,8 +105,8 @@ define { <4 x float>, <4 x i32> } @frexp_nonsplat_vector() {
 ; CHECK-SPIRV: ExtInst [[#TypeFloat]] [[#]] [[#ExtInstSetId]] frexp [[#]] [[#]]
 ; CHECK-SPIRV: ExtInst [[#TypeFloat]] [[#]] [[#ExtInstSetId]] frexp [[#]] [[#]]
 ; CHECK-LLVM: %[[#IntVar1:]] = alloca i32
-; CHECK-LLVM: %[[Frexp0:[a-z0-9.]+]] = call spir_func float @_Z5frexpfPi(float %x, ptr %[[#IntVar1]])
 ; CHECK-LLVM: %[[#IntVar2:]] = alloca i32
+; CHECK-LLVM: %[[Frexp0:[a-z0-9.]+]] = call spir_func float @_Z5frexpfPi(float %x, ptr %[[#IntVar1]])
 ; CHECK-LLVM: %[[Frexp1:[a-z0-9.]+]] = call spir_func float @_Z5frexpfPi(float %[[Frexp0]], ptr %[[#IntVar2]])
 ; CHECK-LLVM: %[[#LoadIntVar:]] = load i32, ptr %[[#IntVar2]]
 ; CHECK-LLVM: %[[#AllocaStrFloatInt:]] = alloca %[[StrTypeFloatInt]]
diff --git a/llvm-spirv/test/llvm-intrinsics/memmove.ll b/llvm-spirv/test/llvm-intrinsics/memmove.ll
index 0e6f5bca6727f..41bd04ecb89dd 100644
--- a/llvm-spirv/test/llvm-intrinsics/memmove.ll
+++ b/llvm-spirv/test/llvm-intrinsics/memmove.ll
@@ -2,6 +2,7 @@
 ; RUN: llvm-spirv %t.bc -spirv-text -o %t.txt
 ; RUN: FileCheck < %t.txt %s --check-prefix=CHECK-SPIRV
 ; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: spirv-val %t.spv
 ; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
 ; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
@@ -21,9 +22,9 @@
 ; CHECK-SPIRV: FunctionParameter [[#I8GLOBAL_PTR]] [[#ARG_IN:]]
 ; CHECK-SPIRV: FunctionParameter [[#I8GLOBAL_PTR]] [[#ARG_OUT:]]
 ;
+; CHECK-SPIRV: Variable [[#]] [[#MEM:]]
 ; CHECK-SPIRV: Bitcast [[#]] [[#I8_ARG_IN:]] [[#ARG_IN]]
 ; CHECK-SPIRV: Bitcast [[#]] [[#I8_ARG_OUT:]] [[#ARG_OUT]]
-; CHECK-SPIRV: Variable [[#]] [[#MEM:]]
 ; CHECK-SPIRV: Bitcast [[#]] [[#TMP:]] [[#MEM]]
 ; CHECK-SPIRV: LifetimeStart [[#TMP]]
 ; CHECK-SPIRV: CopyMemorySized [[#MEM]] [[#I8_ARG_IN]] [[#C128]] 2 64
@@ -35,10 +36,10 @@
 ; CHECK-SPIRV: FunctionParameter [[#I8GLOBAL_PTR]] [[#ARG_IN:]]
 ; CHECK-SPIRV: FunctionParameter [[#I8GENERIC_PTR]] [[#ARG_OUT:]]
 ;
+; CHECK-SPIRV: Variable [[#]] [[#MEM:]]
 ; CHECK-SPIRV: Bitcast [[#]] [[#I8_ARG_IN:]] [[#ARG_IN]]
 ; CHECK-SPIRV: Bitcast [[#]] [[#I8_ARG_OUT_GENERIC:]] [[#ARG_OUT]]
 ; CHECK-SPIRV: GenericCastToPtr [[#]] [[#I8_ARG_OUT:]] [[#I8_ARG_OUT_GENERIC]]
-; CHECK-SPIRV: Variable [[#]] [[#MEM:]]
 ; CHECK-SPIRV: Bitcast [[#]] [[#TMP:]] [[#MEM]]
 ; CHECK-SPIRV: LifetimeStart [[#TMP]]
 ; CHECK-SPIRV: CopyMemorySized [[#MEM]] [[#I8_ARG_IN]] [[#C68]] 2 64
@@ -77,9 +78,9 @@
 ; CHECK-LLVM-NOT: llvm.memmove
 
 ; CHECK-LLVM-LABEL: @test_full_move
+; CHECK-LLVM: %[[local:.*]] = alloca [128 x i8]
 ; CHECK-LLVM: %[[i8_in:.*]] = bitcast ptr addrspace(1) %in to ptr addrspace(1)
 ; CHECK-LLVM: %[[i8_out:.*]] = bitcast ptr addrspace(1) %out to ptr addrspace(1)
-; CHECK-LLVM: %[[local:.*]] = alloca [128 x i8]
 ; CHECK-LLVM: %[[i8_local:.*]] = bitcast ptr %[[local]] to ptr
 ; CHECK-LLVM: call void @llvm.lifetime.start.p0({{.*}}, ptr %[[i8_local]])
 ; CHECK-LLVM: call void @llvm.memcpy.p0.p1.i32(ptr align 64 %[[local]],
@@ -90,10 +91,10 @@
 ; CHECK-LLVM: call void @llvm.lifetime.end.p0({{.*}}, ptr %[[i8_local]])
 
 ; CHECK-LLVM-LABEL: @test_partial_move
+; CHECK-LLVM: %[[local:.*]] = alloca [68 x i8]
 ; CHECK-LLVM: %[[i8_in:.*]] = bitcast ptr addrspace(1) %in to ptr addrspace(1)
 ; CHECK-LLVM: %[[i8_out_generic:.*]] = bitcast ptr addrspace(4) %out to ptr addrspace(4)
 ; CHECK-LLVM: %[[i8_out:.*]] = addrspacecast ptr addrspace(4) %[[i8_out_generic]] to ptr addrspace(1)
-; CHECK-LLVM: %[[local:.*]] = alloca [68 x i8]
 ; CHECK-LLVM: %[[i8_local:.*]] = bitcast ptr %[[local]] to ptr
 ; CHECK-LLVM: call void @llvm.lifetime.start.p0({{.*}}, ptr %[[i8_local]])
 ; CHECK-LLVM: call void @llvm.memcpy.p0.p1.i32(ptr align 64 %[[local]],
diff --git a/llvm-spirv/test/llvm-intrinsics/uadd.with.overflow.ll b/llvm-spirv/test/llvm-intrinsics/uadd.with.overflow.ll
index 7d560b5507572..33fec83f099cf 100644
--- a/llvm-spirv/test/llvm-intrinsics/uadd.with.overflow.ll
+++ b/llvm-spirv/test/llvm-intrinsics/uadd.with.overflow.ll
@@ -1,51 +1,196 @@
+; REQUIRES: spirv-dis
 ; RUN: llvm-as %s -o %t.bc
-; RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck --check-prefix CHECK-SPIRV %s
 ; RUN: llvm-spirv %t.bc -o %t.spv
-; Current implementation doesn't comply with specification and should be fixed in future.
-; TODO: spirv-val %t.spv
+; RUN: spirv-dis --raw-id %t.spv | FileCheck --check-prefix CHECK-SPIRV %s
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv -r -o %t.rev.bc %t.spv
+; RUN: llvm-dis -o - %t.rev.bc | FileCheck --check-prefix CHECK-LLVM %s
 
 target triple = "spir64-unknown-unknown"
 
-; CHECK-SPIRV: TypeInt [[#I16TYPE:]] 16
-; CHECK-SPIRV: TypeInt [[#I32TYPE:]] 32
-; CHECK-SPIRV: TypeInt [[#I64TYPE:]] 64
-; CHECK-SPIRV: TypeBool [[#BTYPE:]]
-; CHECK-SPIRV: TypeStruct [[#S0TYPE:]] [[#I16TYPE]] [[#BTYPE]]
-; CHECK-SPIRV: TypeStruct [[#S1TYPE:]] [[#I32TYPE]] [[#BTYPE]]
-; CHECK-SPIRV: TypeStruct [[#S2TYPE:]] [[#I64TYPE]] [[#BTYPE]]
-; CHECK-SPIRV: TypeVector [[#V4XI32TYPE:]] [[#I32TYPE]] 4
-; CHECK-SPIRV: TypeVector [[#V4XBTYPE:]] [[#BTYPE]] 4
-; CHECK-SPIRV: TypeStruct [[#S3TYPE:]] [[#V4XI32TYPE]] [[#V4XBTYPE]]
-; CHECK-SPIRV: IAddCarry [[#S0TYPE]]
-; CHECK-SPIRV: IAddCarry [[#S1TYPE]]
-; CHECK-SPIRV: IAddCarry [[#S2TYPE]]
-; CHECK-SPIRV: IAddCarry [[#S3TYPE]]
-
-define spir_func void @test_uadd_with_overflow_i16(i16 %a, i16 %b) {
+; CHECK-SPIRV-DAG:                   [[ushort:%[a-z0-9_.]+]] = OpTypeInt 16 0
+; CHECK-SPIRV-DAG:                     [[uint:%[a-z0-9_.]+]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG:                    [[ulong:%[a-z0-9_.]+]] = OpTypeInt 64 0
+; CHECK-SPIRV-DAG:                 [[ushort_0:%[a-z0-9_.]+]] = OpConstant [[ushort]] 0
+; CHECK-SPIRV-DAG:                   [[uint_0:%[a-z0-9_.]+]] = OpConstant [[uint]] 0
+; CHECK-SPIRV-DAG:                  [[ulong_0:%[a-z0-9_.]+]] = OpConstant [[ulong]] 0
+; CHECK-SPIRV-DAG:                     [[bool:%[a-z0-9_.]+]] = OpTypeBool
+; CHECK-SPIRV-DAG:                    [[var_4:%[a-z0-9_.]+]] = OpTypeFunction [[bool]] [[ushort]] [[ushort]]
+; CHECK-SPIRV-DAG:                [[_struct_9:%[a-z0-9_.]+]] = OpTypeStruct [[ushort]] [[ushort]]
+; CHECK-SPIRV-DAG:  [[_ptr_Function__struct_9:%[a-z0-9_.]+]] = OpTypePointer Function [[_struct_9]]
+; CHECK-SPIRV-DAG:               [[_struct_18:%[a-z0-9_.]+]] = OpTypeStruct [[ushort]] [[bool]]
+; CHECK-SPIRV-DAG:                   [[var_25:%[a-z0-9_.]+]] = OpTypeFunction [[bool]] [[uint]] [[uint]]
+; CHECK-SPIRV-DAG:               [[_struct_30:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[uint]]
+; CHECK-SPIRV-DAG: [[_ptr_Function__struct_30:%[a-z0-9_.]+]] = OpTypePointer Function [[_struct_30]]
+; CHECK-SPIRV-DAG:               [[_struct_39:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[bool]]
+; CHECK-SPIRV-DAG:                   [[var_46:%[a-z0-9_.]+]] = OpTypeFunction [[bool]] [[ulong]] [[ulong]]
+; CHECK-SPIRV-DAG:               [[_struct_51:%[a-z0-9_.]+]] = OpTypeStruct [[ulong]] [[ulong]]
+; CHECK-SPIRV-DAG: [[_ptr_Function__struct_51:%[a-z0-9_.]+]] = OpTypePointer Function [[_struct_51]]
+; CHECK-SPIRV-DAG:               [[_struct_60:%[a-z0-9_.]+]] = OpTypeStruct [[ulong]] [[bool]]
+; CHECK-SPIRV-DAG:                   [[v4bool:%[a-z0-9_.]+]] = OpTypeVector [[bool]] 4
+; CHECK-SPIRV-DAG:                   [[v4uint:%[a-z0-9_.]+]] = OpTypeVector [[uint]] 4
+; CHECK-SPIRV-DAG:                   [[var_68:%[a-z0-9_.]+]] = OpTypeFunction [[v4bool]] [[v4uint]] [[v4uint]]
+; CHECK-SPIRV-DAG:               [[_struct_73:%[a-z0-9_.]+]] = OpTypeStruct [[v4uint]] [[v4uint]]
+; CHECK-SPIRV-DAG: [[_ptr_Function__struct_73:%[a-z0-9_.]+]] = OpTypePointer Function [[_struct_73]]
+; CHECK-SPIRV-DAG:               [[_struct_82:%[a-z0-9_.]+]] = OpTypeStruct [[v4uint]] [[v4bool]]
+; CHECK-SPIRV-DAG:                   [[var_19:%[a-z0-9_.]+]] = OpUndef [[_struct_18]]
+; CHECK-SPIRV-DAG:                   [[var_40:%[a-z0-9_.]+]] = OpUndef [[_struct_39]]
+; CHECK-SPIRV-DAG:                   [[var_61:%[a-z0-9_.]+]] = OpUndef [[_struct_60]]
+; CHECK-SPIRV-DAG:                   [[var_80:%[a-z0-9_.]+]] = OpConstantNull [[v4uint]]
+; CHECK-SPIRV-DAG:                   [[var_83:%[a-z0-9_.]+]] = OpUndef [[_struct_82]]
+
+; CHECK-LLVM-DAG: [[structtype:%[a-z0-9._]+]] = type { i16, i16 }
+; CHECK-LLVM-DAG: [[structtype_0:%[a-z0-9._]+]] = type { i16, i1 }
+; CHECK-LLVM-DAG: [[structtype_1:%[a-z0-9._]+]] = type { i32, i32 }
+; CHECK-LLVM-DAG: [[structtype_2:%[a-z0-9._]+]] = type { i32, i1 }
+; CHECK-LLVM-DAG: [[structtype_3:%[a-z0-9._]+]] = type { i64, i64 }
+; CHECK-LLVM-DAG: [[structtype_4:%[a-z0-9._]+]] = type { i64, i1 }
+; CHECK-LLVM-DAG: [[structtype_5:%[a-z0-9._]+]] = type { <4 x i32>, <4 x i32> }
+; CHECK-LLVM-DAG: [[structtype_6:%[a-z0-9._]+]] = type { <4 x i32>, <4 x i1> }
+
+define spir_func i1 @test_uadd_with_overflow_i16(i16 %a, i16 %b) {
 entry:
   %res = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
-  ret void
+  %0 = extractvalue {i16, i1} %res, 0
+  %1 = extractvalue {i16, i1} %res, 1
+  ret i1 %1
 }
 
-define spir_func void @test_uadd_with_overflow_i32(i32 %a, i32 %b) {
+; CHECK-SPIRV:               [[a:%[a-z0-9_.]+]] = OpFunctionParameter [[ushort]]
+; CHECK-SPIRV:               [[b:%[a-z0-9_.]+]] = OpFunctionParameter [[ushort]]
+; CHECK-SPIRV:           [[entry:%[a-z0-9_.]+]] = OpLabel
+; CHECK-SPIRV:          [[var_11:%[a-z0-9_.]+]] = OpVariable [[_ptr_Function__struct_9]] Function
+; CHECK-SPIRV:          [[var_12:%[a-z0-9_.]+]] = OpIAddCarry [[_struct_9]] [[a]] [[b]]
+; CHECK-SPIRV:                                    OpStore [[var_11]] [[var_12]]
+; CHECK-SPIRV:          [[var_13:%[a-z0-9_.]+]] = OpLoad [[_struct_9]] [[var_11]] Aligned 2
+; CHECK-SPIRV:          [[var_14:%[a-z0-9_.]+]] = OpCompositeExtract [[ushort]] [[var_13]] 0
+; CHECK-SPIRV:          [[var_15:%[a-z0-9_.]+]] = OpCompositeExtract [[ushort]] [[var_13]] 1
+; CHECK-SPIRV:          [[var_17:%[a-z0-9_.]+]] = OpINotEqual [[bool]] [[var_15]] [[ushort_0]]
+; CHECK-SPIRV:          [[var_20:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_18]] [[var_14]] [[var_19]] 0
+; CHECK-SPIRV:          [[var_21:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_18]] [[var_17]] [[var_20]] 1
+; CHECK-SPIRV:          [[var_22:%[a-z0-9_.]+]] = OpCompositeExtract [[ushort]] [[var_21]] 0
+; CHECK-SPIRV:          [[var_23:%[a-z0-9_.]+]] = OpCompositeExtract [[bool]] [[var_21]] 1
+; CHECK-SPIRV:                                    OpReturnValue [[var_23]]
+
+; CHECK-LLVM:   %0 = alloca [[structtype]], align 8
+; CHECK-LLVM:   call spir_func void @_Z17__spirv_IAddCarryss(ptr sret([[structtype]]) %0, i16 %a, i16 %b)
+; CHECK-LLVM:   %1 = load [[structtype]], ptr %0, align 2
+; CHECK-LLVM:   %2 = extractvalue [[structtype]] %1, 0
+; CHECK-LLVM:   %3 = extractvalue [[structtype]] %1, 1
+; CHECK-LLVM:   %4 = icmp ne i16 %3, 0
+; CHECK-LLVM:   %5 = insertvalue [[structtype_0]] undef, i16 %2, 0
+; CHECK-LLVM:   %6 = insertvalue [[structtype_0]] %5, i1 %4, 1
+; CHECK-LLVM:   %7 = extractvalue [[structtype_0]] %6, 0
+; CHECK-LLVM:   %8 = extractvalue [[structtype_0]] %6, 1
+; CHECK-LLVM:   ret i1 %8
+define spir_func i1 @test_uadd_with_overflow_i32(i32 %a, i32 %b) {
 entry:
   %res = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
-  ret void
+  %0 = extractvalue {i32, i1} %res, 0
+  %1 = extractvalue {i32, i1} %res, 1
+  ret i1 %1
 }
 
-define spir_func void @test_uadd_with_overflow_i64(i64 %a, i64 %b) {
+; CHECK-SPIRV:             [[a_0:%[a-z0-9_.]+]] = OpFunctionParameter [[uint]]
+; CHECK-SPIRV:             [[b_0:%[a-z0-9_.]+]] = OpFunctionParameter [[uint]]
+; CHECK-SPIRV:         [[entry_0:%[a-z0-9_.]+]] = OpLabel
+; CHECK-SPIRV:          [[var_32:%[a-z0-9_.]+]] = OpVariable [[_ptr_Function__struct_30]] Function
+; CHECK-SPIRV:          [[var_33:%[a-z0-9_.]+]] = OpIAddCarry [[_struct_30]] [[a_0]] [[b_0]]
+; CHECK-SPIRV:                                    OpStore [[var_32]] [[var_33]]
+; CHECK-SPIRV:          [[var_34:%[a-z0-9_.]+]] = OpLoad [[_struct_30]] [[var_32]] Aligned 4
+; CHECK-SPIRV:          [[var_35:%[a-z0-9_.]+]] = OpCompositeExtract [[uint]] [[var_34]] 0
+; CHECK-SPIRV:          [[var_36:%[a-z0-9_.]+]] = OpCompositeExtract [[uint]] [[var_34]] 1
+; CHECK-SPIRV:          [[var_38:%[a-z0-9_.]+]] = OpINotEqual [[bool]] [[var_36]] [[uint_0]]
+; CHECK-SPIRV:          [[var_41:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_39]] [[var_35]] [[var_40]] 0
+; CHECK-SPIRV:          [[var_42:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_39]] [[var_38]] [[var_41]] 1
+; CHECK-SPIRV:          [[var_43:%[a-z0-9_.]+]] = OpCompositeExtract [[uint]] [[var_42]] 0
+; CHECK-SPIRV:          [[var_44:%[a-z0-9_.]+]] = OpCompositeExtract [[bool]] [[var_42]] 1
+; CHECK-SPIRV:                                    OpReturnValue [[var_44]]
+
+
+; CHECK-LLVM:   %0 = alloca [[structtype_1]], align 8
+; CHECK-LLVM:   call spir_func void @_Z17__spirv_IAddCarryii(ptr sret([[structtype_1]]) %0, i32 %a, i32 %b)
+; CHECK-LLVM:   %1 = load [[structtype_1]], ptr %0, align 4
+; CHECK-LLVM:   %2 = extractvalue [[structtype_1]] %1, 0
+; CHECK-LLVM:   %3 = extractvalue [[structtype_1]] %1, 1
+; CHECK-LLVM:   %4 = icmp ne i32 %3, 0
+; CHECK-LLVM:   %5 = insertvalue [[structtype_2]] undef, i32 %2, 0
+; CHECK-LLVM:   %6 = insertvalue [[structtype_2]] %5, i1 %4, 1
+; CHECK-LLVM:   %7 = extractvalue [[structtype_2]] %6, 0
+; CHECK-LLVM:   %8 = extractvalue [[structtype_2]] %6, 1
+; CHECK-LLVM:   ret i1 %8
+define spir_func i1 @test_uadd_with_overflow_i64(i64 %a, i64 %b) {
 entry:
   %res = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
-  ret void
+  %0 = extractvalue {i64, i1} %res, 0
+  %1 = extractvalue {i64, i1} %res, 1
+  ret i1 %1
 }
 
-define spir_func void @test_uadd_with_overflow_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SPIRV:             [[a_1:%[a-z0-9_.]+]] = OpFunctionParameter [[ulong]]
+; CHECK-SPIRV:             [[b_1:%[a-z0-9_.]+]] = OpFunctionParameter [[ulong]]
+; CHECK-SPIRV:         [[entry_1:%[a-z0-9_.]+]] = OpLabel
+; CHECK-SPIRV:          [[var_53:%[a-z0-9_.]+]] = OpVariable [[_ptr_Function__struct_51]] Function
+; CHECK-SPIRV:          [[var_54:%[a-z0-9_.]+]] = OpIAddCarry [[_struct_51]] [[a_1]] [[b_1]]
+; CHECK-SPIRV:                                    OpStore [[var_53]] [[var_54]]
+; CHECK-SPIRV:          [[var_55:%[a-z0-9_.]+]] = OpLoad [[_struct_51]] [[var_53]] Aligned 4
+; CHECK-SPIRV:          [[var_56:%[a-z0-9_.]+]] = OpCompositeExtract [[ulong]] [[var_55]] 0
+; CHECK-SPIRV:          [[var_57:%[a-z0-9_.]+]] = OpCompositeExtract [[ulong]] [[var_55]] 1
+; CHECK-SPIRV:          [[var_59:%[a-z0-9_.]+]] = OpINotEqual [[bool]] [[var_57]] [[ulong_0]]
+; CHECK-SPIRV:          [[var_62:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_60]] [[var_56]] [[var_61]] 0
+; CHECK-SPIRV:          [[var_63:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_60]] [[var_59]] [[var_62]] 1
+; CHECK-SPIRV:          [[var_64:%[a-z0-9_.]+]] = OpCompositeExtract [[ulong]] [[var_63]] 0
+; CHECK-SPIRV:          [[var_65:%[a-z0-9_.]+]] = OpCompositeExtract [[bool]] [[var_63]] 1
+; CHECK-SPIRV:                                    OpReturnValue [[var_65]]
+
+; CHECK-LLVM:   %0 = alloca [[structtype_3]], align 8
+; CHECK-LLVM:   call spir_func void @_Z17__spirv_IAddCarryll(ptr sret([[structtype_3]]) %0, i64 %a, i64 %b)
+; CHECK-LLVM:   %1 = load [[structtype_3]], ptr %0, align 4
+; CHECK-LLVM:   %2 = extractvalue [[structtype_3]] %1, 0
+; CHECK-LLVM:   %3 = extractvalue [[structtype_3]] %1, 1
+; CHECK-LLVM:   %4 = icmp ne i64 %3, 0
+; CHECK-LLVM:   %5 = insertvalue [[structtype_4]] undef, i64 %2, 0
+; CHECK-LLVM:   %6 = insertvalue [[structtype_4]] %5, i1 %4, 1
+; CHECK-LLVM:   %7 = extractvalue [[structtype_4]] %6, 0
+; CHECK-LLVM:   %8 = extractvalue [[structtype_4]] %6, 1
+; CHECK-LLVM:   ret i1 %8
+define spir_func <4 x i1> @test_uadd_with_overflow_v4i32(<4 x i32> %a, <4 x i32> %b) {
 entry:
- %res = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a, <4 x i32> %b) 
- ret void
+  %res = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a, <4 x i32> %b) 
+  %0 = extractvalue {<4 x i32>, <4 x i1>} %res, 0
+  %1 = extractvalue {<4 x i32>, <4 x i1>} %res, 1
+  ret <4 x i1> %1
 }
 
+; CHECK-SPIRV:             [[a_2:%[a-z0-9_.]+]] = OpFunctionParameter [[v4uint]]
+; CHECK-SPIRV:             [[b_2:%[a-z0-9_.]+]] = OpFunctionParameter [[v4uint]]
+; CHECK-SPIRV:         [[entry_2:%[a-z0-9_.]+]] = OpLabel
+; CHECK-SPIRV:          [[var_75:%[a-z0-9_.]+]] = OpVariable [[_ptr_Function__struct_73]] Function
+; CHECK-SPIRV:          [[var_76:%[a-z0-9_.]+]] = OpIAddCarry [[_struct_73]] [[a_2]] [[b_2]]
+; CHECK-SPIRV:                                    OpStore [[var_75]] [[var_76]]
+; CHECK-SPIRV:          [[var_77:%[a-z0-9_.]+]] = OpLoad [[_struct_73]] [[var_75]] Aligned 16
+; CHECK-SPIRV:          [[var_78:%[a-z0-9_.]+]] = OpCompositeExtract [[v4uint]] [[var_77]] 0
+; CHECK-SPIRV:          [[var_79:%[a-z0-9_.]+]] = OpCompositeExtract [[v4uint]] [[var_77]] 1
+; CHECK-SPIRV:          [[var_81:%[a-z0-9_.]+]] = OpINotEqual [[v4bool]] [[var_79]] [[var_80]]
+; CHECK-SPIRV:          [[var_84:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_82]] [[var_78]] [[var_83]] 0
+; CHECK-SPIRV:          [[var_85:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_82]] [[var_81]] [[var_84]] 1
+; CHECK-SPIRV:          [[var_86:%[a-z0-9_.]+]] = OpCompositeExtract [[v4uint]] [[var_85]] 0
+; CHECK-SPIRV:          [[var_87:%[a-z0-9_.]+]] = OpCompositeExtract [[v4bool]] [[var_85]] 1
+; CHECK-SPIRV:                                    OpReturnValue [[var_87]]
+
+; CHECK-LLVM:   %0 = alloca [[structtype_5]], align 16
+; CHECK-LLVM:   call spir_func void @_Z17__spirv_IAddCarryDv4_iS_(ptr sret([[structtype_5]]) %0, <4 x i32> %a, <4 x i32> %b)
+; CHECK-LLVM:   %1 = load [[structtype_5]], ptr %0, align 16
+; CHECK-LLVM:   %2 = extractvalue [[structtype_5]] %1, 0
+; CHECK-LLVM:   %3 = extractvalue [[structtype_5]] %1, 1
+; CHECK-LLVM:   %4 = icmp ne <4 x i32> %3, zeroinitializer
+; CHECK-LLVM:   %5 = insertvalue [[structtype_6]] undef, <4 x i32> %2, 0
+; CHECK-LLVM:   %6 = insertvalue [[structtype_6]] %5, <4 x i1> %4, 1
+; CHECK-LLVM:   %7 = extractvalue [[structtype_6]] %6, 0
+; CHECK-LLVM:   %8 = extractvalue [[structtype_6]] %6, 1
+; CHECK-LLVM:   ret <4 x i1> %8
 declare {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
 declare {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
 declare {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
 declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a, <4 x i32> %b)
+declare void @_Z17__spirv_IAddCarryii(ptr sret({i32, i32}), i32, i32)
diff --git a/llvm-spirv/test/llvm-intrinsics/usub.with.overflow.ll b/llvm-spirv/test/llvm-intrinsics/usub.with.overflow.ll
index a5df081ea947e..5225345adb931 100644
--- a/llvm-spirv/test/llvm-intrinsics/usub.with.overflow.ll
+++ b/llvm-spirv/test/llvm-intrinsics/usub.with.overflow.ll
@@ -1,51 +1,196 @@
+; REQUIRES: spirv-dis
 ; RUN: llvm-as %s -o %t.bc
-; RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck --check-prefix CHECK-SPIRV %s
 ; RUN: llvm-spirv %t.bc -o %t.spv
-; Current implementation doesn't comply with specification and should be fixed in future.
-; TODO: spirv-val %t.spv
+; RUN: spirv-dis --raw-id %t.spv | FileCheck --check-prefix CHECK-SPIRV %s
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv -r -o %t.rev.bc %t.spv
+; RUN: llvm-dis -o - %t.rev.bc | FileCheck --check-prefix CHECK-LLVM %s
 
 target triple = "spir64-unknown-unknown"
 
-; CHECK-SPIRV: TypeInt [[#I16TYPE:]] 16
-; CHECK-SPIRV: TypeInt [[#I32TYPE:]] 32
-; CHECK-SPIRV: TypeInt [[#I64TYPE:]] 64
-; CHECK-SPIRV: TypeBool [[#BTYPE:]]
-; CHECK-SPIRV: TypeStruct [[#S0TYPE:]] [[#I16TYPE]] [[#BTYPE]]
-; CHECK-SPIRV: TypeStruct [[#S1TYPE:]] [[#I32TYPE]] [[#BTYPE]]
-; CHECK-SPIRV: TypeStruct [[#S2TYPE:]] [[#I64TYPE]] [[#BTYPE]]
-; CHECK-SPIRV: TypeVector [[#V4XI32TYPE:]] [[#I32TYPE]] 4
-; CHECK-SPIRV: TypeVector [[#V4XBTYPE:]] [[#BTYPE]] 4
-; CHECK-SPIRV: TypeStruct [[#S3TYPE:]] [[#V4XI32TYPE]] [[#V4XBTYPE]]
-; CHECK-SPIRV: ISubBorrow [[#S0TYPE]]
-; CHECK-SPIRV: ISubBorrow [[#S1TYPE]]
-; CHECK-SPIRV: ISubBorrow [[#S2TYPE]]
-; CHECK-SPIRV: ISubBorrow [[#S3TYPE]]
-
-define spir_func void @test_usub_with_overflow_i16(i16 %a, i16 %b) {
+; CHECK-SPIRV-DAG:                   [[ushort:%[a-z0-9_.]+]] = OpTypeInt 16 0
+; CHECK-SPIRV-DAG:                     [[uint:%[a-z0-9_.]+]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG:                    [[ulong:%[a-z0-9_.]+]] = OpTypeInt 64 0
+; CHECK-SPIRV-DAG:                 [[ushort_0:%[a-z0-9_.]+]] = OpConstant [[ushort]] 0
+; CHECK-SPIRV-DAG:                   [[uint_0:%[a-z0-9_.]+]] = OpConstant [[uint]] 0
+; CHECK-SPIRV-DAG:                  [[ulong_0:%[a-z0-9_.]+]] = OpConstant [[ulong]] 0
+; CHECK-SPIRV-DAG:                     [[bool:%[a-z0-9_.]+]] = OpTypeBool
+; CHECK-SPIRV-DAG:                    [[var_4:%[a-z0-9_.]+]] = OpTypeFunction [[bool]] [[ushort]] [[ushort]]
+; CHECK-SPIRV-DAG:                [[_struct_9:%[a-z0-9_.]+]] = OpTypeStruct [[ushort]] [[ushort]]
+; CHECK-SPIRV-DAG:  [[_ptr_Function__struct_9:%[a-z0-9_.]+]] = OpTypePointer Function [[_struct_9]]
+; CHECK-SPIRV-DAG:               [[_struct_18:%[a-z0-9_.]+]] = OpTypeStruct [[ushort]] [[bool]]
+; CHECK-SPIRV-DAG:                   [[var_25:%[a-z0-9_.]+]] = OpTypeFunction [[bool]] [[uint]] [[uint]]
+; CHECK-SPIRV-DAG:               [[_struct_30:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[uint]]
+; CHECK-SPIRV-DAG: [[_ptr_Function__struct_30:%[a-z0-9_.]+]] = OpTypePointer Function [[_struct_30]]
+; CHECK-SPIRV-DAG:               [[_struct_39:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[bool]]
+; CHECK-SPIRV-DAG:                   [[var_46:%[a-z0-9_.]+]] = OpTypeFunction [[bool]] [[ulong]] [[ulong]]
+; CHECK-SPIRV-DAG:               [[_struct_51:%[a-z0-9_.]+]] = OpTypeStruct [[ulong]] [[ulong]]
+; CHECK-SPIRV-DAG: [[_ptr_Function__struct_51:%[a-z0-9_.]+]] = OpTypePointer Function [[_struct_51]]
+; CHECK-SPIRV-DAG:               [[_struct_60:%[a-z0-9_.]+]] = OpTypeStruct [[ulong]] [[bool]]
+; CHECK-SPIRV-DAG:                   [[v4bool:%[a-z0-9_.]+]] = OpTypeVector [[bool]] 4
+; CHECK-SPIRV-DAG:                   [[v4uint:%[a-z0-9_.]+]] = OpTypeVector [[uint]] 4
+; CHECK-SPIRV-DAG:                   [[var_68:%[a-z0-9_.]+]] = OpTypeFunction [[v4bool]] [[v4uint]] [[v4uint]]
+; CHECK-SPIRV-DAG:               [[_struct_73:%[a-z0-9_.]+]] = OpTypeStruct [[v4uint]] [[v4uint]]
+; CHECK-SPIRV-DAG: [[_ptr_Function__struct_73:%[a-z0-9_.]+]] = OpTypePointer Function [[_struct_73]]
+; CHECK-SPIRV-DAG:               [[_struct_82:%[a-z0-9_.]+]] = OpTypeStruct [[v4uint]] [[v4bool]]
+; CHECK-SPIRV-DAG:                   [[var_19:%[a-z0-9_.]+]] = OpUndef [[_struct_18]]
+; CHECK-SPIRV-DAG:                   [[var_40:%[a-z0-9_.]+]] = OpUndef [[_struct_39]]
+; CHECK-SPIRV-DAG:                   [[var_61:%[a-z0-9_.]+]] = OpUndef [[_struct_60]]
+; CHECK-SPIRV-DAG:                   [[var_80:%[a-z0-9_.]+]] = OpConstantNull [[v4uint]]
+; CHECK-SPIRV-DAG:                   [[var_83:%[a-z0-9_.]+]] = OpUndef [[_struct_82]]
+
+; CHECK-LLVM-DAG: [[structtype:%[a-z0-9._]+]] = type { i16, i16 }
+; CHECK-LLVM-DAG: [[structtype_0:%[a-z0-9._]+]] = type { i16, i1 }
+; CHECK-LLVM-DAG: [[structtype_1:%[a-z0-9._]+]] = type { i32, i32 }
+; CHECK-LLVM-DAG: [[structtype_2:%[a-z0-9._]+]] = type { i32, i1 }
+; CHECK-LLVM-DAG: [[structtype_3:%[a-z0-9._]+]] = type { i64, i64 }
+; CHECK-LLVM-DAG: [[structtype_4:%[a-z0-9._]+]] = type { i64, i1 }
+; CHECK-LLVM-DAG: [[structtype_5:%[a-z0-9._]+]] = type { <4 x i32>, <4 x i32> }
+; CHECK-LLVM-DAG: [[structtype_6:%[a-z0-9._]+]] = type { <4 x i32>, <4 x i1> }
+
+define spir_func i1 @test_usub_with_overflow_i16(i16 %a, i16 %b) {
 entry:
   %res = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
-  ret void
+  %0 = extractvalue {i16, i1} %res, 0
+  %1 = extractvalue {i16, i1} %res, 1
+  ret i1 %1
 }
 
-define spir_func void @test_usub_with_overflow_i32(i32 %a, i32 %b) {
+; CHECK-SPIRV:               [[a:%[a-z0-9_.]+]] = OpFunctionParameter [[ushort]]
+; CHECK-SPIRV:               [[b:%[a-z0-9_.]+]] = OpFunctionParameter [[ushort]]
+; CHECK-SPIRV:           [[entry:%[a-z0-9_.]+]] = OpLabel
+; CHECK-SPIRV:          [[var_11:%[a-z0-9_.]+]] = OpVariable [[_ptr_Function__struct_9]] Function
+; CHECK-SPIRV:          [[var_12:%[a-z0-9_.]+]] = OpISubBorrow [[_struct_9]] [[a]] [[b]]
+; CHECK-SPIRV:                                    OpStore [[var_11]] [[var_12]]
+; CHECK-SPIRV:          [[var_13:%[a-z0-9_.]+]] = OpLoad [[_struct_9]] [[var_11]] Aligned 2
+; CHECK-SPIRV:          [[var_14:%[a-z0-9_.]+]] = OpCompositeExtract [[ushort]] [[var_13]] 0
+; CHECK-SPIRV:          [[var_15:%[a-z0-9_.]+]] = OpCompositeExtract [[ushort]] [[var_13]] 1
+; CHECK-SPIRV:          [[var_17:%[a-z0-9_.]+]] = OpINotEqual [[bool]] [[var_15]] [[ushort_0]]
+; CHECK-SPIRV:          [[var_20:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_18]] [[var_14]] [[var_19]] 0
+; CHECK-SPIRV:          [[var_21:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_18]] [[var_17]] [[var_20]] 1
+; CHECK-SPIRV:          [[var_22:%[a-z0-9_.]+]] = OpCompositeExtract [[ushort]] [[var_21]] 0
+; CHECK-SPIRV:          [[var_23:%[a-z0-9_.]+]] = OpCompositeExtract [[bool]] [[var_21]] 1
+; CHECK-SPIRV:                                    OpReturnValue [[var_23]]
+
+; CHECK-LLVM:   %0 = alloca [[structtype]], align 8
+; CHECK-LLVM:   call spir_func void @_Z18__spirv_ISubBorrowss(ptr sret([[structtype]]) %0, i16 %a, i16 %b)
+; CHECK-LLVM:   %1 = load [[structtype]], ptr %0, align 2
+; CHECK-LLVM:   %2 = extractvalue [[structtype]] %1, 0
+; CHECK-LLVM:   %3 = extractvalue [[structtype]] %1, 1
+; CHECK-LLVM:   %4 = icmp ne i16 %3, 0
+; CHECK-LLVM:   %5 = insertvalue [[structtype_0]] undef, i16 %2, 0
+; CHECK-LLVM:   %6 = insertvalue [[structtype_0]] %5, i1 %4, 1
+; CHECK-LLVM:   %7 = extractvalue [[structtype_0]] %6, 0
+; CHECK-LLVM:   %8 = extractvalue [[structtype_0]] %6, 1
+; CHECK-LLVM:   ret i1 %8
+define spir_func i1 @test_usub_with_overflow_i32(i32 %a, i32 %b) {
 entry:
   %res = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
-  ret void
+  %0 = extractvalue {i32, i1} %res, 0
+  %1 = extractvalue {i32, i1} %res, 1
+  ret i1 %1
 }
 
-define spir_func void @test_usub_with_overflow_i64(i64 %a, i64 %b) {
+; CHECK-SPIRV:             [[a_0:%[a-z0-9_.]+]] = OpFunctionParameter [[uint]]
+; CHECK-SPIRV:             [[b_0:%[a-z0-9_.]+]] = OpFunctionParameter [[uint]]
+; CHECK-SPIRV:         [[entry_0:%[a-z0-9_.]+]] = OpLabel
+; CHECK-SPIRV:          [[var_32:%[a-z0-9_.]+]] = OpVariable [[_ptr_Function__struct_30]] Function
+; CHECK-SPIRV:          [[var_33:%[a-z0-9_.]+]] = OpISubBorrow [[_struct_30]] [[a_0]] [[b_0]]
+; CHECK-SPIRV:                                    OpStore [[var_32]] [[var_33]]
+; CHECK-SPIRV:          [[var_34:%[a-z0-9_.]+]] = OpLoad [[_struct_30]] [[var_32]] Aligned 4
+; CHECK-SPIRV:          [[var_35:%[a-z0-9_.]+]] = OpCompositeExtract [[uint]] [[var_34]] 0
+; CHECK-SPIRV:          [[var_36:%[a-z0-9_.]+]] = OpCompositeExtract [[uint]] [[var_34]] 1
+; CHECK-SPIRV:          [[var_38:%[a-z0-9_.]+]] = OpINotEqual [[bool]] [[var_36]] [[uint_0]]
+; CHECK-SPIRV:          [[var_41:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_39]] [[var_35]] [[var_40]] 0
+; CHECK-SPIRV:          [[var_42:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_39]] [[var_38]] [[var_41]] 1
+; CHECK-SPIRV:          [[var_43:%[a-z0-9_.]+]] = OpCompositeExtract [[uint]] [[var_42]] 0
+; CHECK-SPIRV:          [[var_44:%[a-z0-9_.]+]] = OpCompositeExtract [[bool]] [[var_42]] 1
+; CHECK-SPIRV:                                    OpReturnValue [[var_44]]
+
+
+; CHECK-LLVM:   %0 = alloca [[structtype_1]], align 8
+; CHECK-LLVM:   call spir_func void @_Z18__spirv_ISubBorrowii(ptr sret([[structtype_1]]) %0, i32 %a, i32 %b)
+; CHECK-LLVM:   %1 = load [[structtype_1]], ptr %0, align 4
+; CHECK-LLVM:   %2 = extractvalue [[structtype_1]] %1, 0
+; CHECK-LLVM:   %3 = extractvalue [[structtype_1]] %1, 1
+; CHECK-LLVM:   %4 = icmp ne i32 %3, 0
+; CHECK-LLVM:   %5 = insertvalue [[structtype_2]] undef, i32 %2, 0
+; CHECK-LLVM:   %6 = insertvalue [[structtype_2]] %5, i1 %4, 1
+; CHECK-LLVM:   %7 = extractvalue [[structtype_2]] %6, 0
+; CHECK-LLVM:   %8 = extractvalue [[structtype_2]] %6, 1
+; CHECK-LLVM:   ret i1 %8
+define spir_func i1 @test_usub_with_overflow_i64(i64 %a, i64 %b) {
 entry:
   %res = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
-  ret void
+  %0 = extractvalue {i64, i1} %res, 0
+  %1 = extractvalue {i64, i1} %res, 1
+  ret i1 %1
 }
 
-define spir_func void @test_usub_with_overflow_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SPIRV:             [[a_1:%[a-z0-9_.]+]] = OpFunctionParameter [[ulong]]
+; CHECK-SPIRV:             [[b_1:%[a-z0-9_.]+]] = OpFunctionParameter [[ulong]]
+; CHECK-SPIRV:         [[entry_1:%[a-z0-9_.]+]] = OpLabel
+; CHECK-SPIRV:          [[var_53:%[a-z0-9_.]+]] = OpVariable [[_ptr_Function__struct_51]] Function
+; CHECK-SPIRV:          [[var_54:%[a-z0-9_.]+]] = OpISubBorrow [[_struct_51]] [[a_1]] [[b_1]]
+; CHECK-SPIRV:                                    OpStore [[var_53]] [[var_54]]
+; CHECK-SPIRV:          [[var_55:%[a-z0-9_.]+]] = OpLoad [[_struct_51]] [[var_53]] Aligned 4
+; CHECK-SPIRV:          [[var_56:%[a-z0-9_.]+]] = OpCompositeExtract [[ulong]] [[var_55]] 0
+; CHECK-SPIRV:          [[var_57:%[a-z0-9_.]+]] = OpCompositeExtract [[ulong]] [[var_55]] 1
+; CHECK-SPIRV:          [[var_59:%[a-z0-9_.]+]] = OpINotEqual [[bool]] [[var_57]] [[ulong_0]]
+; CHECK-SPIRV:          [[var_62:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_60]] [[var_56]] [[var_61]] 0
+; CHECK-SPIRV:          [[var_63:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_60]] [[var_59]] [[var_62]] 1
+; CHECK-SPIRV:          [[var_64:%[a-z0-9_.]+]] = OpCompositeExtract [[ulong]] [[var_63]] 0
+; CHECK-SPIRV:          [[var_65:%[a-z0-9_.]+]] = OpCompositeExtract [[bool]] [[var_63]] 1
+; CHECK-SPIRV:                                    OpReturnValue [[var_65]]
+
+; CHECK-LLVM:   %0 = alloca [[structtype_3]], align 8
+; CHECK-LLVM:   call spir_func void @_Z18__spirv_ISubBorrowll(ptr sret([[structtype_3]]) %0, i64 %a, i64 %b)
+; CHECK-LLVM:   %1 = load [[structtype_3]], ptr %0, align 4
+; CHECK-LLVM:   %2 = extractvalue [[structtype_3]] %1, 0
+; CHECK-LLVM:   %3 = extractvalue [[structtype_3]] %1, 1
+; CHECK-LLVM:   %4 = icmp ne i64 %3, 0
+; CHECK-LLVM:   %5 = insertvalue [[structtype_4]] undef, i64 %2, 0
+; CHECK-LLVM:   %6 = insertvalue [[structtype_4]] %5, i1 %4, 1
+; CHECK-LLVM:   %7 = extractvalue [[structtype_4]] %6, 0
+; CHECK-LLVM:   %8 = extractvalue [[structtype_4]] %6, 1
+; CHECK-LLVM:   ret i1 %8
+define spir_func <4 x i1> @test_usub_with_overflow_v4i32(<4 x i32> %a, <4 x i32> %b) {
 entry:
- %res = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a, <4 x i32> %b)
- ret void
+  %res = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a, <4 x i32> %b) 
+  %0 = extractvalue {<4 x i32>, <4 x i1>} %res, 0
+  %1 = extractvalue {<4 x i32>, <4 x i1>} %res, 1
+  ret <4 x i1> %1
 }
 
+; CHECK-SPIRV:             [[a_2:%[a-z0-9_.]+]] = OpFunctionParameter [[v4uint]]
+; CHECK-SPIRV:             [[b_2:%[a-z0-9_.]+]] = OpFunctionParameter [[v4uint]]
+; CHECK-SPIRV:         [[entry_2:%[a-z0-9_.]+]] = OpLabel
+; CHECK-SPIRV:          [[var_75:%[a-z0-9_.]+]] = OpVariable [[_ptr_Function__struct_73]] Function
+; CHECK-SPIRV:          [[var_76:%[a-z0-9_.]+]] = OpISubBorrow [[_struct_73]] [[a_2]] [[b_2]]
+; CHECK-SPIRV:                                    OpStore [[var_75]] [[var_76]]
+; CHECK-SPIRV:          [[var_77:%[a-z0-9_.]+]] = OpLoad [[_struct_73]] [[var_75]] Aligned 16
+; CHECK-SPIRV:          [[var_78:%[a-z0-9_.]+]] = OpCompositeExtract [[v4uint]] [[var_77]] 0
+; CHECK-SPIRV:          [[var_79:%[a-z0-9_.]+]] = OpCompositeExtract [[v4uint]] [[var_77]] 1
+; CHECK-SPIRV:          [[var_81:%[a-z0-9_.]+]] = OpINotEqual [[v4bool]] [[var_79]] [[var_80]]
+; CHECK-SPIRV:          [[var_84:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_82]] [[var_78]] [[var_83]] 0
+; CHECK-SPIRV:          [[var_85:%[a-z0-9_.]+]] = OpCompositeInsert [[_struct_82]] [[var_81]] [[var_84]] 1
+; CHECK-SPIRV:          [[var_86:%[a-z0-9_.]+]] = OpCompositeExtract [[v4uint]] [[var_85]] 0
+; CHECK-SPIRV:          [[var_87:%[a-z0-9_.]+]] = OpCompositeExtract [[v4bool]] [[var_85]] 1
+; CHECK-SPIRV:                                    OpReturnValue [[var_87]]
+
+; CHECK-LLVM:   %0 = alloca [[structtype_5]], align 16
+; CHECK-LLVM:   call spir_func void @_Z18__spirv_ISubBorrowDv4_iS_(ptr sret([[structtype_5]]) %0, <4 x i32> %a, <4 x i32> %b)
+; CHECK-LLVM:   %1 = load [[structtype_5]], ptr %0, align 16
+; CHECK-LLVM:   %2 = extractvalue [[structtype_5]] %1, 0
+; CHECK-LLVM:   %3 = extractvalue [[structtype_5]] %1, 1
+; CHECK-LLVM:   %4 = icmp ne <4 x i32> %3, zeroinitializer
+; CHECK-LLVM:   %5 = insertvalue [[structtype_6]] undef, <4 x i32> %2, 0
+; CHECK-LLVM:   %6 = insertvalue [[structtype_6]] %5, <4 x i1> %4, 1
+; CHECK-LLVM:   %7 = extractvalue [[structtype_6]] %6, 0
+; CHECK-LLVM:   %8 = extractvalue [[structtype_6]] %6, 1
+; CHECK-LLVM:   ret <4 x i1> %8
 declare {i16, i1} @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
 declare {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
 declare {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
 declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a, <4 x i32> %b)
+declare void @_Z18__spirv_ISubBorrowii(ptr sret({i32, i32}), i32, i32)
diff --git a/llvm-spirv/test/transcoding/OpenCL/convert_functions.ll b/llvm-spirv/test/transcoding/OpenCL/convert_functions.ll
new file mode 100644
index 0000000000000..ccc83a0f4f353
--- /dev/null
+++ b/llvm-spirv/test/transcoding/OpenCL/convert_functions.ll
@@ -0,0 +1,55 @@
+; This test checks that functions with `convert_` prefix are translated as
+; OpenCL builtins only in case they match the specification. Otherwise, we
+; expect such functions to be translated to SPIR-V FunctionCall.
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o %t.spt
+; RUN: FileCheck < %t.spt %s -check-prefix=CHECK-SPIRV
+; RUN: llvm-spirv %t.spv -r -o - | llvm-dis -o %t.rev.ll
+; RUN: FileCheck < %t.rev.ll %s -check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV: Name [[#Func:]] "_Z18convert_float_func"
+; CHECK-SPIRV: TypeVoid [[#VoidTy:]]
+; CHECK-SPIRV: TypeFloat [[#FloatTy:]] 32
+
+; CHECK-SPIRV: Function [[#VoidTy]] [[#Func]]
+; CHECK-SPIRV: ConvertSToF [[#FloatTy]] [[#ConvertId:]] [[#]]
+; CHECK-SPIRV: FunctionCall [[#VoidTy]] [[#]] [[#Func]] [[#ConvertId]]
+; CHECK-SPIRV-NOT: FConvert
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir"
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_func void @_Z18convert_float_func(float noundef %x) #0 {
+entry:
+  %x.addr = alloca float, align 4
+  store float %x, ptr %x.addr, align 4
+  ret void
+}
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_func void @convert_int_bf16(i32 noundef %x) #0 {
+entry:
+  %x.addr = alloca i32, align 4
+  store i32 %x, ptr %x.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+; CHECK-LLVM: %[[Call:[a-z]+]] = sitofp i32 %[[#]] to float
+  %call = call spir_func float @_Z13convert_floati(i32 noundef %0) #1
+; CHECK-LLVM: call spir_func void @_Z18convert_float_func(float %[[Call]])
+  call spir_func void @_Z18convert_float_func(float noundef %call) #0
+  ret void
+}
+
+; Function Attrs: convergent nounwind willreturn memory(none)
+declare spir_func float @_Z13convert_floati(i32 noundef) #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind willreturn memory(none) }
+
+!opencl.ocl.version = !{!0}
+!opencl.spir.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm-spirv/test/type-scavenger/globals.ll b/llvm-spirv/test/type-scavenger/globals.ll
index 89cab9f7eeda4..541459de2290f 100644
--- a/llvm-spirv/test/type-scavenger/globals.ll
+++ b/llvm-spirv/test/type-scavenger/globals.ll
@@ -9,12 +9,18 @@ target triple = "spir-unknown-unknown"
 ; CHECK-DAG: TypeInt [[I8:[0-9]+]] 8 0
 ; CHECK-DAG: TypeInt [[I16:[0-9]+]] 16 0
 ; CHECK-DAG: TypeInt [[I32:[0-9]+]] 32 0
+; CHECK-DAG: TypeInt [[I64:[0-9]+]] 64 0
 ; CHECK-DAG: TypePointer [[I8PTR:[0-9]+]] 5 [[I8]]
 ; CHECK-DAG: TypePointer [[I16PTR:[0-9]+]] 5 [[I16]]
 ; CHECK-DAG: TypePointer [[I16PTRPTR:[0-9]+]] 5 [[I16PTR]]
 ; CHECK-DAG: TypePointer [[I32PTR:[0-9]+]] 5 [[I32]]
+; CHECK-DAG: TypePointer [[I32PTRPTR:[0-9]+]] 5 [[I32PTR]]
 ; CHECK-DAG: TypeArray [[I8PTRx2:[0-9]+]] [[I8PTR]]
 ; CHECK-DAG: TypePointer [[I8PTRx2PTR:[0-9]+]] 5 [[I8PTRx2]]
+; CHECK-DAG: TypeArray [[I32x4:[0-9]+]] [[I32]]
+; CHECK-DAG: TypeArray [[I32x3x4:[0-9]+]] [[I32x4]]
+; CHECK-DAG: TypeArray [[I32x2x3x4:[0-9]+]] [[I32x3x4]]
+; CHECK-DAG: TypePointer [[I32x2x3x4PTR:[0-9]+]] 5 [[I32x2x3x4]]
 ; CHECK: Variable [[I16PTR]] [[A:[0-9]+]] 5
 ; CHECK: Variable [[I32PTR]] [[B:[0-9]+]] 5
 ; CHECK: Variable [[I16PTRPTR]] [[C:[0-9]+]] 5 [[A]]
@@ -22,12 +28,18 @@ target triple = "spir-unknown-unknown"
 ; CHECK: SpecConstantOp [[I8PTR]] [[BI8:[0-9]+]] 124 [[B]]
 ; CHECK: ConstantComposite [[I8PTRx2]] [[DINIT:[0-9]+]] [[AI8]] [[BI8]]
 ; CHECK: Variable [[I8PTRx2PTR]] [[D:[0-9]+]] 5 [[DINIT]]
+; CHECK: ConstantComposite [[I32x2x3x4]] [[FINIT:[0-9]+]]
+; CHECK: Variable [[I32x2x3x4PTR]] [[F:[0-9]+]] 5 [[FINIT]]
+; CHECK: SpecConstantOp [[I32PTR]] [[GINIT:[0-9]+]] 70 [[F]]
+; CHECK: Variable [[I32PTRPTR]] [[G:[0-9]+]] 5 [[GINIT]]
 
 @a = addrspace(1) global i16 0
 @b = external addrspace(1) global i32
 @c = addrspace(1) global ptr addrspace(1) @a
 @d = addrspace(1) global [2 x ptr addrspace(1)] [ptr addrspace(1) @a, ptr addrspace(1) @b]
 ;@e = global [1 x ptr] [ptr @foo]
+@f = addrspace(1) global [2 x [3 x [4 x i32]]] [[3 x [4 x i32]] [[4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] [i32 1, i32 2, i32 3, i32 4]], [3 x [4 x i32]] [[4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] [i32 1, i32 2, i32 3, i32 4]]]
+@g = addrspace(1) global ptr addrspace(1) getelementptr inbounds ([2 x [3 x [4 x i32]]], ptr addrspace(1) @f, i64 0, i64 1, i64 2, i64 3)
 
 ; Function Attrs: nounwind
 define spir_kernel void @foo() {
diff --git a/llvm-spirv/test/var_undef.ll b/llvm-spirv/test/var_undef.ll
new file mode 100644
index 0000000000000..49db6aaa2c574
--- /dev/null
+++ b/llvm-spirv/test/var_undef.ll
@@ -0,0 +1,28 @@
+; Ensure that encoding of variable with undef initializer
+; has correct wordcount
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv -to-text %t.spv -o %t.spt
+; RUN: FileCheck < %t.spt %s --check-prefix CHECK-SPIRV
+; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+; RUN: llvm-dis %t.rev.bc -o %t.rev.ll
+; RUN: FileCheck < %t.rev.ll %s --check-prefix CHECK-LLVM
+
+; CHECK-SPIRV:Name [[BAR_VAR:[0-9]+]] "bar"
+;; bar variable does not have optional initializer
+;; word count must be 4
+; CHECK-SPIRV:4 Variable [[#]] [[BAR_VAR]]
+
+; CHECK-LLVM:@bar = internal addrspace(3) global %range undef, align 8
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+%anon = type { %range }
+%range = type { %array }
+%array = type { [2 x i64] }
+
+@foo = internal addrspace(3) global %anon undef, align 8
+
+@bar = internal unnamed_addr addrspace(3) global %range undef, align 8
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 14b18e6c6d4c8..5273fdc405aca 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -160,6 +160,18 @@ foreach(proj IN LISTS LLVM_ENABLE_RUNTIMES)
   endif()
 endforeach()
 
+# Set a shorthand option to enable the GPU build of the 'libc' project.
+option(LIBC_GPU_BUILD "Enable the 'libc' project targeting the GPU" OFF)
+if(LIBC_GPU_BUILD)
+  if(LLVM_RUNTIME_TARGETS)
+    list(APPEND LLVM_RUNTIME_TARGETS "nvptx64-nvidia-cuda" "amdgcn-amd-amdhsa")
+  else()
+    set(LLVM_RUNTIME_TARGETS "default;nvptx64-nvidia-cuda;amdgcn-amd-amdhsa")
+  endif()
+  list(APPEND RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES "libc")
+  list(APPEND RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "libc")
+endif()
+
 set(NEED_LIBC_HDRGEN FALSE)
 if("libc" IN_LIST LLVM_ENABLE_RUNTIMES)
   set(NEED_LIBC_HDRGEN TRUE)
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index eb9e6101bdce7..745935f140517 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -263,6 +263,7 @@ if (NOT DEFINED LLVM_LINKER_DETECTED AND NOT WIN32)
   # -no_warn_duplicate_libraries, but only in versions of the linker that
   # support that flag.
   if(NOT LLVM_USE_LINKER AND ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    include(CheckLinkerFlag)
     check_linker_flag(C "-Wl,-no_warn_duplicate_libraries" LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES)
   else()
     set(LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES OFF CACHE INTERNAL "")
@@ -1636,8 +1637,14 @@ function(add_unittest test_suite test_name)
   # The runtime benefits of LTO don't outweight the compile time costs for tests.
   if(LLVM_ENABLE_LTO)
     if((UNIX OR MINGW) AND LINKER_IS_LLD)
-      set_property(TARGET ${test_name} APPEND_STRING PROPERTY
-                    LINK_FLAGS " -Wl,--lto-O0")
+      if(LLVM_ENABLE_FATLTO AND NOT APPLE)
+        # When using FatLTO, just use relocatable linking.
+        set_property(TARGET ${test_name} APPEND_STRING PROPERTY
+                      LINK_FLAGS " -Wl,--no-fat-lto-objects")
+      else()
+        set_property(TARGET ${test_name} APPEND_STRING PROPERTY
+                      LINK_FLAGS " -Wl,--lto-O0")
+      endif()
     elseif(LINKER_IS_LLD_LINK)
       set_property(TARGET ${test_name} APPEND_STRING PROPERTY
                     LINK_FLAGS " /opt:lldlto=0")
@@ -2089,7 +2096,7 @@ function(add_lit_testsuites project directory)
 endfunction()
 
 function(llvm_install_library_symlink name dest type)
-  cmake_parse_arguments(ARG "" "COMPONENT;SOVERSION" "" ${ARGN})
+  cmake_parse_arguments(ARG "FULL_DEST" "COMPONENT" "" ${ARGN})
   foreach(path ${CMAKE_MODULE_PATH})
     if(EXISTS ${path}/LLVMInstallSymlink.cmake)
       set(INSTALL_SYMLINK ${path}/LLVMInstallSymlink.cmake)
@@ -2103,8 +2110,8 @@ function(llvm_install_library_symlink name dest type)
   endif()
 
   set(full_name ${CMAKE_${type}_LIBRARY_PREFIX}${name}${CMAKE_${type}_LIBRARY_SUFFIX})
-  if (ARG_SOVERSION)
-    set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX}.${ARG_SOVERSION})
+  if (ARG_FULL_DEST)
+    set(full_dest ${dest})
   else()
     set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX})
   endif()
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 889aca5608c0f..fd5ecce3b0326 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -32,6 +32,8 @@ endif()
 set(LLVM_ENABLE_LTO OFF CACHE STRING "Build LLVM with LTO. May be specified as Thin or Full to use a particular kind of LTO")
 string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO)
 
+option(LLVM_ENABLE_FATLTO "Build LLVM with -ffat-lto-objects." OFF)
+
 # Ninja Job Pool support
 # The following only works with the Ninja generator in CMake >= 3.0.
 set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
@@ -1280,6 +1282,13 @@ elseif(LLVM_ENABLE_LTO)
   endif()
 endif()
 
+if(LLVM_ENABLE_FATLTO AND UNIX AND NOT APPLE)
+  append("-ffat-lto-objects" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  if(NOT LINKER_IS_LLD_LINK)
+    append("-ffat-lto-objects" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS)
+  endif()
+endif()
+
 # Set an AIX default for LLVM_EXPORT_SYMBOLS_FOR_PLUGINS based on whether we are
 # doing dynamic linking (see below).
 set(LLVM_EXPORT_SYMBOLS_FOR_PLUGINS_AIX_default OFF)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 34863d2a9be33..f20072a263ba0 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1552,6 +1552,25 @@ The AMDGPU backend supports the following calling conventions:
 
      =============================== ==========================================================
 
+AMDGPU MCExpr
+-------------
+
+As part of the AMDGPU MC layer, AMDGPU provides the following target specific
+``MCExpr``\s.
+
+  .. table:: AMDGPU MCExpr types:
+     :name: amdgpu-mcexpr-table
+
+     =================== ================= ========================================================
+     MCExpr              Operands          Return value
+     =================== ================= ========================================================
+     ``max(arg, ...)``   1 or more         Variadic signed operation that returns the maximum
+                                           value of all its arguments.
+
+     ``or(arg, ...)``    1 or more         Variadic signed operation that returns the bitwise-or
+                                           result of all its arguments.
+
+     =================== ================= ========================================================
 
 .. _amdgpu-elf-code-object:
 
diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst
index af9d7f16b3619..e3f2f33224b01 100644
--- a/llvm/docs/CommandGuide/dsymutil.rst
+++ b/llvm/docs/CommandGuide/dsymutil.rst
@@ -140,10 +140,6 @@ OPTIONS
  (in bytes) to the linked dSYM. The table is sorted by the output size listing
  the object files with the largest contribution first.
 
-.. option:: --symbol-map <bcsymbolmap>
-
- Update the existing dSYMs inplace using symbol map specified.
-
 .. option:: -s, --symtab
 
  Dumps the symbol table found in *executable* or object file(s) and exits.
diff --git a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
index a78066a5eea37..54622cc61fdfd 100644
--- a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
+++ b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
@@ -16,7 +16,7 @@ DESCRIPTION
 binary object files and prints their contents in a logical view, which
 is a human readable representation that closely matches the structure
 of the original user source code. Supported object file formats include
-ELF, Mach-O, PDB and COFF.
+ELF, Mach-O, WebAssembly, PDB and COFF.
 
 The **logical view** abstracts the complexity associated with the
 different low-level representations of the debugging information that
@@ -468,8 +468,9 @@ If the <pattern> criteria is too general, a more selective option can
 be specified to target a particular category of elements:
 lines (:option:`--select-lines`), scopes (:option:`--select-scopes`),
 symbols (:option:`--select-symbols`) and types (:option:`--select-types`).
+
 These options require knowledge of the debug information format (DWARF,
-CodeView, COFF), as the given **kind** describes a very specific type
+CodeView), as the given **kind** describes a very specific type
 of element.
 
 LINES
@@ -598,7 +599,7 @@ When comparing logical views created from different debug formats, its
 accuracy depends on how close the debug information represents the
 user code. For instance, a logical view created from a binary file with
 DWARF debug information may include more detailed data than a logical
-view created from a binary file with CodeView/COFF debug information.
+view created from a binary file with CodeView debug information.
 
 The following options describe the elements to compare.
 
@@ -1952,6 +1953,315 @@ The **{Coverage}** and **{Location}** attributes describe the debug
 location and coverage for logical symbols. For optimized code, the
 coverage value decreases and it affects the program debuggability.
 
+WEBASSEMBLY SUPPORT
+~~~~~~~~~~~~~~~~~~~
+The below example is used to show the WebAssembly output generated by
+:program:`llvm-debuginfo-analyzer`. We compiled the example for a
+WebAssembly 32-bit target with Clang (-O0 -g --target=wasm32):
+
+.. code-block:: c++
+
+  1  using INTPTR = const int *;
+  2  int foo(INTPTR ParamPtr, unsigned ParamUnsigned, bool ParamBool) {
+  3    if (ParamBool) {
+  4      typedef int INTEGER;
+  5      const INTEGER CONSTANT = 7;
+  6      return CONSTANT;
+  7    }
+  8    return ParamUnsigned;
+  9  }
+
+PRINT BASIC DETAILS
+^^^^^^^^^^^^^^^^^^^
+The following command prints basic details for all the logical elements
+sorted by the debug information internal offset; it includes its lexical
+level and debug info format.
+
+.. code-block:: none
+
+  llvm-debuginfo-analyzer --attribute=level,format
+                          --output-sort=offset
+                          --print=scopes,symbols,types,lines,instructions
+                          test-clang.o
+
+or
+
+.. code-block:: none
+
+  llvm-debuginfo-analyzer --attribute=level,format
+                          --output-sort=offset
+                          --print=elements
+                          test-clang.o
+
+Each row represents an element that is present within the debug
+information. The first column represents the scope level, followed by
+the associated line number (if any), and finally the description of
+the element.
+
+.. code-block:: none
+
+  Logical View:
+  [000]           {File} 'test-clang.o' -> WASM
+
+  [001]             {CompileUnit} 'test.cpp'
+  [002]     2         {Function} extern not_inlined 'foo' -> 'int'
+  [003]     2           {Parameter} 'ParamPtr' -> 'INTPTR'
+  [003]     2           {Parameter} 'ParamUnsigned' -> 'unsigned int'
+  [003]     2           {Parameter} 'ParamBool' -> 'bool'
+  [003]                 {Block}
+  [004]     5             {Variable} 'CONSTANT' -> 'const INTEGER'
+  [004]     5             {Line}
+  [004]                   {Code} 'i32.const	7'
+  [004]                   {Code} 'local.set	10'
+  [004]                   {Code} 'local.get	5'
+  [004]                   {Code} 'local.get	10'
+  [004]                   {Code} 'i32.store	12'
+  [004]     6             {Line}
+  [004]                   {Code} 'i32.const	7'
+  [004]                   {Code} 'local.set	11'
+  [004]                   {Code} 'local.get	5'
+  [004]                   {Code} 'local.get	11'
+  [004]                   {Code} 'i32.store	28'
+  [004]                   {Code} 'br      	1'
+  [004]     -             {Line}
+  [004]                   {Code} 'end'
+  [003]     4           {TypeAlias} 'INTEGER' -> 'int'
+  [003]     2           {Line}
+  [003]                 {Code} 'nop'
+  [003]                 {Code} 'end'
+  [003]                 {Code} 'i64.div_s'
+  [003]                 {Code} 'global.get	0'
+  [003]                 {Code} 'local.set	3'
+  [003]                 {Code} 'i32.const	32'
+  [003]                 {Code} 'local.set	4'
+  [003]                 {Code} 'local.get	3'
+  [003]                 {Code} 'local.get	4'
+  [003]                 {Code} 'i32.sub'
+  [003]                 {Code} 'local.set	5'
+  [003]                 {Code} 'local.get	5'
+  [003]                 {Code} 'local.get	0'
+  [003]                 {Code} 'i32.store	24'
+  [003]                 {Code} 'local.get	5'
+  [003]                 {Code} 'local.get	1'
+  [003]                 {Code} 'i32.store	20'
+  [003]                 {Code} 'local.get	2'
+  [003]                 {Code} 'local.set	6'
+  [003]                 {Code} 'local.get	5'
+  [003]                 {Code} 'local.get	6'
+  [003]                 {Code} 'i32.store8	19'
+  [003]     3           {Line}
+  [003]                 {Code} 'local.get	5'
+  [003]                 {Code} 'i32.load8_u	19'
+  [003]                 {Code} 'local.set	7'
+  [003]     3           {Line}
+  [003]                 {Code} 'i32.const	1'
+  [003]                 {Code} 'local.set	8'
+  [003]                 {Code} 'local.get	7'
+  [003]                 {Code} 'local.get	8'
+  [003]                 {Code} 'i32.and'
+  [003]                 {Code} 'local.set	9'
+  [003]                 {Code} 'block'
+  [003]                 {Code} 'block'
+  [003]                 {Code} 'local.get	9'
+  [003]                 {Code} 'i32.eqz'
+  [003]                 {Code} 'br_if   	0'
+  [003]     8           {Line}
+  [003]                 {Code} 'local.get	5'
+  [003]                 {Code} 'i32.load	20'
+  [003]                 {Code} 'local.set	12'
+  [003]     8           {Line}
+  [003]                 {Code} 'local.get	5'
+  [003]                 {Code} 'local.get	12'
+  [003]                 {Code} 'i32.store	28'
+  [003]     -           {Line}
+  [003]                 {Code} 'end'
+  [003]     9           {Line}
+  [003]                 {Code} 'local.get	5'
+  [003]                 {Code} 'i32.load	28'
+  [003]                 {Code} 'local.set	13'
+  [003]                 {Code} 'local.get	13'
+  [003]                 {Code} 'return'
+  [003]                 {Code} 'end'
+  [003]     9           {Line}
+  [003]                 {Code} 'unreachable'
+  [002]     1         {TypeAlias} 'INTPTR' -> '* const int'
+
+SELECT LOGICAL ELEMENTS
+^^^^^^^^^^^^^^^^^^^^^^^
+The following prints all *instructions*, *symbols* and *types* that
+contain **'block'** or **'.store'** in their names or types, using a tab
+layout and given the number of matches.
+
+.. code-block:: none
+
+  llvm-debuginfo-analyzer --attribute=level
+                          --select-nocase --select-regex
+                          --select=BLOCK --select=.store
+                          --report=list
+                          --print=symbols,types,instructions,summary
+                          test-clang.o
+
+  Logical View:
+  [000]           {File} 'test-clang.o'
+
+  [001]           {CompileUnit} 'test.cpp'
+  [003]           {Code} 'block'
+  [003]           {Code} 'block'
+  [004]           {Code} 'i32.store	12'
+  [003]           {Code} 'i32.store	20'
+  [003]           {Code} 'i32.store	24'
+  [004]           {Code} 'i32.store	28'
+  [003]           {Code} 'i32.store	28'
+  [003]           {Code} 'i32.store8	19'
+
+  -----------------------------
+  Element      Total    Printed
+  -----------------------------
+  Scopes           3          0
+  Symbols          4          0
+  Types            2          0
+  Lines           62          8
+  -----------------------------
+  Total           71          8
+
+COMPARISON MODE
+^^^^^^^^^^^^^^^
+Given the previous example we found the above debug information issue
+(related to the previous invalid scope location for the **'typedef int
+INTEGER'**) by comparing against another compiler.
+
+Using GCC to generate test-dwarf-gcc.o, we can apply a selection pattern
+with the printing mode to obtain the following logical view output.
+
+.. code-block:: none
+
+  llvm-debuginfo-analyzer --attribute=level
+                          --select-regex --select-nocase --select=INTe
+                          --report=list
+                          --print=symbols,types
+                          test-clang.o test-dwarf-gcc.o
+
+  Logical View:
+  [000]           {File} 'test-clang.o'
+
+  [001]           {CompileUnit} 'test.cpp'
+  [003]     4     {TypeAlias} 'INTEGER' -> 'int'
+  [004]     5     {Variable} 'CONSTANT' -> 'const INTEGER'
+
+  Logical View:
+  [000]           {File} 'test-dwarf-gcc.o'
+
+  [001]           {CompileUnit} 'test.cpp'
+  [004]     4     {TypeAlias} 'INTEGER' -> 'int'
+  [004]     5     {Variable} 'CONSTANT' -> 'const INTEGER'
+
+The output shows that both objects contain the same elements. But the
+**'typedef INTEGER'** is located at different scope level. The GCC
+generated object, shows **'4'**, which is the correct value.
+
+There are 2 comparison methods: logical view and logical elements.
+
+LOGICAL VIEW
+""""""""""""
+It compares the logical view as a whole unit; for a match, each compared
+logical element must have the same parents and children.
+
+The output shows in view form the **missing (-), added (+)** elements,
+giving more context by swapping the reference and target object files.
+
+.. code-block:: none
+
+  llvm-debuginfo-analyzer --attribute=level
+                          --compare=types
+                          --report=view
+                          --print=symbols,types
+                          test-clang.o test-dwarf-gcc.o
+
+  Reference: 'test-clang.o'
+  Target:    'test-dwarf-gcc.o'
+
+  Logical View:
+   [000]           {File} 'test-clang.o'
+
+   [001]             {CompileUnit} 'test.cpp'
+   [002]     1         {TypeAlias} 'INTPTR' -> '* const int'
+   [002]     2         {Function} extern not_inlined 'foo' -> 'int'
+   [003]                 {Block}
+   [004]     5             {Variable} 'CONSTANT' -> 'const INTEGER'
+  +[004]     4             {TypeAlias} 'INTEGER' -> 'int'
+   [003]     2           {Parameter} 'ParamBool' -> 'bool'
+   [003]     2           {Parameter} 'ParamPtr' -> 'INTPTR'
+   [003]     2           {Parameter} 'ParamUnsigned' -> 'unsigned int'
+  -[003]     4           {TypeAlias} 'INTEGER' -> 'int'
+
+The output shows the merging view path (reference and target) with the
+missing and added elements.
+
+LOGICAL ELEMENTS
+""""""""""""""""
+It compares individual logical elements without considering if their
+parents are the same. For both comparison methods, the equal criteria
+includes the name, source code location, type, lexical scope level.
+
+.. code-block:: none
+
+  llvm-debuginfo-analyzer --attribute=level
+                          --compare=types
+                          --report=list
+                          --print=symbols,types,summary
+                          test-clang.o test-dwarf-gcc.o
+
+  Reference: 'test-clang.o'
+  Target:    'test-dwarf-gcc.o'
+
+  (1) Missing Types:
+  -[003]     4     {TypeAlias} 'INTEGER' -> 'int'
+
+  (1) Added Types:
+  +[004]     4     {TypeAlias} 'INTEGER' -> 'int'
+
+  ----------------------------------------
+  Element   Expected    Missing      Added
+  ----------------------------------------
+  Scopes           4          0          0
+  Symbols          0          0          0
+  Types            2          1          1
+  Lines            0          0          0
+  ----------------------------------------
+  Total            6          1          1
+
+Changing the *Reference* and *Target* order:
+
+.. code-block:: none
+
+  llvm-debuginfo-analyzer --attribute=level
+                          --compare=types
+                          --report=list
+                          --print=symbols,types,summary
+                          test-dwarf-gcc.o test-clang.o
+
+  Reference: 'test-dwarf-gcc.o'
+  Target:    'test-clang.o'
+
+  (1) Missing Types:
+  -[004]     4     {TypeAlias} 'INTEGER' -> 'int'
+
+  (1) Added Types:
+  +[003]     4     {TypeAlias} 'INTEGER' -> 'int'
+
+  ----------------------------------------
+  Element   Expected    Missing      Added
+  ----------------------------------------
+  Scopes           4          0          0
+  Symbols          0          0          0
+  Types            2          1          1
+  Lines            0          0          0
+  ----------------------------------------
+  Total            6          1          1
+
+As the *Reference* and *Target* are switched, the *Added Types* from
+the first case now are listed as *Missing Types*.
+
 EXIT STATUS
 -----------
 :program:`llvm-debuginfo-analyzer` returns 0 if the input files were
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 763aeb87c6880..a4247796cb659 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -323,6 +323,11 @@ The :doc:`CodeOfConduct` applies to all office hours.
     - Monthly, 4th Wednesday of the month at 9:30am PT, for 30 minutes.
     - `Google meet <https://meet.google.com/pdd-dibg-cwv>`__
     - English
+  * - Maksim Levental and Jeremy Kun
+    - MLIR newcomers and general discussion (`livestreamed <https://www.youtube.com/playlist?list=PLhxO86S3jsX2k7kOhZaV-qKWm8tNsUdAE>`__)
+    - Every two weeks, Fridays at 3:00pm US Pacific, for 90 minutes.
+    - Livestream chat or `Google meet <https://meet.google.com/wit-tvzc-dwc>`__
+    - English
   * - Rotating hosts
     - Getting Started, beginner questions, new contributors.
     - Every Tuesday at 2 PM ET (11 AM PT), for 30 minutes.
diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index bf1b3cb30a52e..ac6217d08e6a6 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -790,11 +790,17 @@ G_ATOMIC_CMPXCHG
 Generic atomic cmpxchg. Expects a MachineMemOperand in addition to explicit
 operands.
 
-G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
-G_ATOMICRMW_NAND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MAX,
-G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_FADD,
-G_ATOMICRMW_FSUB, G_ATOMICRMW_FMAX, G_ATOMICRMW_FMIN
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+|all_g_atomicrmw|
+^^^^^^^^^^^^^^^^^
+
+.. |all_g_atomicrmw| replace:: G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
+                               G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
+                               G_ATOMICRMW_NAND, G_ATOMICRMW_OR,
+                               G_ATOMICRMW_XOR, G_ATOMICRMW_MAX,
+                               G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
+                               G_ATOMICRMW_UMIN, G_ATOMICRMW_FADD,
+                               G_ATOMICRMW_FSUB, G_ATOMICRMW_FMAX,
+                               G_ATOMICRMW_FMIN
 
 Generic atomicrmw. Expects a MachineMemOperand in addition to explicit
 operands.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 930e669c36ebe..448e1ff5498ec 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -14574,6 +14574,63 @@ The arguments (``%a`` and ``%b``) may be of any integer type or a vector with
 integer element type. The argument types must match each other, and the return
 type must match the argument type.
 
+.. _int_scmp:
+
+'``llvm.scmp.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``@llvm.scmp`` on any
+integer bit width or any vector of integer elements.
+
+::
+
+      declare i2 @llvm.scmp.i2.i32(i32 %a, i32 %b)
+      declare <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview:
+"""""""""
+
+Return ``-1`` if ``%a`` is signed less than ``%b``, ``0`` if they are equal, and 
+``1`` if ``%a`` is signed greater than ``%b``. Vector intrinsics operate on a per-element basis. 
+
+Arguments:
+""""""""""
+
+The arguments (``%a`` and ``%b``) may be of any integer type or a vector with
+integer element type. The argument types must match each other, and the return
+type must be at least as wide as ``i2``, to hold the three possible return values.
+
+.. _int_ucmp:
+
+'``llvm.ucmp.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``@llvm.ucmp`` on any
+integer bit width or any vector of integer elements.
+
+::
+
+      declare i2 @llvm.ucmp.i2.i32(i32 %a, i32 %b)
+      declare <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview:
+"""""""""
+
+Return ``-1`` if ``%a`` is unsigned less than ``%b``, ``0`` if they are equal, and 
+``1`` if ``%a`` is unsigned greater than ``%b``. Vector intrinsics operate on a per-element basis. 
+
+Arguments:
+""""""""""
+
+The arguments (``%a`` and ``%b``) may be of any integer type or a vector with
+integer element type. The argument types must match each other, and the return
+type must be at least as wide as ``i2``, to hold the three possible return values.
 
 .. _int_memcpy:
 
diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md
index 41266b43bc29a..a8d2b4d8f5f0b 100644
--- a/llvm/docs/PointerAuth.md
+++ b/llvm/docs/PointerAuth.md
@@ -10,6 +10,9 @@ Before the pointer is used, it needs to be authenticated, i.e., have its
 signature checked.  This prevents pointer values of unknown origin from being
 used to replace the signed pointer value.
 
+For more details, see the clang documentation page for
+[Pointer Authentication](https://clang.llvm.org/docs/PointerAuthentication.html).
+
 At the IR level, it is represented using:
 
 * a [set of intrinsics](#intrinsics) (to sign/authenticate pointers)
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index a1de8596480da..2f17c9d7dda04 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -362,6 +362,15 @@ The current vendor extensions supported are:
 ``XCVbi``
   LLVM implements `version 1.0.0 of the CORE-V immediate branching custom instructions specification <https://github.com/openhwgroup/cv32e40p/blob/cv32e40p_v1.3.2/docs/source/instruction_set_extensions.rst>`__ by OpenHW Group.  All instructions are prefixed with `cv.` as described in the specification. These instructions are only available for riscv32 at this time.
 
+``XSiFivecdiscarddlone``
+  LLVM implements `the SiFive sf.cdiscard.d.l1 instruction specified in <https://sifive.cdn.prismic.io/sifive/767804da-53b2-4893-97d5-b7c030ae0a94_s76mc_core_complex_manual_21G3.pdf>`_ by SiFive.
+
+``XSiFivecflushdlone``
+  LLVM implements `the SiFive sf.cflush.d.l1 instruction specified in <https://sifive.cdn.prismic.io/sifive/767804da-53b2-4893-97d5-b7c030ae0a94_s76mc_core_complex_manual_21G3.pdf>`_ by SiFive.
+
+``XSfcease``
+  LLVM implements `the SiFive sf.cease instruction specified in <https://sifive.cdn.prismic.io/sifive/767804da-53b2-4893-97d5-b7c030ae0a94_s76mc_core_complex_manual_21G3.pdf>`_ by SiFive.
+
 Experimental C Intrinsics
 =========================
 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 7be51730663bd..03691efe836fe 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -67,6 +67,8 @@ Changes to Interprocedural Optimizations
 Changes to the AArch64 Backend
 ------------------------------
 
+* Added support for Cortex-A78AE, Cortex-A520AE and Cortex-A720AE CPUs.
+
 Changes to the AMDGPU Backend
 -----------------------------
 
@@ -106,6 +108,7 @@ Changes to the RISC-V Backend
 * The experimental Ssnpm, Smnpm, Smmpm, Sspm, and Supm 0.8.1 Pointer Masking extensions are supported.
 * The experimental Ssqosid extension is supported.
 * Zacas is no longer experimental.
+* Added the CSR names from the Resumable Non-Maskable Interrupts (Smrnmi) extension.
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -131,6 +134,18 @@ Changes to the C API
 * Added ``LLVMConstStringInContext2`` function, which better matches the C++
   API by using ``size_t`` for string length. Deprecated ``LLVMConstStringInContext``. 
 
+* Added the following functions for accessing a function's prefix data:
+
+  * ``LLVMHasPrefixData``
+  * ``LLVMGetPrefixData``
+  * ``LLVMSetPrefixData``
+
+* Added the following functions for accessing a function's prologue data:
+
+  * ``LLVMHasPrologueData``
+  * ``LLVMGetPrologueData``
+  * ``LLVMSetPrologueData``
+
 Changes to the CodeGen infrastructure
 -------------------------------------
 
diff --git a/llvm/docs/RemoveDIsDebugInfo.md b/llvm/docs/RemoveDIsDebugInfo.md
index a9857733c70d3..2cb17e2b5e448 100644
--- a/llvm/docs/RemoveDIsDebugInfo.md
+++ b/llvm/docs/RemoveDIsDebugInfo.md
@@ -24,12 +24,33 @@ The debug records are not instructions, do not appear in the instruction list, a
 
 # Great, what do I need to do!
 
-Approximately nothing -- we've already instrumented all of LLVM to handle these new records ("`DPValues`") and behave identically to past LLVM behaviour. We plan on turning this on by default some time soon, with IR converted to the intrinsic form of debug info at terminals (textual IR, bitcode) for a short while, before then changing the textual IR and bitcode formats.
+Approximately nothing -- we've already instrumented all of LLVM to handle these new records ("`DbgRecords`") and behave identically to past LLVM behaviour. We plan on turning this on by default some time soon, with IR converted to the intrinsic form of debug info at terminals (textual IR, bitcode) for a short while, before then changing the textual IR and bitcode formats.
 
 There are two significant changes to be aware of. Firstly, we're adding a single bit of debug relevant data to the `BasicBlock::iterator` class (it's so that we can determine whether ranges intend on including debug info at the beginning of a block or not). That means when writing passes that insert LLVM IR instructions, you need to identify positions with `BasicBlock::iterator` rather than just a bare `Instruction *`. Most of the time this means that after identifying where you intend on inserting something, you must also call `getIterator` on the instruction position -- however when inserting at the start of a block you _must_ use `getFirstInsertionPt`, `getFirstNonPHIIt` or `begin` and use that iterator to insert, rather than just fetching a pointer to the first instruction.
 
 The second matter is that if you transfer sequences of instructions from one place to another manually, i.e. repeatedly using `moveBefore` where you might have used `splice`, then you should instead use the method `moveBeforePreserving`. `moveBeforePreserving` will transfer debug info records with the instruction they're attached to. This is something that happens automatically today -- if you use `moveBefore` on every element of an instruction sequence, then debug intrinsics will be moved in the normal course of your code, but we lose this behaviour with non-instruction debug info.
 
+# C-API changes
+
+All the functions that have been added are temporary and will be deprecated in the future. The intention is that they'll help downstream projects adapt during the transition period.
+
+```
+New functions (all to be deprecated)
+------------------------------------
+LLVMIsNewDbgInfoFormat                      # Returns true if the module is in the new non-instruction mode.
+LLVMSetIsNewDbgInfoFormat                   # Convert to the requested debug info format.
+
+LLVMDIBuilderInsertDeclareIntrinsicBefore   # Insert a debug intrinsic (old debug info format). 
+LLVMDIBuilderInsertDeclareIntrinsicAtEnd    # Same as above.
+LLVMDIBuilderInsertDbgValueIntrinsicBefore  # Same as above.
+LLVMDIBuilderInsertDbgValueIntrinsicAtEnd   # Same as above.
+
+LLVMDIBuilderInsertDeclareRecordBefore      # Insert a debug record (new debug info format). 
+LLVMDIBuilderInsertDeclareRecordAtEnd       # Same as above.
+LLVMDIBuilderInsertDbgValueRecordBefore     # Same as above.
+LLVMDIBuilderInsertDbgValueRecordAtEnd      # Same as above.
+```
+
 # Anything else?
 
 Not really, but here's an "old vs new" comparison of how to do certain things and quickstart for how this "new" debug info is structured.
@@ -40,13 +61,15 @@ This will all happen transparently without needing to think about it!
 
 ## What exactly have you replaced debug intrinsics with?
 
-We're using a dedicated C++ class called `DPValue` to store debug info, with a one-to-one relationship between each instance of a debug intrinsic and each `DPValue` object in any LLVM IR program. This class stores exactly the same information as is stored in debugging intrinsics. It also has almost entirely the same set of methods, that behave in the same way:
+We're using a dedicated C++ class called `DbgRecord` to store debug info, with a one-to-one relationship between each instance of a debug intrinsic and each `DbgRecord` object in any LLVM IR program; these `DbgRecord`s are represented in the IR as non-instruction debug records, as described in the [Source Level Debugging](project:SourceLevelDebugging.rst#Debug Records) document. This class has a set of subclasses that store exactly the same information as is stored in debugging intrinsics. Each one also has almost entirely the same set of methods, that behave in the same way:
 
-  https://llvm.org/docs/doxygen/classllvm_1_1DPValue.html
+  https://llvm.org/docs/doxygen/classllvm_1_1DbgRecord.html
+  https://llvm.org/docs/doxygen/classllvm_1_1DbgVariableRecord.html
+  https://llvm.org/docs/doxygen/classllvm_1_1DPLabel.html
 
-This allows you to treat a `DPValue` as if it's a `dbg.value` intrinsic most of the time, for example in generic (auto-param) lambdas.
+This allows you to treat a `DbgVariableRecord` as if it's a `dbg.value`/`dbg.declare`/`dbg.assign` intrinsic most of the time, for example in generic (auto-param) lambdas, and the same for `DPLabel` and `dbg.label`s.
 
-## How do these DPValues fit into the instruction stream?
+## How do these `DbgRecords` fit into the instruction stream?
 
 Like so:
 
@@ -60,37 +83,37 @@ Like so:
                          |
                          v
                   +------------+
-            <-----+  DPMarker  |<----
-           /      +------------+     \
-          /                           \
-         /                             \
-        v                               ^
- +-----------+    +-----------+   +-----------+
- |  DPValue  +--->|  DPValue  +-->|  DPValue  |
- +-----------+    +-----------+   +-----------+
+          <-------+  DPMarker  |<-------
+         /        +------------+        \
+        /                                \
+       /                                  \
+      v                                    ^
+ +-------------+    +-------------+   +-------------+
+ |  DbgRecord  +--->|  DbgRecord  +-->|  DbgRecord  |
+ +-------------+    +-------------+   +-------------+
 ```
 
-Each instruction has a pointer to a `DPMarker` (which will become optional), that contains a list of `DPValue` objects. No debugging records appear in the instruction list at all. `DPValue`s have a parent pointer to their owning `DPMarker`, and each `DPMarker` has a pointer back to it's owning instruction.
+Each instruction has a pointer to a `DPMarker` (which will become optional), that contains a list of `DbgRecord` objects. No debugging records appear in the instruction list at all. `DbgRecord`s have a parent pointer to their owning `DPMarker`, and each `DPMarker` has a pointer back to it's owning instruction.
 
-Not shown are the links from DPValues to other parts of the `Value`/`Metadata` hierachy: `DPValue`s have raw pointers to `DILocalVariable`, `DIExpression` and `DILocation` objects, and references to `Value`s are stored in a `DebugValueUser` base class. This refers to a `ValueAsMetadata` object referring to `Value`s, via the `TrackingMetadata` facility.
+Not shown are the links from DbgRecord to other parts of the `Value`/`Metadata` hierachy: `DbgRecord` subclasses have tracking pointers to the DIMetadata that they use, and `DbgVariableRecord` has references to `Value`s that are stored in a `DebugValueUser` base class. This refers to a `ValueAsMetadata` object referring to `Value`s, via the `TrackingMetadata` facility.
 
-The various kinds of debug intrinsic (value, declare, assign) are all stored in the `DPValue` object, with a "Type" field disamgibuating which is which.
+The various kinds of debug intrinsic (value, declare, assign, label) are all stored in `DbgRecord` subclasses, with a "RecordKind" field distinguishing `DPLabel`s from `DbgVariableRecord`s, and a `LocationType` field in the `DbgVariableRecord` class further disambiguating the various debug variable intrinsics it can represent.
 
 ## Finding debug info records
 
-Utilities such as `findDbgUsers` and the like now have an optional argument that will return the set of `DPValue` records that refer to a `Value`. You should be able to treat them the same as intrinsics.
+Utilities such as `findDbgUsers` and the like now have an optional argument that will return the set of `DbgVariableRecord` records that refer to a `Value`. You should be able to treat them the same as intrinsics.
 
 ## Examining debug info records at positions
 
-Call `Instruction::getDbgRecordRange()` to get the range of `DPValue` objects that are attached to an instruction.
+Call `Instruction::getDbgRecordRange()` to get the range of `DbgRecord` objects that are attached to an instruction.
 
 ## Moving around, deleting
 
-You can use `DPValue::removeFromParent` to unlink a `DPValue` from it's marker, and then `BasicBlock::insertDbgRecordBefore` or `BasicBlock::insertDbgRecordAfter` to re-insert the `DPValue` somewhere else. You cannot insert a `DPValue` at an arbitary point in a list of `DPValue`s (if you're doing this with `dbg.value`s then it's unlikely to be correct).
+You can use `DbgRecord::removeFromParent` to unlink a `DbgRecord` from it's marker, and then `BasicBlock::insertDbgRecordBefore` or `BasicBlock::insertDbgRecordAfter` to re-insert the `DbgRecord` somewhere else. You cannot insert a `DbgRecord` at an arbitary point in a list of `DbgRecord`s (if you're doing this with `dbg.value`s then it's unlikely to be correct).
 
-Erase `DPValue`s by calling `eraseFromParent` or `deleteInstr` if it's already been removed.
+Erase `DbgRecord`s by calling `eraseFromParent` or `deleteInstr` if it's already been removed.
 
-## What about dangling `DPValue`s?
+## What about dangling `DbgRecord`s?
 
 If you have a block like so:
 
@@ -101,6 +124,6 @@ If you have a block like so:
       br label %xyzzy
 ```
 
-your optimisation pass may wish to erase the terminator and then do something to the block. This is easy to do when debug info is kept in instructions, but with `DPValue`s there is no trailing instruction to attach the variable information to in the block above, once the terminator is erased. For such degenerate blocks, `DPValue`s are stored temporarily in a map in `LLVMContext`, and are re-inserted when a terminator is reinserted to the block or other instruction inserted at `end()`.
+your optimisation pass may wish to erase the terminator and then do something to the block. This is easy to do when debug info is kept in instructions, but with `DbgRecord`s there is no trailing instruction to attach the variable information to in the block above, once the terminator is erased. For such degenerate blocks, `DbgRecord`s are stored temporarily in a map in `LLVMContext`, and are re-inserted when a terminator is reinserted to the block or other instruction inserted at `end()`.
 
 This can technically lead to trouble in the vanishingly rare scenario where an optimisation pass erases a terminator and then decides to erase the whole block. (We recommend not doing that).
diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst
index 2eeb076192033..155cb3361669c 100644
--- a/llvm/docs/UserGuides.rst
+++ b/llvm/docs/UserGuides.rst
@@ -181,7 +181,7 @@ Optimizations
 
 :doc:`RemoveDIsDebugInfo`
    This is a migration guide describing how to move from debug info using
-   intrinsics such as dbg.value to using the non-instruction DPValue object.
+   intrinsics such as dbg.value to using the non-instruction DbgRecord object.
 
 :doc:`InstrProfileFormat`
    This document explains two binary formats of instrumentation-based profiles.
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 7cfe4dc4f775f..f56a6c961aad7 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -744,6 +744,24 @@ LLVMModuleRef LLVMCloneModule(LLVMModuleRef M);
  */
 void LLVMDisposeModule(LLVMModuleRef M);
 
+/**
+ * Soon to be deprecated.
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Returns true if the module is in the new debug info mode which uses
+ * non-instruction debug records instead of debug intrinsics for variable
+ * location tracking.
+ */
+LLVMBool LLVMIsNewDbgInfoFormat(LLVMModuleRef M);
+
+/**
+ * Soon to be deprecated.
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Convert module into desired debug info format.
+ */
+void LLVMSetIsNewDbgInfoFormat(LLVMModuleRef M, LLVMBool UseNewFormat);
+
 /**
  * Obtain the identifier of a module.
  *
@@ -2730,6 +2748,44 @@ const char *LLVMGetGC(LLVMValueRef Fn);
  */
 void LLVMSetGC(LLVMValueRef Fn, const char *Name);
 
+/**
+ * Gets the prefix data associated with a function. Only valid on functions, and
+ * only if LLVMHasPrefixData returns true.
+ * See https://llvm.org/docs/LangRef.html#prefix-data
+ */
+LLVMValueRef LLVMGetPrefixData(LLVMValueRef Fn);
+
+/**
+ * Check if a given function has prefix data. Only valid on functions.
+ * See https://llvm.org/docs/LangRef.html#prefix-data
+ */
+LLVMBool LLVMHasPrefixData(LLVMValueRef Fn);
+
+/**
+ * Sets the prefix data for the function. Only valid on functions.
+ * See https://llvm.org/docs/LangRef.html#prefix-data
+ */
+void LLVMSetPrefixData(LLVMValueRef Fn, LLVMValueRef prefixData);
+
+/**
+ * Gets the prologue data associated with a function. Only valid on functions,
+ * and only if LLVMHasPrologueData returns true.
+ * See https://llvm.org/docs/LangRef.html#prologue-data
+ */
+LLVMValueRef LLVMGetPrologueData(LLVMValueRef Fn);
+
+/**
+ * Check if a given function has prologue data. Only valid on functions.
+ * See https://llvm.org/docs/LangRef.html#prologue-data
+ */
+LLVMBool LLVMHasPrologueData(LLVMValueRef Fn);
+
+/**
+ * Sets the prologue data for the function. Only valid on functions.
+ * See https://llvm.org/docs/LangRef.html#prologue-data
+ */
+void LLVMSetPrologueData(LLVMValueRef Fn, LLVMValueRef prologueData);
+
 /**
  * Add an attribute to a function.
  *
diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h
index 5924294708cc3..b23ff63c862f8 100644
--- a/llvm/include/llvm-c/DebugInfo.h
+++ b/llvm/include/llvm-c/DebugInfo.h
@@ -1248,7 +1248,24 @@ LLVMMetadataRef LLVMDIBuilderCreateTempGlobalVariableFwdDecl(
     unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit,
     LLVMMetadataRef Decl, uint32_t AlignInBits);
 
+/*
+ * Insert a new llvm.dbg.declare intrinsic call before the given instruction.
+ * \param Builder     The DIBuilder.
+ * \param Storage     The storage of the variable to declare.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Instr       Instruction acting as a location for the new intrinsic.
+ */
+LLVMValueRef
+LLVMDIBuilderInsertDeclareBefore(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
+                                 LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
+                                 LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
 /**
+ * Soon to be deprecated.
+ * Only use in "old debug mode" (LLVMIsNewDbgFormat() is false).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
  * Insert a new llvm.dbg.declare intrinsic call before the given instruction.
  * \param Builder     The DIBuilder.
  * \param Storage     The storage of the variable to declare.
@@ -1257,9 +1274,25 @@ LLVMMetadataRef LLVMDIBuilderCreateTempGlobalVariableFwdDecl(
  * \param DebugLoc    Debug info location.
  * \param Instr       Instruction acting as a location for the new intrinsic.
  */
-LLVMValueRef LLVMDIBuilderInsertDeclareBefore(
-  LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
-  LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
+LLVMValueRef LLVMDIBuilderInsertDeclareIntrinsicBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
+/**
+ * Soon to be deprecated.
+ * Only use in "new debug mode" (LLVMIsNewDbgFormat() is true).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a Declare DbgRecord before the given instruction.
+ * \param Builder     The DIBuilder.
+ * \param Storage     The storage of the variable to declare.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Instr       Instruction acting as a location for the new record.
+ */
+LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
 
 /**
  * Insert a new llvm.dbg.declare intrinsic call at the end of the given basic
@@ -1275,6 +1308,42 @@ LLVMValueRef LLVMDIBuilderInsertDeclareBefore(
 LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(
     LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
     LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
+/**
+ * Soon to be deprecated.
+ * Only use in "old debug mode" (LLVMIsNewDbgFormat() is false).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.declare intrinsic call at the end of the given basic
+ * block. If the basic block has a terminator instruction, the intrinsic is
+ * inserted before that terminator instruction.
+ * \param Builder     The DIBuilder.
+ * \param Storage     The storage of the variable to declare.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Block       Basic block acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDeclareIntrinsicAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
+/**
+ * Soon to be deprecated.
+ * Only use in "new debug mode" (LLVMIsNewDbgFormat() is true).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a Declare DbgRecord at the end of the given basic block. If the basic
+ * block has a terminator instruction, the record is inserted before that
+ * terminator instruction.
+ * \param Builder     The DIBuilder.
+ * \param Storage     The storage of the variable to declare.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Block       Basic block acting as a location for the new record.
+ */
+LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
 
 /**
  * Insert a new llvm.dbg.value intrinsic call before the given instruction.
@@ -1285,12 +1354,42 @@ LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(
  * \param DebugLoc    Debug info location.
  * \param Instr       Instruction acting as a location for the new intrinsic.
  */
-LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder,
-                                               LLVMValueRef Val,
-                                               LLVMMetadataRef VarInfo,
-                                               LLVMMetadataRef Expr,
-                                               LLVMMetadataRef DebugLoc,
-                                               LLVMValueRef Instr);
+LLVMValueRef
+LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder, LLVMValueRef Val,
+                                  LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
+                                  LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
+/**
+ * Soon to be deprecated.
+ * Only use in "old debug mode" (Module::IsNewDbgInfoFormat is false).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.value intrinsic call before the given instruction.
+ * \param Builder     The DIBuilder.
+ * \param Val         The value of the variable.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Instr       Instruction acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDbgValueIntrinsicBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
+/**
+ * Soon to be deprecated.
+ * Only use in "new debug mode" (Module::IsNewDbgInfoFormat is true).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.value intrinsic call before the given instruction.
+ * \param Builder     The DIBuilder.
+ * \param Val         The value of the variable.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Instr       Instruction acting as a location for the new intrinsic.
+ */
+LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
 
 /**
  * Insert a new llvm.dbg.value intrinsic call at the end of the given basic
@@ -1303,12 +1402,45 @@ LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder,
  * \param DebugLoc    Debug info location.
  * \param Block       Basic block acting as a location for the new intrinsic.
  */
-LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(LLVMDIBuilderRef Builder,
-                                              LLVMValueRef Val,
-                                              LLVMMetadataRef VarInfo,
-                                              LLVMMetadataRef Expr,
-                                              LLVMMetadataRef DebugLoc,
-                                              LLVMBasicBlockRef Block);
+LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
+/**
+ * Soon to be deprecated.
+ * Only use in "old debug mode" (Module::IsNewDbgInfoFormat is false).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.value intrinsic call at the end of the given basic
+ * block. If the basic block has a terminator instruction, the intrinsic is
+ * inserted before that terminator instruction.
+ * \param Builder     The DIBuilder.
+ * \param Val         The value of the variable.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Block       Basic block acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDbgValueIntrinsicAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
+/**
+ * Soon to be deprecated.
+ * Only use in "new debug mode" (Module::IsNewDbgInfoFormat is true).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.value intrinsic call at the end of the given basic
+ * block. If the basic block has a terminator instruction, the intrinsic is
+ * inserted before that terminator instruction.
+ * \param Builder     The DIBuilder.
+ * \param Val         The value of the variable.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Block       Basic block acting as a location for the new intrinsic.
+ */
+LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
 
 /**
  * Create a new descriptor for a local auto variable.
diff --git a/llvm/include/llvm-c/Types.h b/llvm/include/llvm-c/Types.h
index d5474d986309f..4681500ef9da3 100644
--- a/llvm/include/llvm-c/Types.h
+++ b/llvm/include/llvm-c/Types.h
@@ -169,6 +169,11 @@ typedef struct LLVMOpaqueJITEventListener *LLVMJITEventListenerRef;
  */
 typedef struct LLVMOpaqueBinary *LLVMBinaryRef;
 
+/**
+ * @see llvm::DbgRecord
+ */
+typedef struct LLVMOpaqueDbgRecord *LLVMDbgRecordRef;
+
 /**
  * @}
  */
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index bea3e28adf308..1abea9eb24a3c 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -997,6 +997,12 @@ class [[nodiscard]] APInt {
   APInt ushl_ov(const APInt &Amt, bool &Overflow) const;
   APInt ushl_ov(unsigned Amt, bool &Overflow) const;
 
+  /// Signed integer floor division operation.
+  ///
+  /// Rounds towards negative infinity, i.e. 5 / -2 = -3. Iff minimum value
+  /// divided by -1 set Overflow to true.
+  APInt sfloordiv_ov(const APInt &RHS, bool &Overflow) const;
+
   // Operations that saturate
   APInt sadd_sat(const APInt &RHS) const;
   APInt uadd_sat(const APInt &RHS) const;
@@ -2198,6 +2204,18 @@ inline const APInt abdu(const APInt &A, const APInt &B) {
   return A.uge(B) ? (A - B) : (B - A);
 }
 
+/// Compute the floor of the signed average of C1 and C2
+APInt avgFloorS(const APInt &C1, const APInt &C2);
+
+/// Compute the floor of the unsigned average of C1 and C2
+APInt avgFloorU(const APInt &C1, const APInt &C2);
+
+/// Compute the ceil of the signed average of C1 and C2
+APInt avgCeilS(const APInt &C1, const APInt &C2);
+
+/// Compute the ceil of the unsigned average of C1 and C2
+APInt avgCeilU(const APInt &C1, const APInt &C2);
+
 /// Compute GCD of two unsigned APInt values.
 ///
 /// This function returns the greatest common divisor of the two APInt values
diff --git a/llvm/include/llvm/Analysis/ConstantFolding.h b/llvm/include/llvm/Analysis/ConstantFolding.h
index fd885a8c0ea3e..c54b1e8f01d2b 100644
--- a/llvm/include/llvm/Analysis/ConstantFolding.h
+++ b/llvm/include/llvm/Analysis/ConstantFolding.h
@@ -22,6 +22,11 @@
 #include <stdint.h>
 
 namespace llvm {
+
+namespace Intrinsic {
+using ID = unsigned;
+}
+
 class APInt;
 template <typename T> class ArrayRef;
 class CallBase;
@@ -187,6 +192,10 @@ Constant *ConstantFoldCall(const CallBase *Call, Function *F,
                            ArrayRef<Constant *> Operands,
                            const TargetLibraryInfo *TLI = nullptr);
 
+Constant *ConstantFoldBinaryIntrinsic(Intrinsic::ID ID, Constant *LHS,
+                                      Constant *RHS, Type *Ty,
+                                      Instruction *FMFSource);
+
 /// ConstantFoldLoadThroughBitcast - try to cast constant to destination type
 /// returning null if unsuccessful. Can cast pointer to pointer or pointer to
 /// integer and vice versa if their sizes are equal.
diff --git a/llvm/include/llvm/Analysis/InstSimplifyFolder.h b/llvm/include/llvm/Analysis/InstSimplifyFolder.h
index 23e2ea80e8cbe..8a3269d6add0e 100644
--- a/llvm/include/llvm/Analysis/InstSimplifyFolder.h
+++ b/llvm/include/llvm/Analysis/InstSimplifyFolder.h
@@ -117,6 +117,12 @@ class InstSimplifyFolder final : public IRBuilderFolder {
     return simplifyCastInst(Op, V, DestTy, SQ);
   }
 
+  Value *FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                             Instruction *FMFSource) const override {
+    return simplifyBinaryIntrinsic(ID, Ty, LHS, RHS, SQ,
+                                   dyn_cast_if_present<CallBase>(FMFSource));
+  }
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h
index a29955a06cf4e..03d7ad12c12d8 100644
--- a/llvm/include/llvm/Analysis/InstructionSimplify.h
+++ b/llvm/include/llvm/Analysis/InstructionSimplify.h
@@ -186,6 +186,11 @@ Value *simplifyExtractElementInst(Value *Vec, Value *Idx,
 Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
                         const SimplifyQuery &Q);
 
+/// Given operands for a BinaryIntrinsic, fold the result or return null.
+Value *simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, Value *Op0,
+                               Value *Op1, const SimplifyQuery &Q,
+                               const CallBase *Call);
+
 /// Given operands for a ShuffleVectorInst, fold the result or return null.
 /// See class ShuffleVectorInst for a description of the mask representation.
 Value *simplifyShuffleVectorInst(Value *Op0, Value *Op1, ArrayRef<int> Mask,
diff --git a/llvm/include/llvm/Analysis/MemoryLocation.h b/llvm/include/llvm/Analysis/MemoryLocation.h
index b72a27cab86b3..830eed5d60ee4 100644
--- a/llvm/include/llvm/Analysis/MemoryLocation.h
+++ b/llvm/include/llvm/Analysis/MemoryLocation.h
@@ -191,8 +191,14 @@ class LocationSize {
     return Value == Other.Value;
   }
 
+  bool operator==(const TypeSize &Other) const {
+    return hasValue() && getValue() == Other;
+  }
+
   bool operator!=(const LocationSize &Other) const { return !(*this == Other); }
 
+  bool operator!=(const TypeSize &Other) const { return !(*this == Other); }
+
   // Ordering operators are not provided, since it's unclear if there's only one
   // reasonable way to compare:
   // - values that don't exist against values that do, and
@@ -292,9 +298,10 @@ class MemoryLocation {
   }
 
   // Return the exact size if the exact size is known at compiletime,
-  // otherwise return MemoryLocation::UnknownSize.
-  static uint64_t getSizeOrUnknown(const TypeSize &T) {
-    return T.isScalable() ? UnknownSize : T.getFixedValue();
+  // otherwise return LocationSize::beforeOrAfterPointer().
+  static LocationSize getSizeOrUnknown(const TypeSize &T) {
+    return T.isScalable() ? LocationSize::beforeOrAfterPointer()
+                          : LocationSize::precise(T.getFixedValue());
   }
 
   MemoryLocation() : Ptr(nullptr), Size(LocationSize::beforeOrAfterPointer()) {}
diff --git a/llvm/include/llvm/Analysis/TargetFolder.h b/llvm/include/llvm/Analysis/TargetFolder.h
index 978e1002515fc..b4105ad76c02e 100644
--- a/llvm/include/llvm/Analysis/TargetFolder.h
+++ b/llvm/include/llvm/Analysis/TargetFolder.h
@@ -191,6 +191,15 @@ class TargetFolder final : public IRBuilderFolder {
     return nullptr;
   }
 
+  Value *FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                             Instruction *FMFSource) const override {
+    auto *C1 = dyn_cast<Constant>(LHS);
+    auto *C2 = dyn_cast<Constant>(RHS);
+    if (C1 && C2)
+      return ConstantFoldBinaryIntrinsic(ID, C1, C2, Ty, FMFSource);
+    return nullptr;
+  }
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 4eab357f1b33b..c43a1b5c1b2aa 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1247,13 +1247,16 @@ class TargetTransformInfo {
   /// cases or optimizations based on those values.
   /// \p CxtI is the optional original context instruction, if one exists, to
   /// provide even more information.
+  /// \p TLibInfo is used to search for platform specific vector library
+  /// functions for instructions that might be converted to calls (e.g. frem).
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
       TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
       TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
-      const Instruction *CxtI = nullptr) const;
+      const Instruction *CxtI = nullptr,
+      const TargetLibraryInfo *TLibInfo = nullptr) const;
 
   /// Returns the cost estimation for alternating opcode pattern that can be
   /// lowered to a single instruction on the target. In X86 this is for the
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index a28e19edb4c6a..532f9481766a9 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -141,7 +141,7 @@ enum class PartType {
 #include "DXContainerConstants.def"
 };
 
-#define SHADER_FEATURE_FLAG(Num, Val, Str) Val = 1ull << Num,
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str) Val = 1ull << Num,
 enum class FeatureFlags : uint64_t {
 #include "DXContainerConstants.def"
 };
diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
index 80ed86bc3a499..62dc573555198 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
+++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
@@ -9,46 +9,63 @@ CONTAINER_PART(OSG1)
 CONTAINER_PART(PSG1)
 
 #undef CONTAINER_PART
-#endif 
+#endif // CONTAINER_PART
 
 #ifdef SHADER_FEATURE_FLAG
 
-SHADER_FEATURE_FLAG(0, Doubles, "Double-precision floating point")
-SHADER_FEATURE_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers")
-SHADER_FEATURE_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage")
-SHADER_FEATURE_FLAG(3, Max64UAVs, "64 UAV slots")
-SHADER_FEATURE_FLAG(4, MinimumPrecision, "Minimum-precision data types")
-SHADER_FEATURE_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1")
-SHADER_FEATURE_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1")
-SHADER_FEATURE_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9")
-SHADER_FEATURE_FLAG(8, TiledResources, "Tiled resources")
-SHADER_FEATURE_FLAG(9, StencilRef, "PS Output Stencil Ref")
-SHADER_FEATURE_FLAG(10, InnerCoverage, "PS Inner Coverage")
-SHADER_FEATURE_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats")
-SHADER_FEATURE_FLAG(12, ROVs, "Raster Ordered UAVs")
-SHADER_FEATURE_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer")
-SHADER_FEATURE_FLAG(14, WaveOps, "Wave level operations")
-SHADER_FEATURE_FLAG(15, Int64Ops, "64-Bit integer")
-SHADER_FEATURE_FLAG(16, ViewID, "View Instancing")
-SHADER_FEATURE_FLAG(17, Barycentrics, "Barycentrics")
-SHADER_FEATURE_FLAG(18, NativeLowPrecision, "Use native low precision")
-SHADER_FEATURE_FLAG(19, ShadingRate, "Shading Rate")
-SHADER_FEATURE_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features")
-SHADER_FEATURE_FLAG(21, SamplerFeedback, "Sampler feedback")
-SHADER_FEATURE_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources")
-SHADER_FEATURE_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared")
-SHADER_FEATURE_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders")
-SHADER_FEATURE_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing")
-SHADER_FEATURE_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing")
-SHADER_FEATURE_FLAG(27, RESERVED, "<RESERVED>")
-SHADER_FEATURE_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources")
-SHADER_FEATURE_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops")
-SHADER_FEATURE_FLAG(30, WriteableMSAATextures, "Writeable MSAA Textures")
-
-SHADER_FEATURE_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)")
+// SHADER_FEATURE_FLAG(bit offset for the shader info flag, bit offset for DXIL module flag, name, description.
+
+SHADER_FEATURE_FLAG(0,   2, Doubles, "Double-precision floating point")
+SHADER_FEATURE_FLAG(1,  17, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers")
+SHADER_FEATURE_FLAG(2,  16, UAVsAtEveryStage, "UAVs at every shader stage")
+SHADER_FEATURE_FLAG(3,  15, Max64UAVs, "64 UAV slots")
+SHADER_FEATURE_FLAG(4,  -1, MinimumPrecision, "Minimum-precision data types")
+SHADER_FEATURE_FLAG(5,   6, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1")
+SHADER_FEATURE_FLAG(6,   7, DX11_1_ShaderExtensions, "Shader extensions for 11.1")
+SHADER_FEATURE_FLAG(7,  14, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9")
+SHADER_FEATURE_FLAG(8,  12, TiledResources, "Tiled resources")
+SHADER_FEATURE_FLAG(9,  11, StencilRef, "PS Output Stencil Ref")
+SHADER_FEATURE_FLAG(10, 10, InnerCoverage, "PS Inner Coverage")
+SHADER_FEATURE_FLAG(11, 13, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats")
+SHADER_FEATURE_FLAG(12, 18, ROVs, "Raster Ordered UAVs")
+SHADER_FEATURE_FLAG(13,  9, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer")
+SHADER_FEATURE_FLAG(14, 19, WaveOps, "Wave level operations")
+SHADER_FEATURE_FLAG(15, 20, Int64Ops, "64-Bit integer")
+SHADER_FEATURE_FLAG(16, 21, ViewID, "View Instancing")
+SHADER_FEATURE_FLAG(17, 22, Barycentrics, "Barycentrics")
+SHADER_FEATURE_FLAG(18, -1, NativeLowPrecision, "Use native low precision")
+SHADER_FEATURE_FLAG(19, 24, ShadingRate, "Shading Rate")
+SHADER_FEATURE_FLAG(20, 25, Raytracing_Tier_1_1, "Raytracing tier 1.1 features")
+SHADER_FEATURE_FLAG(21, 26, SamplerFeedback, "Sampler feedback")
+SHADER_FEATURE_FLAG(22, 27, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources")
+SHADER_FEATURE_FLAG(23, 28, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared")
+SHADER_FEATURE_FLAG(24, 29, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders")
+SHADER_FEATURE_FLAG(25, 30, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing")
+SHADER_FEATURE_FLAG(26, 31, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing")
+SHADER_FEATURE_FLAG(27, 63, RESERVED, "<RESERVED>")
+SHADER_FEATURE_FLAG(28, 32, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources")
+SHADER_FEATURE_FLAG(29, 34, AdvancedTextureOps, "Advanced Texture Ops")
+SHADER_FEATURE_FLAG(30, 35, WriteableMSAATextures, "Writeable MSAA Textures")
+
+SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a flag)")
 
 #undef SHADER_FEATURE_FLAG
-#endif
+#endif // SHADER_FEATURE_FLAG
+
+#ifdef DXIL_MODULE_FLAG
+
+// Only save DXIL module flags which not map to feature flags here.
+DXIL_MODULE_FLAG( 0,  DisableOptimizations,   "D3D11_1_SB_GLOBAL_FLAG_SKIP_OPTIMIZATION")
+DXIL_MODULE_FLAG( 1,  DisableMathRefactoring, "D3D10_SB_GLOBAL_FLAG_REFACTORING_ALLOWED")
+DXIL_MODULE_FLAG( 3,  ForceEarlyDepthStencil, "D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL")
+DXIL_MODULE_FLAG( 4,  EnableRawAndStructuredBuffers, "D3D11_SB_GLOBAL_FLAG_ENABLE_RAW_AND_STRUCTURED_BUFFERS")
+DXIL_MODULE_FLAG( 5,  LowPrecisionPresent, "D3D11_1_SB_GLOBAL_FLAG_ENABLE_MINIMUM_PRECISION")
+DXIL_MODULE_FLAG( 8,  AllResourcesBound, "D3D12_SB_GLOBAL_FLAG_ALL_RESOURCES_BOUND")
+DXIL_MODULE_FLAG(23,  UseNativeLowPrecision, "Native 16bit types enabled")
+DXIL_MODULE_FLAG(33,  ResMayNotAlias, "Any UAV may not alias any other UAV")
+
+#undef DXIL_MODULE_FLAG
+#endif // DXIL_MODULE_FLAG
 
 #ifdef SEMANTIC_KIND
 
@@ -86,7 +103,7 @@ SEMANTIC_KIND(30, CullPrimitive)
 SEMANTIC_KIND(30, Invalid)
 
 #undef SEMANTIC_KIND
-#endif
+#endif // SEMANTIC_KIND
 
 #ifdef COMPONENT_TYPE
 
@@ -102,7 +119,7 @@ COMPONENT_TYPE(8, SInt64)
 COMPONENT_TYPE(9, Float64)
 
 #undef COMPONENT_TYPE
-#endif
+#endif // COMPONENT_TYPE
 
 #ifdef COMPONENT_PRECISION
 
@@ -116,7 +133,7 @@ COMPONENT_PRECISION(0xf0, Any16)
 COMPONENT_PRECISION(0xf1, Any10)
 
 #undef COMPONENT_PRECISION
-#endif
+#endif // COMPONENT_PRECISION
 
 #ifdef INTERPOLATION_MODE
 
@@ -131,7 +148,7 @@ INTERPOLATION_MODE(7, LinearNoperspectiveSample)
 INTERPOLATION_MODE(8, Invalid)
 
 #undef INTERPOLATION_MODE
-#endif
+#endif // INTERPOLATION_MODE
 
 #ifdef D3D_SYSTEM_VALUE
 
@@ -165,4 +182,4 @@ D3D_SYSTEM_VALUE(70, InnerCoverage)
 
 #undef D3D_SYSTEM_VALUE
 
-#endif
+#endif // D3D_SYSTEM_VALUE
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index bace3a92677a8..877f3f7862c8b 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1141,6 +1141,8 @@ enum : unsigned {
 
   SHT_CSKY_ATTRIBUTES = 0x70000001U,
 
+  SHT_HEXAGON_ATTRIBUTES = 0x70000003U,
+
   SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type.
   SHT_LOUSER = 0x80000000, // Lowest type reserved for applications.
   SHT_HIUSER = 0xffffffff  // Highest type reserved for applications.
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index c0a52d64a101d..39303e6485214 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -624,6 +624,17 @@ enum FunctionCodes {
                                   //             operation, align, vol,
                                   //             ordering, synchscope]
   FUNC_CODE_BLOCKADDR_USERS = 60, // BLOCKADDR_USERS: [value...]
+
+  FUNC_CODE_DEBUG_RECORD_VALUE =
+      61, // [DILocation, DILocalVariable, DIExpression, ValueAsMetadata]
+  FUNC_CODE_DEBUG_RECORD_DECLARE =
+      62, // [DILocation, DILocalVariable, DIExpression, ValueAsMetadata]
+  FUNC_CODE_DEBUG_RECORD_ASSIGN =
+      63, // [DILocation, DILocalVariable, DIExpression, ValueAsMetadata,
+          //  DIAssignID, DIExpression (addr), ValueAsMetadata (addr)]
+  FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE =
+      64, // [DILocation, DILocalVariable, DIExpression, Value]
+  FUNC_CODE_DEBUG_RECORD_LABEL = 65, // [DILocation, DILabel]
 };
 
 enum UseListCodes {
diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index 31af3014afe4e..45a47d7333e35 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -191,7 +191,7 @@ class FunctionLoweringInfo {
   /// Collection of dbg.declare instructions handled after argument
   /// lowering and before ISel proper.
   SmallPtrSet<const DbgDeclareInst *, 8> PreprocessedDbgDeclares;
-  SmallPtrSet<const DPValue *, 8> PreprocessedDPVDeclares;
+  SmallPtrSet<const DbgVariableRecord *, 8> PreprocessedDVRDeclares;
 
   /// set - Initialize this FunctionLoweringInfo with the given Function
   /// and its associated MachineFunction.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
index 3c3b91661d578..4c187a3068d82 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -117,6 +117,9 @@ class CallLowering {
     /// vreg that the swifterror should be copied into after the call.
     Register SwiftErrorVReg;
 
+    /// Valid if the call is a controlled convergent operation.
+    Register ConvergenceCtrlToken;
+
     /// Original IR callsite corresponding to this call, if available.
     const CallBase *CB = nullptr;
 
@@ -584,6 +587,7 @@ class CallLowering {
   bool lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &Call,
                  ArrayRef<Register> ResRegs,
                  ArrayRef<ArrayRef<Register>> ArgRegs, Register SwiftErrorVReg,
+                 Register ConvergenceCtrlToken,
                  std::function<unsigned()> GetCalleeReg) const;
 
   /// For targets which want to use big-endian can enable it with
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 23728636498ba..9e8fc5d635c50 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -696,10 +696,6 @@ class CombinerHelper {
   /// (G_*MULO x, 0) -> 0 + no carry out
   bool matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Match:
-  /// (G_*ADDO x, 0) -> x + no carry out
-  bool matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
-
   /// Match:
   /// (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
   /// (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -810,12 +806,15 @@ class CombinerHelper {
   /// Combine selects.
   bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Combine ands,
+  /// Combine ands.
   bool matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Combine ors,
+  /// Combine ors.
   bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Combine addos.
+  bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -919,6 +918,7 @@ class CombinerHelper {
   bool isZeroOrZeroSplat(Register Src, bool AllowUndefs);
   bool isConstantSplatVector(Register Src, int64_t SplatValue,
                              bool AllowUndefs);
+  bool isConstantOrConstantVectorI(Register Src) const;
 
   std::optional<APInt> getConstantOrConstantSplatVector(Register Src);
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
index 8e456015cd373..c73ac2c9f55b7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
@@ -647,15 +647,15 @@ bool GIMatchTableExecutor::executeMatchTable(
 
       unsigned Size = MRI.getType(MO.getReg()).getSizeInBits();
       if (MatcherOpcode == GIM_CheckMemorySizeEqualToLLT &&
-          MMO->getSizeInBits() != Size) {
+          MMO->getSizeInBits().getValue() != Size) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeLessThanLLT &&
-                 MMO->getSizeInBits() >= Size) {
+                 MMO->getSizeInBits().getValue() >= Size) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeGreaterThanLLT &&
-                 MMO->getSizeInBits() <= Size)
+                 MMO->getSizeInBits().getValue() <= Size)
         if (handleReject() == RejectAndGiveUp)
           return false;
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index f5a6528d10a97..261cfcf504d5f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -54,9 +54,9 @@ class GMemOperation : public GenericMachineInstr {
   bool isUnordered() const { return getMMO().isUnordered(); }
 
   /// Returns the size in bytes of the memory access.
-  uint64_t getMemSize() const { return getMMO().getSize(); }
+  LocationSize getMemSize() const { return getMMO().getSize(); }
   /// Returns the size in bits of the memory access.
-  uint64_t getMemSizeInBits() const { return getMMO().getSizeInBits(); }
+  LocationSize getMemSizeInBits() const { return getMMO().getSizeInBits(); }
 
   static bool classof(const MachineInstr *MI) {
     return GenericMachineInstr::classof(MI) && MI->hasOneMemOperand();
@@ -359,6 +359,8 @@ class GBinOpCarryOut : public GenericMachineInstr {
   Register getCarryOutReg() const { return getReg(1); }
   MachineOperand &getLHS() { return getOperand(2); }
   MachineOperand &getRHS() { return getOperand(3); }
+  Register getLHSReg() const { return getOperand(2).getReg(); }
+  Register getRHSReg() const { return getOperand(3).getReg(); }
 
   static bool classof(const MachineInstr *MI) {
     switch (MI->getOpcode()) {
@@ -429,6 +431,23 @@ class GAddSubCarryOut : public GBinOpCarryOut {
   }
 };
 
+/// Represents overflowing add operations.
+/// G_UADDO, G_SADDO
+class GAddCarryOut : public GBinOpCarryOut {
+public:
+  bool isSigned() const { return getOpcode() == TargetOpcode::G_SADDO; }
+
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_UADDO:
+    case TargetOpcode::G_SADDO:
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+
 /// Represents overflowing add/sub operations that also consume a carry-in.
 /// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
 class GAddSubCarryInOut : public GAddSubCarryOut {
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index bfac54a65c5b4..6ae7c14409079 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -205,7 +205,7 @@ class IRTranslator : public MachineFunctionPass {
   bool translate(const Constant &C, Register Reg);
 
   /// Examine any debug-info attached to the instruction (in the form of
-  /// DPValues) and translate it.
+  /// DbgRecords) and translate it.
   void translateDbgInfo(const Instruction &Inst,
                           MachineIRBuilder &MIRBuilder);
 
@@ -243,6 +243,14 @@ class IRTranslator : public MachineFunctionPass {
   bool translateMemFunc(const CallInst &CI, MachineIRBuilder &MIRBuilder,
                         unsigned Opcode);
 
+  // Translate @llvm.experimental.vector.interleave2 and
+  // @llvm.experimental.vector.deinterleave2 intrinsics for fixed-width vector
+  // types into vector shuffles.
+  bool translateVectorInterleave2Intrinsic(const CallInst &CI,
+                                           MachineIRBuilder &MIRBuilder);
+  bool translateVectorDeinterleave2Intrinsic(const CallInst &CI,
+                                             MachineIRBuilder &MIRBuilder);
+
   void getStackGuard(Register DstReg, MachineIRBuilder &MIRBuilder);
 
   bool translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
@@ -579,6 +587,10 @@ class IRTranslator : public MachineFunctionPass {
     return false;
   }
 
+  bool translateConvergenceControlIntrinsic(const CallInst &CI,
+                                            Intrinsic::ID ID,
+                                            MachineIRBuilder &MIRBuilder);
+
   /// @}
 
   // Builder for machine instruction a la IRBuilder.
@@ -697,6 +709,23 @@ class IRTranslator : public MachineFunctionPass {
     return Regs[0];
   }
 
+  Register getOrCreateConvergenceTokenVReg(const Value &Token) {
+    assert(Token.getType()->isTokenTy());
+    auto &Regs = *VMap.getVRegs(Token);
+    if (!Regs.empty()) {
+      assert(Regs.size() == 1 &&
+             "Expected a single register for convergence tokens.");
+      return Regs[0];
+    }
+
+    auto Reg = MRI->createGenericVirtualRegister(LLT::token());
+    Regs.push_back(Reg);
+    auto &Offsets = *VMap.getOffsets(Token);
+    if (Offsets.empty())
+      Offsets.push_back(0);
+    return Reg;
+  }
+
   /// Allocate some vregs and offsets in the VMap. Then populate just the
   /// offsets while leaving the vregs empty.
   ValueToVRegInfo::VRegListT &allocateVRegs(const Value &Val);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index da330b517c280..ca62f38061b11 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -432,9 +432,13 @@ class LegalizationArtifactCombiner {
             DestTy.isVector() ? CastSrcTy.getNumElements() / NumDefs : 1;
         LLT UnmergeTy = CastSrcTy.changeElementCount(
             ElementCount::getFixed(UnmergeNumElts));
+        LLT SrcWideTy =
+            SrcTy.changeElementCount(ElementCount::getFixed(UnmergeNumElts));
 
         if (isInstUnsupported(
-                {TargetOpcode::G_UNMERGE_VALUES, {UnmergeTy, CastSrcTy}}))
+                {TargetOpcode::G_UNMERGE_VALUES, {UnmergeTy, CastSrcTy}}) ||
+            LI.getAction({TargetOpcode::G_TRUNC, {SrcWideTy, UnmergeTy}})
+                    .Action == LegalizeActions::MoreElements)
           return false;
 
         Builder.setInstr(MI);
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 7d11d63d4066f..0fe73fec7ee67 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -638,13 +638,17 @@ class MachineFrameInfo {
   bool hasTailCall() const { return HasTailCall; }
   void setHasTailCall(bool V = true) { HasTailCall = V; }
 
-  /// Computes the maximum size of a callframe and the AdjustsStack property.
+  /// Computes the maximum size of a callframe.
   /// This only works for targets defining
   /// TargetInstrInfo::getCallFrameSetupOpcode(), getCallFrameDestroyOpcode(),
   /// and getFrameSize().
   /// This is usually computed by the prologue epilogue inserter but some
   /// targets may call this to compute it earlier.
-  void computeMaxCallFrameSize(const MachineFunction &MF);
+  /// If FrameSDOps is passed, the frame instructions in the MF will be
+  /// inserted into it.
+  void computeMaxCallFrameSize(
+      MachineFunction &MF,
+      std::vector<MachineBasicBlock::iterator> *FrameSDOps = nullptr);
 
   /// Return the maximum size of a call frame that must be
   /// allocated for an outgoing function call.  This is only available if
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index a2c90c9f42f38..dfbf7a1e7aae5 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -1026,18 +1026,27 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   /// MachineMemOperands are owned by the MachineFunction and need not be
   /// explicitly deallocated.
   MachineMemOperand *getMachineMemOperand(
-      MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
+      MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy,
       Align base_alignment, const AAMDNodes &AAInfo = AAMDNodes(),
       const MDNode *Ranges = nullptr, SyncScope::ID SSID = SyncScope::System,
       AtomicOrdering Ordering = AtomicOrdering::NotAtomic,
       AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic);
-
   MachineMemOperand *getMachineMemOperand(
-      MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy,
-      Align base_alignment, const AAMDNodes &AAInfo = AAMDNodes(),
+      MachinePointerInfo PtrInfo, MachineMemOperand::Flags F, LocationSize Size,
+      Align BaseAlignment, const AAMDNodes &AAInfo = AAMDNodes(),
       const MDNode *Ranges = nullptr, SyncScope::ID SSID = SyncScope::System,
       AtomicOrdering Ordering = AtomicOrdering::NotAtomic,
       AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic);
+  MachineMemOperand *getMachineMemOperand(
+      MachinePointerInfo PtrInfo, MachineMemOperand::Flags F, uint64_t Size,
+      Align BaseAlignment, const AAMDNodes &AAInfo = AAMDNodes(),
+      const MDNode *Ranges = nullptr, SyncScope::ID SSID = SyncScope::System,
+      AtomicOrdering Ordering = AtomicOrdering::NotAtomic,
+      AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic) {
+    return getMachineMemOperand(PtrInfo, F, LocationSize::precise(Size),
+                                BaseAlignment, AAInfo, Ranges, SSID, Ordering,
+                                FailureOrdering);
+  }
 
   /// getMachineMemOperand - Allocate a new MachineMemOperand by copying
   /// an existing one, adjusting by an offset and using the given size.
@@ -1046,9 +1055,16 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
                                           int64_t Offset, LLT Ty);
   MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
-                                          int64_t Offset, uint64_t Size) {
+                                          int64_t Offset, LocationSize Size) {
     return getMachineMemOperand(
-        MMO, Offset, Size == ~UINT64_C(0) ? LLT() : LLT::scalar(8 * Size));
+        MMO, Offset,
+        !Size.hasValue() || Size.isScalable()
+            ? LLT()
+            : LLT::scalar(8 * Size.getValue().getKnownMinValue()));
+  }
+  MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
+                                          int64_t Offset, uint64_t Size) {
+    return getMachineMemOperand(MMO, Offset, LocationSize::precise(Size));
   }
 
   /// getMachineMemOperand - Allocate a new MachineMemOperand by copying
@@ -1057,10 +1073,15 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   /// explicitly deallocated.
   MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
                                           const MachinePointerInfo &PtrInfo,
-                                          uint64_t Size);
+                                          LocationSize Size);
   MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
                                           const MachinePointerInfo &PtrInfo,
                                           LLT Ty);
+  MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
+                                          const MachinePointerInfo &PtrInfo,
+                                          uint64_t Size) {
+    return getMachineMemOperand(MMO, PtrInfo, LocationSize::precise(Size));
+  }
 
   /// Allocate a new MachineMemOperand by copying an existing one,
   /// replacing only AliasAnalysis information. MachineMemOperands are owned
diff --git a/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
index 445c9b1c3bc00..967c4a70ca469 100644
--- a/llvm/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
@@ -89,6 +89,9 @@ class MachineLoop : public LoopBase<MachineBasicBlock, MachineLoop> {
 private:
   friend class LoopInfoBase<MachineBasicBlock, MachineLoop>;
 
+  /// Returns true if the given physreg has no defs inside the loop.
+  bool isLoopInvariantImplicitPhysReg(Register Reg) const;
+
   explicit MachineLoop(MachineBasicBlock *MBB)
     : LoopBase<MachineBasicBlock, MachineLoop>(MBB) {}
 
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index 12c18aaea5b26..da4ca582cb9e4 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -17,6 +17,7 @@
 
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/PointerUnion.h"
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGenTypes/LowLevelType.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -186,7 +187,7 @@ class MachineMemOperand {
   /// and atomic ordering requirements must also be specified. For cmpxchg
   /// atomic operations the atomic ordering requirements when store does not
   /// occur must also be specified.
-  MachineMemOperand(MachinePointerInfo PtrInfo, Flags flags, uint64_t s,
+  MachineMemOperand(MachinePointerInfo PtrInfo, Flags flags, LocationSize TS,
                     Align a, const AAMDNodes &AAInfo = AAMDNodes(),
                     const MDNode *Ranges = nullptr,
                     SyncScope::ID SSID = SyncScope::System,
@@ -235,13 +236,17 @@ class MachineMemOperand {
   LLT getMemoryType() const { return MemoryType; }
 
   /// Return the size in bytes of the memory reference.
-  uint64_t getSize() const {
-    return MemoryType.isValid() ? MemoryType.getSizeInBytes() : ~UINT64_C(0);
+  LocationSize getSize() const {
+    return MemoryType.isValid()
+               ? LocationSize::precise(MemoryType.getSizeInBytes())
+               : LocationSize::beforeOrAfterPointer();
   }
 
   /// Return the size in bits of the memory reference.
-  uint64_t getSizeInBits() const {
-    return MemoryType.isValid() ? MemoryType.getSizeInBits() : ~UINT64_C(0);
+  LocationSize getSizeInBits() const {
+    return MemoryType.isValid()
+               ? LocationSize::precise(MemoryType.getSizeInBits())
+               : LocationSize::beforeOrAfterPointer();
   }
 
   LLT getType() const {
diff --git a/llvm/include/llvm/CodeGen/MachinePassManager.h b/llvm/include/llvm/CodeGen/MachinePassManager.h
index 7713c55661ccc..3faffe5c4cab2 100644
--- a/llvm/include/llvm/CodeGen/MachinePassManager.h
+++ b/llvm/include/llvm/CodeGen/MachinePassManager.h
@@ -56,7 +56,7 @@ struct MachinePassConcept
 };
 
 template <typename PassT> struct MachinePassModel : MachinePassConcept {
-  explicit MachinePassModel(PassT Pass) : Pass(std::move(Pass)) {}
+  explicit MachinePassModel(PassT &&Pass) : Pass(std::move(Pass)) {}
   // We have to explicitly define all the special member functions because MSVC
   // refuses to generate them.
   MachinePassModel(const MachinePassModel &Arg) : Pass(Arg.Pass) {}
@@ -72,6 +72,7 @@ template <typename PassT> struct MachinePassModel : MachinePassConcept {
     return *this;
   }
 
+  MachinePassModel &operator=(const MachinePassModel &) = delete;
   PreservedAnalyses run(MachineFunction &IR,
                         MachineFunctionAnalysisManager &AM) override {
     return Pass.run(IR, AM);
diff --git a/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h b/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h
index 3dc0731f5a04e..4b8eed7bdb1b5 100644
--- a/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h
+++ b/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h
@@ -27,10 +27,7 @@ class NonRelocatableStringpool {
   /// order.
   using MapTy = StringMap<DwarfStringPoolEntry, BumpPtrAllocator>;
 
-  NonRelocatableStringpool(
-      std::function<StringRef(StringRef Input)> Translator = nullptr,
-      bool PutEmptyString = false)
-      : Translator(Translator) {
+  NonRelocatableStringpool(bool PutEmptyString = false) {
     if (PutEmptyString)
       getEntry("");
   }
@@ -59,7 +56,6 @@ class NonRelocatableStringpool {
   MapTy Strings;
   uint64_t CurrentEndOffset = 0;
   unsigned NumEntries = 0;
-  std::function<StringRef(StringRef Input)> Translator;
 };
 
 /// Helper for making strong types.
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index a86c7400fa094..541c0ecb5be37 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -510,6 +510,26 @@ inline BinaryOpc_match<LHS, RHS, true> m_Xor(const LHS &L, const RHS &R) {
   return BinaryOpc_match<LHS, RHS, true>(ISD::XOR, L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_SMin(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::SMIN, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_SMax(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::SMAX, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_UMin(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::UMIN, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_UMax(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::UMAX, L, R);
+}
+
 template <typename LHS, typename RHS>
 inline BinaryOpc_match<LHS, RHS, false> m_UDiv(const LHS &L, const RHS &R) {
   return BinaryOpc_match<LHS, RHS, false>(ISD::UDIV, L, R);
@@ -705,6 +725,18 @@ inline auto m_False() {
       },
       m_Value()};
 }
+
+/// Match a negate as a sub(0, v)
+template <typename ValTy>
+inline BinaryOpc_match<SpecificInt_match, ValTy> m_Neg(const ValTy &V) {
+  return m_Sub(m_Zero(), V);
+}
+
+/// Match a Not as a xor(v, -1) or xor(-1, v)
+template <typename ValTy>
+inline BinaryOpc_match<ValTy, SpecificInt_match, true> m_Not(const ValTy &V) {
+  return m_Xor(V, m_AllOnes());
+}
 } // namespace SDPatternMatch
 } // namespace llvm
 #endif
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 25e6c525b672a..4785e93d72d1c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1299,7 +1299,7 @@ class SelectionDAG {
       EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
       MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
                                        MachineMemOperand::MOStore,
-      uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes());
+      LocationSize Size = 0, const AAMDNodes &AAInfo = AAMDNodes());
 
   inline SDValue getMemIntrinsicNode(
       unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
@@ -1307,7 +1307,7 @@ class SelectionDAG {
       MaybeAlign Alignment = std::nullopt,
       MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
                                        MachineMemOperand::MOStore,
-      uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) {
+      LocationSize Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) {
     // Ensure that codegen never sees alignment 0
     return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, PtrInfo,
                                Alignment.value_or(getEVTAlign(MemVT)), Flags,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index d1015630b05d1..261f7e49e5c8c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -558,6 +558,7 @@ BEGIN_TWO_BYTE_PACK()
 
   class LoadSDNodeBitfields {
     friend class LoadSDNode;
+    friend class AtomicSDNode;
     friend class VPLoadSDNode;
     friend class VPStridedLoadSDNode;
     friend class MaskedLoadSDNode;
@@ -1475,6 +1476,16 @@ class AtomicSDNode : public MemSDNode {
             MMO->isAtomic()) && "then why are we using an AtomicSDNode?");
   }
 
+  void setExtensionType(ISD::LoadExtType ETy) {
+    assert(getOpcode() == ISD::ATOMIC_LOAD && "Only used for atomic loads.");
+    LoadSDNodeBits.ExtTy = ETy;
+  }
+
+  ISD::LoadExtType getExtensionType() const {
+    assert(getOpcode() == ISD::ATOMIC_LOAD && "Only used for atomic loads.");
+    return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
+  }
+
   const SDValue &getBasePtr() const {
     return getOpcode() == ISD::ATOMIC_STORE ? getOperand(2) : getOperand(1);
   }
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index be4ee5b6f9e29..9fd0ebe6956fb 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -204,6 +204,7 @@ class TargetInstrInfo : public MCInstrInfo {
   /// if they exist (-1 otherwise).  Some targets use pseudo instructions in
   /// order to abstract away the difference between operating with a frame
   /// pointer and operating without, through the use of these two instructions.
+  /// A FrameSetup MI in MF implies MFI::AdjustsStack.
   ///
   unsigned getCallFrameSetupOpcode() const { return CallFrameSetupOpcode; }
   unsigned getCallFrameDestroyOpcode() const { return CallFrameDestroyOpcode; }
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index e7c9ecd2e1851..117d3f7182974 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -572,6 +572,12 @@ class TargetRegisterInfo : public MCRegisterInfo {
     return false;
   }
 
+  /// Returns true if MachineLoopInfo should analyze the given physreg
+  /// for loop invariance.
+  virtual bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const {
+    return false;
+  }
+
   /// Physical registers that may be modified within a function but are
   /// guaranteed to be restored before any uses. This is useful for targets that
   /// have call sequences where a GOT register may be updated by the caller
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
index 5a16cffb80b32..62ee28cfac99c 100644
--- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h
+++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
@@ -45,6 +45,14 @@ class LLT {
                /*AddressSpace=*/0};
   }
 
+  /// Get a low-level token; just a scalar with zero bits (or no size).
+  static constexpr LLT token() {
+    return LLT{/*isPointer=*/false, /*isVector=*/false,
+               /*isScalar=*/true,   ElementCount::getFixed(0),
+               /*SizeInBits=*/0,
+               /*AddressSpace=*/0};
+  }
+
   /// Get a low-level pointer in the given address space.
   static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
     assert(SizeInBits > 0 && "invalid pointer size");
@@ -134,17 +142,17 @@ class LLT {
 
   explicit LLT(MVT VT);
 
-  constexpr bool isValid() const { return IsScalar || IsPointer || IsVector; }
-
+  constexpr bool isValid() const { return IsScalar || RawData != 0; }
   constexpr bool isScalar() const { return IsScalar; }
-
-  constexpr bool isPointer() const { return IsPointer && !IsVector; }
-
-  constexpr bool isPointerVector() const { return IsPointer && IsVector; }
-
-  constexpr bool isPointerOrPointerVector() const { return IsPointer; }
-
-  constexpr bool isVector() const { return IsVector; }
+  constexpr bool isToken() const { return IsScalar && RawData == 0; };
+  constexpr bool isVector() const { return isValid() && IsVector; }
+  constexpr bool isPointer() const {
+    return isValid() && IsPointer && !IsVector;
+  }
+  constexpr bool isPointerVector() const { return IsPointer && isVector(); }
+  constexpr bool isPointerOrPointerVector() const {
+    return IsPointer && isValid();
+  }
 
   /// Returns the number of elements in a vector LLT. Must only be called on
   /// vector types.
@@ -314,6 +322,28 @@ class LLT {
   /// described in static const *Field variables. Each of these variables
   /// is a 2-element array, with the first element describing the bitfield size
   /// and the second element describing the bitfield offset.
+  ///
+  /// +--------+---------+--------+----------+----------------------+
+  /// |isScalar|isPointer|isVector| RawData  |Notes                 |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    0    |   0    |    0     |Invalid               |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    0    |   1    |    0     |Tombstone Key         |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    1    |   0    |    0     |Empty Key             |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   1    |    0    |   0    |    0     |Token                 |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   1    |    0    |   0    | non-zero |Scalar                |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    1    |   0    | non-zero |Pointer               |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    0    |   1    | non-zero |Vector of non-pointer |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    1    |   1    | non-zero |Vector of pointer     |
+  /// +--------+---------+--------+----------+----------------------+
+  ///
+  /// Everything else is reserved.
   typedef int BitFieldInfo[2];
   ///
   /// This is how the bitfields are packed per Kind:
diff --git a/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h b/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h
index bebdfd5e60257..e7a1a3cd838c2 100644
--- a/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h
+++ b/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h
@@ -45,16 +45,13 @@ class DwarfStreamer : public DwarfEmitter {
 public:
   DwarfStreamer(DWARFLinkerBase::OutputFileType OutFileType,
                 raw_pwrite_stream &OutFile,
-                DWARFLinkerBase::TranslatorFuncTy Translator,
                 DWARFLinkerBase::MessageHandlerTy Warning)
-      : OutFile(OutFile), OutFileType(OutFileType), Translator(Translator),
-        WarningHandler(Warning) {}
+      : OutFile(OutFile), OutFileType(OutFileType), WarningHandler(Warning) {}
   virtual ~DwarfStreamer() = default;
 
   static Expected<std::unique_ptr<DwarfStreamer>> createStreamer(
       const Triple &TheTriple, DWARFLinkerBase::OutputFileType FileType,
-      raw_pwrite_stream &OutFile, DWARFLinkerBase::TranslatorFuncTy Translator,
-      DWARFLinkerBase::MessageHandlerTy Warning);
+      raw_pwrite_stream &OutFile, DWARFLinkerBase::MessageHandlerTy Warning);
 
   Error init(Triple TheTriple, StringRef Swift5ReflectionSegmentName);
 
@@ -295,7 +292,6 @@ class DwarfStreamer : public DwarfEmitter {
   /// The output file we stream the linked Dwarf to.
   raw_pwrite_stream &OutFile;
   DWARFLinker::OutputFileType OutFileType = DWARFLinker::OutputFileType::Object;
-  std::function<StringRef(StringRef Input)> Translator;
 
   uint64_t RangesSectionSize = 0;
   uint64_t RngListsSectionSize = 0;
diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h b/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h
index 5c811b668f0a3..70b25cf02df7f 100644
--- a/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h
+++ b/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h
@@ -82,7 +82,6 @@ class DWARFLinkerBase {
       std::function<void(const DWARFFile &File, llvm::StringRef Output)>;
   using ObjectPrefixMapTy = std::map<std::string, std::string>;
   using CompileUnitHandlerTy = function_ref<void(const DWARFUnit &Unit)>;
-  using TranslatorFuncTy = std::function<StringRef(StringRef)>;
   using SwiftInterfacesMapTy = std::map<std::string, std::string>;
   /// Type of output file.
   enum class OutputFileType : uint8_t {
diff --git a/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h
index 5312712a4a5b8..8acc046d07249 100644
--- a/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h
+++ b/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h
@@ -123,8 +123,7 @@ class DWARFLinker : public DWARFLinkerBase {
 
   /// Creates dwarf linker instance.
   static std::unique_ptr<DWARFLinker>
-  createLinker(MessageHandlerTy ErrorHandler, MessageHandlerTy WarningHandler,
-               TranslatorFuncTy StringsTranslator = nullptr);
+  createLinker(MessageHandlerTy ErrorHandler, MessageHandlerTy WarningHandler);
 
   /// Set output DWARF handler. Result of linking DWARF is set of sections
   /// containing final debug info. DWARFLinkerBase::link() pass generated
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
index a66cf4608823b..f76f2ecd3e212 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
@@ -122,6 +122,48 @@ class LVBinaryReader : public LVReader {
   std::unique_ptr<MCContext> MC;
   std::unique_ptr<MCInstPrinter> MIP;
 
+  // https://yurydelendik.github.io/webassembly-dwarf/
+  // 2. Consuming and Generating DWARF for WebAssembly Code
+  // Note: Some DWARF constructs don't map one-to-one onto WebAssembly
+  // constructs. We strive to enumerate and resolve any ambiguities here.
+  //
+  // 2.1. Code Addresses
+  // Note: DWARF associates various bits of debug info
+  // with particular locations in the program via its code address (instruction
+  // pointer or PC). However, WebAssembly's linear memory address space does not
+  // contain WebAssembly instructions.
+  //
+  // Wherever a code address (see 2.17 of [DWARF]) is used in DWARF for
+  // WebAssembly, it must be the offset of an instruction relative within the
+  // Code section of the WebAssembly file. The DWARF is considered malformed if
+  // a PC offset is between instruction boundaries within the Code section.
+  //
+  // Note: It is expected that a DWARF consumer does not know how to decode
+  // WebAssembly instructions. The instruction pointer is selected as the offset
+  // in the binary file of the first byte of the instruction, and it is
+  // consistent with the WebAssembly Web API conventions definition of the code
+  // location.
+  //
+  // EXAMPLE: .DEBUG_LINE INSTRUCTION POINTERS
+  // The .debug_line DWARF section maps instruction pointers to source
+  // locations. With WebAssembly, the .debug_line section maps Code
+  // section-relative instruction offsets to source locations.
+  //
+  // EXAMPLE: DW_AT_* ATTRIBUTES
+  // For entities with a single associated code address, DWARF uses
+  // the DW_AT_low_pc attribute to specify the associated code address value.
+  // For WebAssembly, the DW_AT_low_pc's value is a Code section-relative
+  // instruction offset.
+  //
+  // For entities with a single contiguous range of code, DWARF uses a
+  // pair of DW_AT_low_pc and DW_AT_high_pc attributes to specify the associated
+  // contiguous range of code address values. For WebAssembly, these attributes
+  // are Code section-relative instruction offsets.
+  //
+  // For entities with multiple ranges of code, DWARF uses the DW_AT_ranges
+  // attribute, which refers to the array located at the .debug_ranges section.
+  LVAddress WasmCodeSectionOffset = 0;
+
   // Loads all info for the architecture of the provided object file.
   Error loadGenericTargetInfo(StringRef TheTriple, StringRef TheFeatures);
 
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVELFReader.h b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h
similarity index 88%
rename from llvm/include/llvm/DebugInfo/LogicalView/Readers/LVELFReader.h
rename to llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h
index 0837b886a273a..22e804a459f87 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVELFReader.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h
@@ -1,4 +1,4 @@
-//===-- LVELFReader.h -------------------------------------------*- C++ -*-===//
+//===-- LVDWARFReader.h -----------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the LVELFReader class, which is used to describe a
+// This file defines the LVDWARFReader class, which is used to describe a
 // debug information (DWARF) reader.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVELFREADER_H
-#define LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVELFREADER_H
+#ifndef LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVDWARFREADER_H
+#define LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVDWARFREADER_H
 
 #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
@@ -30,7 +30,7 @@ class LVType;
 
 using AttributeSpec = DWARFAbbreviationDeclaration::AttributeSpec;
 
-class LVELFReader final : public LVBinaryReader {
+class LVDWARFReader final : public LVBinaryReader {
   object::ObjectFile &Obj;
 
   // Indicates if ranges data are available; in the case of split DWARF any
@@ -127,14 +127,14 @@ class LVELFReader final : public LVBinaryReader {
   void sortScopes() override;
 
 public:
-  LVELFReader() = delete;
-  LVELFReader(StringRef Filename, StringRef FileFormatName,
-              object::ObjectFile &Obj, ScopedPrinter &W)
+  LVDWARFReader() = delete;
+  LVDWARFReader(StringRef Filename, StringRef FileFormatName,
+                object::ObjectFile &Obj, ScopedPrinter &W)
       : LVBinaryReader(Filename, FileFormatName, W, LVBinaryType::ELF),
         Obj(Obj) {}
-  LVELFReader(const LVELFReader &) = delete;
-  LVELFReader &operator=(const LVELFReader &) = delete;
-  ~LVELFReader() = default;
+  LVDWARFReader(const LVDWARFReader &) = delete;
+  LVDWARFReader &operator=(const LVDWARFReader &) = delete;
+  ~LVDWARFReader() = default;
 
   LVAddress getCUBaseAddress() const { return CUBaseAddress; }
   void setCUBaseAddress(LVAddress Address) { CUBaseAddress = Address; }
@@ -158,4 +158,4 @@ class LVELFReader final : public LVBinaryReader {
 } // end namespace logicalview
 } // end namespace llvm
 
-#endif // LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVELFREADER_H
+#endif // LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVDWARFREADER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
index e928faf478855..2ffde1f9eb10e 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
@@ -60,9 +60,9 @@ class MachOPlatform : public Platform {
 
     struct BuildVersionOpts {
 
-      // Derive platform from triple.
-      static BuildVersionOpts fromTriple(const Triple &TT, uint32_t MinOS,
-                                         uint32_t SDK);
+      // Derive platform from triple if possible.
+      static std::optional<BuildVersionOpts>
+      fromTriple(const Triple &TT, uint32_t MinOS, uint32_t SDK);
 
       uint32_t Platform; // Platform.
       uint32_t MinOS;    // X.Y.Z is encoded in nibbles xxxx.yy.zz
@@ -73,13 +73,12 @@ class MachOPlatform : public Platform {
     /// will be used.
     std::optional<Dylib> IDDylib;
 
-    /// Override for LC_BUILD_VERSION. If this is nullopt then
-    std::optional<BuildVersionOpts> BuildVersion;
-
     /// List of LC_LOAD_DYLIBs.
     std::vector<Dylib> LoadDylibs;
     /// List of LC_RPATHs.
     std::vector<std::string> RPaths;
+    /// List of LC_BUILD_VERSIONs.
+    std::vector<BuildVersionOpts> BuildVersions;
 
     HeaderOptions() = default;
     HeaderOptions(Dylib D) : IDDylib(std::move(D)) {}
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/VTuneSharedStructs.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/VTuneSharedStructs.h
index 667d3446faff7..3517d0b091719 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/VTuneSharedStructs.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/VTuneSharedStructs.h
@@ -13,6 +13,10 @@
 #ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_VTUNESHAREDSTRUCTS_H
 #define LLVM_EXECUTIONENGINE_ORC_SHARED_VTUNESHAREDSTRUCTS_H
 
+#include "ExecutorAddress.h"
+#include <utility>
+#include <vector>
+
 namespace llvm {
 namespace orc {
 
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index 1bfaf3718552e..338b56226f204 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -72,7 +72,10 @@ enum class IdentFlag {
 #include "llvm/Frontend/OpenMP/OMPKinds.def"
 
 // Version of the kernel argument format used by the omp runtime.
-#define OMP_KERNEL_ARG_VERSION 2
+#define OMP_KERNEL_ARG_VERSION 3
+
+// Minimum version of the compiler that generates a kernel dynamic pointer.
+#define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3
 
 /// \note This needs to be kept in sync with kmp.h enum sched_type.
 /// Todo: Update kmp.h to include this file, and remove the enums in kmp.h
diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h
index 1ef32bcb121be..152f781ffa9b3 100644
--- a/llvm/include/llvm/IR/AutoUpgrade.h
+++ b/llvm/include/llvm/IR/AutoUpgrade.h
@@ -88,9 +88,6 @@ namespace llvm {
   /// info. Return true if module is modified.
   bool UpgradeDebugInfo(Module &M);
 
-  /// Copies module attributes to the functions in the module.
-  void CopyModuleAttrToFunctions(Module &M);
-
   /// Check whether a string looks like an old loop attachment tag.
   inline bool mayBeOldLoopAttachmentTag(StringRef Name) {
     return Name.starts_with("llvm.vectorizer.");
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 5bac113c9b7b4..51444c7f8c9cc 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -14,6 +14,7 @@
 #define LLVM_IR_BASICBLOCK_H
 
 #include "llvm-c/Types.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/ADT/ilist.h"
 #include "llvm/ADT/ilist_node.h"
@@ -21,7 +22,6 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/DebugProgramInstruction.h"
 #include "llvm/IR/Instruction.h"
-#include "llvm/IR/DebugProgramInstruction.h"
 #include "llvm/IR/SymbolTableListTraits.h"
 #include "llvm/IR/Value.h"
 #include <cassert>
@@ -38,7 +38,7 @@ class LLVMContext;
 class Module;
 class PHINode;
 class ValueSymbolTable;
-class DPValue;
+class DbgVariableRecord;
 class DPMarker;
 
 /// LLVM Basic Block Representation
@@ -78,13 +78,13 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
   DPMarker *createMarker(InstListType::iterator It);
 
   /// Convert variable location debugging information stored in dbg.value
-  /// intrinsics into DPMarker / DPValue records. Deletes all dbg.values in
+  /// intrinsics into DPMarkers / DbgRecords. Deletes all dbg.values in
   /// the process and sets IsNewDbgInfoFormat = true. Only takes effect if
   /// the UseNewDbgInfoFormat LLVM command line option is given.
   void convertToNewDbgValues();
 
   /// Convert variable location debugging information stored in DPMarkers and
-  /// DPValues into the dbg.value intrinsic representation. Sets
+  /// DbgRecords into the dbg.value intrinsic representation. Sets
   /// IsNewDbgInfoFormat = false.
   void convertFromNewDbgValues();
 
@@ -93,50 +93,50 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
   /// if necessary.
   void setIsNewDbgInfoFormat(bool NewFlag);
 
-  /// Record that the collection of DPValues in \p M "trails" after the last
+  /// Record that the collection of DbgRecords in \p M "trails" after the last
   /// instruction of this block. These are equivalent to dbg.value intrinsics
   /// that exist at the end of a basic block with no terminator (a transient
   /// state that occurs regularly).
   void setTrailingDbgRecords(DPMarker *M);
 
-  /// Fetch the collection of DPValues that "trail" after the last instruction
+  /// Fetch the collection of DbgRecords that "trail" after the last instruction
   /// of this block, see \ref setTrailingDbgRecords. If there are none, returns
   /// nullptr.
   DPMarker *getTrailingDbgRecords();
 
-  /// Delete any trailing DPValues at the end of this block, see
+  /// Delete any trailing DbgRecords at the end of this block, see
   /// \ref setTrailingDbgRecords.
   void deleteTrailingDbgRecords();
 
   void dumpDbgValues() const;
 
-  /// Return the DPMarker for the position given by \p It, so that DPValues can
-  /// be inserted there. This will either be nullptr if not present, a DPMarker,
-  /// or TrailingDPValues if It is end().
+  /// Return the DPMarker for the position given by \p It, so that DbgRecords
+  /// can be inserted there. This will either be nullptr if not present, a
+  /// DPMarker, or TrailingDbgRecords if It is end().
   DPMarker *getMarker(InstListType::iterator It);
 
   /// Return the DPMarker for the position that comes after \p I. \see
   /// BasicBlock::getMarker, this can be nullptr, a DPMarker, or
-  /// TrailingDPValues if there is no next instruction.
+  /// TrailingDbgRecords if there is no next instruction.
   DPMarker *getNextMarker(Instruction *I);
 
-  /// Insert a DPValue into a block at the position given by \p I.
-  void insertDbgRecordAfter(DbgRecord *DPV, Instruction *I);
+  /// Insert a DbgRecord into a block at the position given by \p I.
+  void insertDbgRecordAfter(DbgRecord *DR, Instruction *I);
 
-  /// Insert a DPValue into a block at the position given by \p Here.
-  void insertDbgRecordBefore(DbgRecord *DPV, InstListType::iterator Here);
+  /// Insert a DbgRecord into a block at the position given by \p Here.
+  void insertDbgRecordBefore(DbgRecord *DR, InstListType::iterator Here);
 
-  /// Eject any debug-info trailing at the end of a block. DPValues can
+  /// Eject any debug-info trailing at the end of a block. DbgRecords can
   /// transiently be located "off the end" of a block if the blocks terminator
   /// is temporarily removed. Once a terminator is re-inserted this method will
-  /// move such DPValues back to the right place (ahead of the terminator).
-  void flushTerminatorDbgValues();
+  /// move such DbgRecords back to the right place (ahead of the terminator).
+  void flushTerminatorDbgRecords();
 
   /// In rare circumstances instructions can be speculatively removed from
   /// blocks, and then be re-inserted back into that position later. When this
   /// happens in RemoveDIs debug-info mode, some special patching-up needs to
   /// occur: inserting into the middle of a sequence of dbg.value intrinsics
-  /// does not have an equivalent with DPValues.
+  /// does not have an equivalent with DbgRecords.
   void reinsertInstInDbgRecords(Instruction *I,
                                 std::optional<DbgRecord::self_iterator> Pos);
 
@@ -522,7 +522,7 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
                                  BasicBlock::iterator FromEndIt);
 
   /// Perform any debug-info specific maintenence for the given splice
-  /// activity. In the DPValue debug-info representation, debug-info is not
+  /// activity. In the DbgRecord debug-info representation, debug-info is not
   /// in instructions, and so it does not automatically move from one block
   /// to another.
   void spliceDebugInfo(BasicBlock::iterator ToIt, BasicBlock *FromBB,
@@ -774,6 +774,32 @@ BasicBlock::iterator skipDebugIntrinsics(BasicBlock::iterator It);
 inline void BasicBlock::validateInstrOrdering() const {}
 #endif
 
+// Specialize DenseMapInfo for iterators, so that ththey can be installed into
+// maps and sets. The iterator is made up of its node pointer, and the
+// debug-info "head" bit.
+template <> struct DenseMapInfo<BasicBlock::iterator> {
+  static inline BasicBlock::iterator getEmptyKey() {
+    return BasicBlock::iterator(nullptr);
+  }
+
+  static inline BasicBlock::iterator getTombstoneKey() {
+    BasicBlock::iterator It(nullptr);
+    It.setHeadBit(true);
+    return It;
+  }
+
+  static unsigned getHashValue(const BasicBlock::iterator &It) {
+    return DenseMapInfo<void *>::getHashValue(
+               reinterpret_cast<void *>(It.getNodePtr())) ^
+           It.getHeadBit();
+  }
+
+  static bool isEqual(const BasicBlock::iterator &LHS,
+                      const BasicBlock::iterator &RHS) {
+    return LHS == RHS && LHS.getHeadBit() == RHS.getHeadBit();
+  }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_IR_BASICBLOCK_H
diff --git a/llvm/include/llvm/IR/ConstantFolder.h b/llvm/include/llvm/IR/ConstantFolder.h
index c2b30a65e32e2..3e74a563a5842 100644
--- a/llvm/include/llvm/IR/ConstantFolder.h
+++ b/llvm/include/llvm/IR/ConstantFolder.h
@@ -18,8 +18,8 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/ConstantFold.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilderFolder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Operator.h"
@@ -89,7 +89,7 @@ class ConstantFolder final : public IRBuilderFolder {
   }
 
   Value *FoldUnOpFMF(Instruction::UnaryOps Opc, Value *V,
-                      FastMathFlags FMF) const override {
+                     FastMathFlags FMF) const override {
     if (Constant *C = dyn_cast<Constant>(V))
       return ConstantFoldUnaryInstruction(Opc, C);
     return nullptr;
@@ -183,6 +183,12 @@ class ConstantFolder final : public IRBuilderFolder {
     return nullptr;
   }
 
+  Value *FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                             Instruction *FMFSource) const override {
+    // Use TargetFolder or InstSimplifyFolder instead.
+    return nullptr;
+  }
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index c0ac9a4aa6750..c5e1e19d64982 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -1289,14 +1289,13 @@ class ConstantExpr : public Constant {
                             Type *SrcTy = nullptr) const;
 
   /// Returns an Instruction which implements the same operation as this
-  /// ConstantExpr. If \p InsertBefore is not null, the new instruction is
-  /// inserted before it, otherwise it is not inserted into any basic block.
+  /// ConstantExpr. It is not inserted into any basic block.
   ///
   /// A better approach to this could be to have a constructor for Instruction
   /// which would take a ConstantExpr parameter, but that would have spread
   /// implementation details of ConstantExpr outside of Constants.cpp, which
   /// would make it harder to remove ConstantExprs altogether.
-  Instruction *getAsInstruction(Instruction *InsertBefore = nullptr) const;
+  Instruction *getAsInstruction() const;
 
   /// Whether creating a constant expression for this binary operator is
   /// desirable.
diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index 94af17af8160e..002f2db6da544 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -101,9 +101,10 @@ namespace llvm {
     DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
                            BasicBlock *InsertBB, Instruction *InsertBefore);
 
-    /// Internal helper. Track metadata if untracked and insert \p DPV.
-    void insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
-                       Instruction *InsertBefore, bool InsertAtHead = false);
+    /// Internal helper. Track metadata if untracked and insert \p DVR.
+    void insertDbgVariableRecord(DbgVariableRecord *DVR, BasicBlock *InsertBB,
+                                 Instruction *InsertBefore,
+                                 bool InsertAtHead = false);
 
     /// Internal helper with common code used by insertDbg{Value,Addr}Intrinsic.
     Instruction *insertDbgIntrinsic(llvm::Function *Intrinsic, llvm::Value *Val,
@@ -270,6 +271,13 @@ namespace llvm {
                       std::optional<unsigned> DWARFAddressSpace = std::nullopt,
                       StringRef Name = "", DINodeArray Annotations = nullptr);
 
+    /// Create a __ptrauth qualifier.
+    DIDerivedType *createPtrAuthQualifiedType(DIType *FromTy, unsigned Key,
+                                              bool IsAddressDiscriminated,
+                                              unsigned ExtraDiscriminator,
+                                              bool IsaPointer,
+                                              bool authenticatesNullValues);
+
     /// Create debugging information entry for a pointer to member.
     /// \param PointeeTy Type pointed to by this pointer.
     /// \param SizeInBits  Size.
diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h
index 6673908d3ed35..53cede5409e26 100644
--- a/llvm/include/llvm/IR/DebugInfo.h
+++ b/llvm/include/llvm/IR/DebugInfo.h
@@ -34,23 +34,25 @@ namespace llvm {
 class DbgDeclareInst;
 class DbgValueInst;
 class DbgVariableIntrinsic;
-class DPValue;
+class DbgVariableRecord;
 class Instruction;
 class Module;
 
 /// Finds dbg.declare intrinsics declaring local variables as living in the
 /// memory that 'V' points to.
 TinyPtrVector<DbgDeclareInst *> findDbgDeclares(Value *V);
-/// As above, for DPVDeclares.
-TinyPtrVector<DPValue *> findDPVDeclares(Value *V);
+/// As above, for DVRDeclares.
+TinyPtrVector<DbgVariableRecord *> findDVRDeclares(Value *V);
 
 /// Finds the llvm.dbg.value intrinsics describing a value.
-void findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues,
-                   Value *V, SmallVectorImpl<DPValue *> *DPValues = nullptr);
+void findDbgValues(
+    SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V,
+    SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords = nullptr);
 
 /// Finds the debug info intrinsics describing a value.
-void findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgInsts,
-                  Value *V, SmallVectorImpl<DPValue *> *DPValues = nullptr);
+void findDbgUsers(
+    SmallVectorImpl<DbgVariableIntrinsic *> &DbgInsts, Value *V,
+    SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords = nullptr);
 
 /// Find subprogram that is enclosing this scope.
 DISubprogram *getDISubprogram(const MDNode *Scope);
@@ -58,7 +60,7 @@ DISubprogram *getDISubprogram(const MDNode *Scope);
 /// Produce a DebugLoc to use for each dbg.declare that is promoted to a
 /// dbg.value.
 DebugLoc getDebugValueLoc(DbgVariableIntrinsic *DII);
-DebugLoc getDebugValueLoc(DPValue *DPV);
+DebugLoc getDebugValueLoc(DbgVariableRecord *DVR);
 
 /// Strip debug info in the module if it exists.
 ///
@@ -109,7 +111,8 @@ class DebugInfoFinder {
   void processVariable(const Module &M, const DILocalVariable *DVI);
   /// Process debug info location.
   void processLocation(const Module &M, const DILocation *Loc);
-  /// Process a DbgRecord (e.g, treat a DPValue like a DbgVariableIntrinsic).
+  /// Process a DbgRecord (e.g, treat a DbgVariableRecord like a
+  /// DbgVariableIntrinsic).
   void processDbgRecord(const Module &M, const DbgRecord &DR);
 
   /// Process subprogram.
@@ -193,10 +196,10 @@ inline AssignmentInstRange getAssignmentInsts(const DbgAssignIntrinsic *DAI) {
   return getAssignmentInsts(DAI->getAssignID());
 }
 
-inline AssignmentInstRange getAssignmentInsts(const DPValue *DPV) {
-  assert(DPV->isDbgAssign() &&
-         "Can't get assignment instructions for non-assign DPV!");
-  return getAssignmentInsts(DPV->getAssignID());
+inline AssignmentInstRange getAssignmentInsts(const DbgVariableRecord *DVR) {
+  assert(DVR->isDbgAssign() &&
+         "Can't get assignment instructions for non-assign DVR!");
+  return getAssignmentInsts(DVR->getAssignID());
 }
 
 //
@@ -231,9 +234,10 @@ inline AssignmentMarkerRange getAssignmentMarkers(const Instruction *Inst) {
     return make_range(Value::user_iterator(), Value::user_iterator());
 }
 
-inline SmallVector<DPValue *> getDPVAssignmentMarkers(const Instruction *Inst) {
+inline SmallVector<DbgVariableRecord *>
+getDVRAssignmentMarkers(const Instruction *Inst) {
   if (auto *ID = Inst->getMetadata(LLVMContext::MD_DIAssignID))
-    return cast<DIAssignID>(ID)->getAllDPValueUsers();
+    return cast<DIAssignID>(ID)->getAllDbgVariableRecordUsers();
   return {};
 }
 
@@ -261,7 +265,7 @@ bool calculateFragmentIntersect(
     std::optional<DIExpression::FragmentInfo> &Result);
 bool calculateFragmentIntersect(
     const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits,
-    uint64_t SliceSizeInBits, const DPValue *DPVAssign,
+    uint64_t SliceSizeInBits, const DbgVariableRecord *DVRAssign,
     std::optional<DIExpression::FragmentInfo> &Result);
 
 /// Helper struct for trackAssignments, below. We don't use the similar
@@ -276,8 +280,8 @@ struct VarRecord {
 
   VarRecord(DbgVariableIntrinsic *DVI)
       : Var(DVI->getVariable()), DL(getDebugValueLoc(DVI)) {}
-  VarRecord(DPValue *DPV)
-      : Var(DPV->getVariable()), DL(getDebugValueLoc(DPV)) {}
+  VarRecord(DbgVariableRecord *DVR)
+      : Var(DVR->getVariable()), DL(getDebugValueLoc(DVR)) {}
   VarRecord(DILocalVariable *Var, DILocation *DL) : Var(Var), DL(DL) {}
   friend bool operator<(const VarRecord &LHS, const VarRecord &RHS) {
     return std::tie(LHS.Var, LHS.DL) < std::tie(RHS.Var, RHS.DL);
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 156f6eb49253d..2805f6c478057 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -65,7 +65,7 @@ enum Tag : uint16_t;
 }
 
 class DbgVariableIntrinsic;
-class DPValue;
+class DbgVariableRecord;
 
 extern cl::opt<bool> EnableFSDiscriminator;
 
@@ -323,8 +323,8 @@ class DIAssignID : public MDNode {
   // This node has no operands to replace.
   void replaceOperandWith(unsigned I, Metadata *New) = delete;
 
-  SmallVector<DPValue *> getAllDPValueUsers() {
-    return Context.getReplaceableUses()->getAllDPValueUsers();
+  SmallVector<DbgVariableRecord *> getAllDbgVariableRecordUsers() {
+    return Context.getReplaceableUses()->getAllDbgVariableRecordUsers();
   }
 
   static DIAssignID *getDistinct(LLVMContext &Context) {
@@ -745,7 +745,7 @@ class DIType : public DIScope {
 
   unsigned getLine() const { return Line; }
   uint64_t getSizeInBits() const { return SizeInBits; }
-  uint32_t getAlignInBits() const { return SubclassData32; }
+  uint32_t getAlignInBits() const;
   uint32_t getAlignInBytes() const { return getAlignInBits() / CHAR_BIT; }
   uint64_t getOffsetInBits() const { return OffsetInBits; }
   DIFlags getFlags() const { return Flags; }
@@ -972,6 +972,35 @@ class DIStringType : public DIType {
 ///
 /// TODO: Split out members (inheritance, fields, methods, etc.).
 class DIDerivedType : public DIType {
+public:
+  /// Pointer authentication (__ptrauth) metadata.
+  struct PtrAuthData {
+    // RawData layout:
+    // - Bits 0..3:  Key
+    // - Bit  4:     IsAddressDiscriminated
+    // - Bits 5..20: ExtraDiscriminator
+    // - Bit  21:    IsaPointer
+    // - Bit  22:    AuthenticatesNullValues
+    unsigned RawData;
+
+    PtrAuthData(unsigned FromRawData) : RawData(FromRawData) {}
+    PtrAuthData(unsigned Key, bool IsDiscr, unsigned Discriminator,
+                bool IsaPointer, bool AuthenticatesNullValues) {
+      assert(Key < 16);
+      assert(Discriminator <= 0xffff);
+      RawData = (Key << 0) | (IsDiscr ? (1 << 4) : 0) | (Discriminator << 5) |
+                (IsaPointer ? (1 << 21) : 0) |
+                (AuthenticatesNullValues ? (1 << 22) : 0);
+    }
+
+    unsigned key() { return (RawData >> 0) & 0b1111; }
+    bool isAddressDiscriminated() { return (RawData >> 4) & 1; }
+    unsigned extraDiscriminator() { return (RawData >> 5) & 0xffff; }
+    bool isaPointer() { return (RawData >> 21) & 1; }
+    bool authenticatesNullValues() { return (RawData >> 22) & 1; }
+  };
+
+private:
   friend class LLVMContextImpl;
   friend class MDNode;
 
@@ -982,59 +1011,70 @@ class DIDerivedType : public DIType {
   DIDerivedType(LLVMContext &C, StorageType Storage, unsigned Tag,
                 unsigned Line, uint64_t SizeInBits, uint32_t AlignInBits,
                 uint64_t OffsetInBits,
-                std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+                std::optional<unsigned> DWARFAddressSpace,
+                std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
                 ArrayRef<Metadata *> Ops)
       : DIType(C, DIDerivedTypeKind, Storage, Tag, Line, SizeInBits,
                AlignInBits, OffsetInBits, Flags, Ops),
-        DWARFAddressSpace(DWARFAddressSpace) {}
+        DWARFAddressSpace(DWARFAddressSpace) {
+    if (PtrAuthData)
+      SubclassData32 = PtrAuthData->RawData;
+  }
   ~DIDerivedType() = default;
   static DIDerivedType *
   getImpl(LLVMContext &Context, unsigned Tag, StringRef Name, DIFile *File,
           unsigned Line, DIScope *Scope, DIType *BaseType, uint64_t SizeInBits,
           uint32_t AlignInBits, uint64_t OffsetInBits,
-          std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+          std::optional<unsigned> DWARFAddressSpace,
+          std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
           Metadata *ExtraData, DINodeArray Annotations, StorageType Storage,
           bool ShouldCreate = true) {
     return getImpl(Context, Tag, getCanonicalMDString(Context, Name), File,
                    Line, Scope, BaseType, SizeInBits, AlignInBits, OffsetInBits,
-                   DWARFAddressSpace, Flags, ExtraData, Annotations.get(),
-                   Storage, ShouldCreate);
+                   DWARFAddressSpace, PtrAuthData, Flags, ExtraData,
+                   Annotations.get(), Storage, ShouldCreate);
   }
   static DIDerivedType *
   getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
           unsigned Line, Metadata *Scope, Metadata *BaseType,
           uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
-          std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+          std::optional<unsigned> DWARFAddressSpace,
+          std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
           Metadata *ExtraData, Metadata *Annotations, StorageType Storage,
           bool ShouldCreate = true);
 
   TempDIDerivedType cloneImpl() const {
-    return getTemporary(
-        getContext(), getTag(), getName(), getFile(), getLine(), getScope(),
-        getBaseType(), getSizeInBits(), getAlignInBits(), getOffsetInBits(),
-        getDWARFAddressSpace(), getFlags(), getExtraData(), getAnnotations());
+    return getTemporary(getContext(), getTag(), getName(), getFile(), getLine(),
+                        getScope(), getBaseType(), getSizeInBits(),
+                        getAlignInBits(), getOffsetInBits(),
+                        getDWARFAddressSpace(), getPtrAuthData(), getFlags(),
+                        getExtraData(), getAnnotations());
   }
 
 public:
-  DEFINE_MDNODE_GET(
-      DIDerivedType,
-      (unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
-       Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
-       uint32_t AlignInBits, uint64_t OffsetInBits,
-       std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
-       Metadata *ExtraData = nullptr, Metadata *Annotations = nullptr),
-      (Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits,
-       OffsetInBits, DWARFAddressSpace, Flags, ExtraData, Annotations))
+  DEFINE_MDNODE_GET(DIDerivedType,
+                    (unsigned Tag, MDString *Name, Metadata *File,
+                     unsigned Line, Metadata *Scope, Metadata *BaseType,
+                     uint64_t SizeInBits, uint32_t AlignInBits,
+                     uint64_t OffsetInBits,
+                     std::optional<unsigned> DWARFAddressSpace,
+                     std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
+                     Metadata *ExtraData = nullptr,
+                     Metadata *Annotations = nullptr),
+                    (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                     AlignInBits, OffsetInBits, DWARFAddressSpace, PtrAuthData,
+                     Flags, ExtraData, Annotations))
   DEFINE_MDNODE_GET(DIDerivedType,
                     (unsigned Tag, StringRef Name, DIFile *File, unsigned Line,
                      DIScope *Scope, DIType *BaseType, uint64_t SizeInBits,
                      uint32_t AlignInBits, uint64_t OffsetInBits,
-                     std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+                     std::optional<unsigned> DWARFAddressSpace,
+                     std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
                      Metadata *ExtraData = nullptr,
                      DINodeArray Annotations = nullptr),
                     (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
-                     AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
-                     ExtraData, Annotations))
+                     AlignInBits, OffsetInBits, DWARFAddressSpace, PtrAuthData,
+                     Flags, ExtraData, Annotations))
 
   TempDIDerivedType clone() const { return cloneImpl(); }
 
@@ -1048,6 +1088,8 @@ class DIDerivedType : public DIType {
     return DWARFAddressSpace;
   }
 
+  std::optional<PtrAuthData> getPtrAuthData() const;
+
   /// Get extra data associated with this derived type.
   ///
   /// Class type for pointer-to-members, objective-c property node for ivars,
@@ -1087,6 +1129,16 @@ class DIDerivedType : public DIType {
   }
 };
 
+inline bool operator==(DIDerivedType::PtrAuthData Lhs,
+                       DIDerivedType::PtrAuthData Rhs) {
+  return Lhs.RawData == Rhs.RawData;
+}
+
+inline bool operator!=(DIDerivedType::PtrAuthData Lhs,
+                       DIDerivedType::PtrAuthData Rhs) {
+  return !(Lhs == Rhs);
+}
+
 /// Composite types.
 ///
 /// TODO: Detach from DerivedTypeBase (split out MDEnumType?).
@@ -3788,8 +3840,8 @@ class DIArgList : public Metadata, ReplaceableMetadataImpl {
     return MD->getMetadataID() == DIArgListKind;
   }
 
-  SmallVector<DPValue *> getAllDPValueUsers() {
-    return ReplaceableMetadataImpl::getAllDPValueUsers();
+  SmallVector<DbgVariableRecord *> getAllDbgVariableRecordUsers() {
+    return ReplaceableMetadataImpl::getAllDbgVariableRecordUsers();
   }
 
   void handleChangedOperand(void *Ref, Metadata *New);
@@ -3819,7 +3871,7 @@ class DebugVariable {
 
 public:
   DebugVariable(const DbgVariableIntrinsic *DII);
-  DebugVariable(const DPValue *DPV);
+  DebugVariable(const DbgVariableRecord *DVR);
 
   DebugVariable(const DILocalVariable *Var,
                 std::optional<FragmentInfo> FragmentInfo,
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index 507b652feeb01..db41e9acc7be4 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -1,4 +1,4 @@
-//===-- llvm/DebugProgramInstruction.h - Stream of debug info -------*- C++ -*-===//
+//===-- llvm/DebugProgramInstruction.h - Stream of debug info ---*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -15,10 +15,10 @@
 //    %bar = void call @ext(%foo);
 //
 // and all information is stored in the Value / Metadata hierachy defined
-// elsewhere in LLVM. In the "DPValue" design, each instruction /may/ have a
-// connection with a DPMarker, which identifies a position immediately before the
-// instruction, and each DPMarker /may/ then have connections to DPValues which
-// record the variable assignment information. To illustrate:
+// elsewhere in LLVM. In the "DbgRecord" design, each instruction /may/ have a
+// connection with a DPMarker, which identifies a position immediately before
+// the instruction, and each DPMarker /may/ then have connections to DbgRecords
+// which record the variable assignment information. To illustrate:
 //
 //    %foo = add i32 1, %0
 //       ; foo->DbgMarker == nullptr
@@ -26,13 +26,14 @@
 //       ;; the instruction for %foo, therefore it has no DbgMarker.
 //    %bar = void call @ext(%foo)
 //       ; bar->DbgMarker = {
-//       ;   StoredDPValues = {
-//       ;     DPValue(metadata i32 %foo, ...)
+//       ;   StoredDbgRecords = {
+//       ;     DbgVariableRecord(metadata i32 %foo, ...)
 //       ;   }
 //       ; }
 //       ;; There is a debug-info record in front of the %bar instruction,
 //       ;; thus it points at a DPMarker object. That DPMarker contains a
-//       ;; DPValue in it's ilist, storing the equivalent information to the
+//       ;; DbgVariableRecord in it's ilist, storing the equivalent information
+//       to the
 //       ;; dbg.value above: the Value, DILocalVariable, etc.
 //
 // This structure separates the two concerns of the position of the debug-info
@@ -66,7 +67,7 @@ class DbgInfoIntrinsic;
 class DbgLabelInst;
 class DIAssignID;
 class DPMarker;
-class DPValue;
+class DbgVariableRecord;
 class raw_ostream;
 
 /// A typed tracking MDNode reference that does not require a definition for its
@@ -119,7 +120,7 @@ template <typename T> class DbgRecordParamRef {
 /// Base class for non-instruction debug metadata records that have positions
 /// within IR. Features various methods copied across from the Instruction
 /// class to aid ease-of-use. DbgRecords should always be linked into a
-/// DPMarker's StoredDPValues list. The marker connects a DbgRecord back to
+/// DPMarker's StoredDbgRecords list. The marker connects a DbgRecord back to
 /// it's position in the BasicBlock.
 ///
 /// We need a discriminator for dyn/isa casts. In order to avoid paying for a
@@ -219,7 +220,8 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DbgRecord &R) {
 
 /// Records a position in IR for a source label (DILabel). Corresponds to the
 /// llvm.dbg.label intrinsic.
-/// FIXME: Rename DbgLabelRecord when DPValue is renamed to DbgVariableRecord.
+/// FIXME: Rename DbgLabelRecord when DbgVariableRecord is renamed to
+/// DbgVariableRecord.
 class DPLabel : public DbgRecord {
   DbgRecordParamRef<DILabel> Label;
 
@@ -257,7 +259,7 @@ class DPLabel : public DbgRecord {
 ///
 /// This class inherits from DebugValueUser to allow LLVM's metadata facilities
 /// to update our references to metadata beneath our feet.
-class DPValue : public DbgRecord, protected DebugValueUser {
+class DbgVariableRecord : public DbgRecord, protected DebugValueUser {
   friend class DebugValueUser;
 
 public:
@@ -269,9 +271,10 @@ class DPValue : public DbgRecord, protected DebugValueUser {
     End, ///< Marks the end of the concrete types.
     Any, ///< To indicate all LocationTypes in searches.
   };
-  /// Classification of the debug-info record that this DPValue represents.
-  /// Essentially, "is this a dbg.value or dbg.declare?". dbg.declares are not
-  /// currently supported, but it would be trivial to do so.
+  /// Classification of the debug-info record that this DbgVariableRecord
+  /// represents. Essentially, "is this a dbg.value or dbg.declare?".
+  /// dbg.declares are not currently supported, but it would be trivial to do
+  /// so.
   /// FIXME: We could use spare padding bits from DbgRecord for this.
   LocationType Type;
 
@@ -284,63 +287,69 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   DbgRecordParamRef<DIExpression> AddressExpression;
 
 public:
-  /// Create a new DPValue representing the intrinsic \p DVI, for example the
-  /// assignment represented by a dbg.value.
-  DPValue(const DbgVariableIntrinsic *DVI);
-  DPValue(const DPValue &DPV);
-  /// Directly construct a new DPValue representing a dbg.value intrinsic
-  /// assigning \p Location to the DV / Expr / DI variable.
-  DPValue(Metadata *Location, DILocalVariable *DV, DIExpression *Expr,
-          const DILocation *DI, LocationType Type = LocationType::Value);
-  DPValue(Metadata *Value, DILocalVariable *Variable, DIExpression *Expression,
-          DIAssignID *AssignID, Metadata *Address,
-          DIExpression *AddressExpression, const DILocation *DI);
+  /// Create a new DbgVariableRecord representing the intrinsic \p DVI, for
+  /// example the assignment represented by a dbg.value.
+  DbgVariableRecord(const DbgVariableIntrinsic *DVI);
+  DbgVariableRecord(const DbgVariableRecord &DVR);
+  /// Directly construct a new DbgVariableRecord representing a dbg.value
+  /// intrinsic assigning \p Location to the DV / Expr / DI variable.
+  DbgVariableRecord(Metadata *Location, DILocalVariable *DV, DIExpression *Expr,
+                    const DILocation *DI,
+                    LocationType Type = LocationType::Value);
+  DbgVariableRecord(Metadata *Value, DILocalVariable *Variable,
+                    DIExpression *Expression, DIAssignID *AssignID,
+                    Metadata *Address, DIExpression *AddressExpression,
+                    const DILocation *DI);
 
 private:
   /// Private constructor for creating new instances during parsing only. Only
-  /// called through `createUnresolvedDPValue` below, which makes clear that
-  /// this is used for parsing only, and will later return a subclass depending
-  /// on which Type is passed.
-  DPValue(LocationType Type, Metadata *Val, MDNode *Variable,
-          MDNode *Expression, MDNode *AssignID, Metadata *Address,
-          MDNode *AddressExpression, MDNode *DI);
+  /// called through `createUnresolvedDbgVariableRecord` below, which makes
+  /// clear that this is used for parsing only, and will later return a subclass
+  /// depending on which Type is passed.
+  DbgVariableRecord(LocationType Type, Metadata *Val, MDNode *Variable,
+                    MDNode *Expression, MDNode *AssignID, Metadata *Address,
+                    MDNode *AddressExpression, MDNode *DI);
 
 public:
-  /// Used to create DPValues during parsing, where some metadata references may
-  /// still be unresolved. Although for some fields a generic `Metadata*`
-  /// argument is accepted for forward type-references, the verifier and
-  /// accessors will reject incorrect types later on. The function is used for
-  /// all types of DPValues for simplicity while parsing, but asserts if any
-  /// necessary fields are empty or unused fields are not empty, i.e. if the
-  /// #dbg_assign fields are used for a non-dbg-assign type.
-  static DPValue *createUnresolvedDPValue(LocationType Type, Metadata *Val,
-                                          MDNode *Variable, MDNode *Expression,
-                                          MDNode *AssignID, Metadata *Address,
-                                          MDNode *AddressExpression,
-                                          MDNode *DI);
-
-  static DPValue *createDPVAssign(Value *Val, DILocalVariable *Variable,
-                                  DIExpression *Expression,
-                                  DIAssignID *AssignID, Value *Address,
-                                  DIExpression *AddressExpression,
-                                  const DILocation *DI);
-  static DPValue *createLinkedDPVAssign(Instruction *LinkedInstr, Value *Val,
-                                        DILocalVariable *Variable,
-                                        DIExpression *Expression,
-                                        Value *Address,
-                                        DIExpression *AddressExpression,
-                                        const DILocation *DI);
-
-  static DPValue *createDPValue(Value *Location, DILocalVariable *DV,
-                                DIExpression *Expr, const DILocation *DI);
-  static DPValue *createDPValue(Value *Location, DILocalVariable *DV,
-                                DIExpression *Expr, const DILocation *DI,
-                                DPValue &InsertBefore);
-  static DPValue *createDPVDeclare(Value *Address, DILocalVariable *DV,
-                                   DIExpression *Expr, const DILocation *DI);
-  static DPValue *createDPVDeclare(Value *Address, DILocalVariable *DV,
-                                   DIExpression *Expr, const DILocation *DI,
-                                   DPValue &InsertBefore);
+  /// Used to create DbgVariableRecords during parsing, where some metadata
+  /// references may still be unresolved. Although for some fields a generic
+  /// `Metadata*` argument is accepted for forward type-references, the verifier
+  /// and accessors will reject incorrect types later on. The function is used
+  /// for all types of DbgVariableRecords for simplicity while parsing, but
+  /// asserts if any necessary fields are empty or unused fields are not empty,
+  /// i.e. if the #dbg_assign fields are used for a non-dbg-assign type.
+  static DbgVariableRecord *
+  createUnresolvedDbgVariableRecord(LocationType Type, Metadata *Val,
+                                    MDNode *Variable, MDNode *Expression,
+                                    MDNode *AssignID, Metadata *Address,
+                                    MDNode *AddressExpression, MDNode *DI);
+
+  static DbgVariableRecord *
+  createDVRAssign(Value *Val, DILocalVariable *Variable,
+                  DIExpression *Expression, DIAssignID *AssignID,
+                  Value *Address, DIExpression *AddressExpression,
+                  const DILocation *DI);
+  static DbgVariableRecord *
+  createLinkedDVRAssign(Instruction *LinkedInstr, Value *Val,
+                        DILocalVariable *Variable, DIExpression *Expression,
+                        Value *Address, DIExpression *AddressExpression,
+                        const DILocation *DI);
+
+  static DbgVariableRecord *createDbgVariableRecord(Value *Location,
+                                                    DILocalVariable *DV,
+                                                    DIExpression *Expr,
+                                                    const DILocation *DI);
+  static DbgVariableRecord *
+  createDbgVariableRecord(Value *Location, DILocalVariable *DV,
+                          DIExpression *Expr, const DILocation *DI,
+                          DbgVariableRecord &InsertBefore);
+  static DbgVariableRecord *createDVRDeclare(Value *Address,
+                                             DILocalVariable *DV,
+                                             DIExpression *Expr,
+                                             const DILocation *DI);
+  static DbgVariableRecord *
+  createDVRDeclare(Value *Address, DILocalVariable *DV, DIExpression *Expr,
+                   const DILocation *DI, DbgVariableRecord &InsertBefore);
 
   /// Iterator for ValueAsMetadata that internally uses direct pointer iteration
   /// over either a ValueAsMetadata* or a ValueAsMetadata**, dereferencing to the
@@ -412,7 +421,8 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   unsigned getNumVariableLocationOps() const;
 
   bool hasArgList() const { return isa<DIArgList>(getRawLocation()); }
-  /// Returns true if this DPValue has no empty MDNodes in its location list.
+  /// Returns true if this DbgVariableRecord has no empty MDNodes in its
+  /// location list.
   bool hasValidLocation() const { return getVariableLocationOp(0) != nullptr; }
 
   /// Does this describe the address of a local variable. True for dbg.addr
@@ -445,10 +455,10 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   /// replaceVariableLocationOp and addVariableLocationOps should be used where
   /// possible to avoid creating invalid state.
   void setRawLocation(Metadata *NewLocation) {
-    assert(
-        (isa<ValueAsMetadata>(NewLocation) || isa<DIArgList>(NewLocation) ||
-         isa<MDNode>(NewLocation)) &&
-        "Location for a DPValue must be either ValueAsMetadata or DIArgList");
+    assert((isa<ValueAsMetadata>(NewLocation) || isa<DIArgList>(NewLocation) ||
+            isa<MDNode>(NewLocation)) &&
+           "Location for a DbgVariableRecord must be either ValueAsMetadata or "
+           "DIArgList");
     resetDebugValue(0, NewLocation);
   }
 
@@ -456,12 +466,12 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   /// is described.
   std::optional<uint64_t> getFragmentSizeInBits() const;
 
-  bool isEquivalentTo(const DPValue &Other) const {
+  bool isEquivalentTo(const DbgVariableRecord &Other) const {
     return DbgLoc == Other.DbgLoc && isIdenticalToWhenDefined(Other);
   }
   // Matches the definition of the Instruction version, equivalent to above but
   // without checking DbgLoc.
-  bool isIdenticalToWhenDefined(const DPValue &Other) const {
+  bool isIdenticalToWhenDefined(const DbgVariableRecord &Other) const {
     return std::tie(Type, DebugValues, Variable, Expression,
                     AddressExpression) ==
            std::tie(Other.Type, Other.DebugValues, Other.Variable,
@@ -497,10 +507,10 @@ class DPValue : public DbgRecord, protected DebugValueUser {
 
   /// @}
 
-  DPValue *clone() const;
-  /// Convert this DPValue back into a dbg.value intrinsic.
+  DbgVariableRecord *clone() const;
+  /// Convert this DbgVariableRecord back into a dbg.value intrinsic.
   /// \p InsertBefore Optional position to insert this intrinsic.
-  /// \returns A new dbg.value intrinsic representiung this DPValue.
+  /// \returns A new dbg.value intrinsic representiung this DbgVariableRecord.
   DbgVariableIntrinsic *createDebugIntrinsic(Module *M,
                                              Instruction *InsertBefore) const;
 
@@ -511,20 +521,21 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   void print(raw_ostream &O, bool IsForDebug = false) const;
   void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const;
 
-  /// Filter the DbgRecord range to DPValue types only and downcast.
-  static inline auto
-  filter(iterator_range<simple_ilist<DbgRecord>::iterator> R) {
-    return map_range(
-        make_filter_range(R, [](DbgRecord &E) { return isa<DPValue>(E); }),
-        [](DbgRecord &E) { return std::ref(cast<DPValue>(E)); });
-  }
-
   /// Support type inquiry through isa, cast, and dyn_cast.
   static bool classof(const DbgRecord *E) {
     return E->getRecordKind() == ValueKind;
   }
 };
 
+/// Filter the DbgRecord range to DbgVariableRecord types only and downcast.
+static inline auto
+filterDbgVars(iterator_range<simple_ilist<DbgRecord>::iterator> R) {
+  return map_range(
+      make_filter_range(R,
+                        [](DbgRecord &E) { return isa<DbgVariableRecord>(E); }),
+      [](DbgRecord &E) { return std::ref(cast<DbgVariableRecord>(E)); });
+}
+
 /// Per-instruction record of debug-info. If an Instruction is the position of
 /// some debugging information, it points at a DPMarker storing that info. Each
 /// marker points back at the instruction that owns it. Various utilities are
@@ -557,8 +568,8 @@ class DPMarker {
   /// intrinsics. There is a one-to-one relationship between each debug
   /// intrinsic in a block and each DbgRecord once the representation has been
   /// converted, and the ordering is meaningful in the same way.
-  simple_ilist<DbgRecord> StoredDPValues;
-  bool empty() const { return StoredDPValues.empty(); }
+  simple_ilist<DbgRecord> StoredDbgRecords;
+  bool empty() const { return StoredDbgRecords.empty(); }
 
   const BasicBlock *getParent() const;
   BasicBlock *getParent();
@@ -576,54 +587,56 @@ class DPMarker {
   void print(raw_ostream &O, bool IsForDebug = false) const;
   void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const;
 
-  /// Produce a range over all the DPValues in this Marker.
+  /// Produce a range over all the DbgRecords in this Marker.
   iterator_range<simple_ilist<DbgRecord>::iterator> getDbgRecordRange();
   iterator_range<simple_ilist<DbgRecord>::const_iterator>
   getDbgRecordRange() const;
-  /// Transfer any DPValues from \p Src into this DPMarker. If \p InsertAtHead
-  /// is true, place them before existing DPValues, otherwise afterwards.
+  /// Transfer any DbgRecords from \p Src into this DPMarker. If \p InsertAtHead
+  /// is true, place them before existing DbgRecords, otherwise afterwards.
   void absorbDebugValues(DPMarker &Src, bool InsertAtHead);
-  /// Transfer the DPValues in \p Range from \p Src into this DPMarker. If
-  /// \p InsertAtHead is true, place them before existing DPValues, otherwise
+  /// Transfer the DbgRecords in \p Range from \p Src into this DPMarker. If
+  /// \p InsertAtHead is true, place them before existing DbgRecords, otherwise
   // afterwards.
   void absorbDebugValues(iterator_range<DbgRecord::self_iterator> Range,
                          DPMarker &Src, bool InsertAtHead);
-  /// Insert a DPValue into this DPMarker, at the end of the list. If
+  /// Insert a DbgRecord into this DPMarker, at the end of the list. If
   /// \p InsertAtHead is true, at the start.
   void insertDbgRecord(DbgRecord *New, bool InsertAtHead);
-  /// Insert a DPValue prior to a DPValue contained within this marker.
+  /// Insert a DbgRecord prior to a DbgRecord contained within this marker.
   void insertDbgRecord(DbgRecord *New, DbgRecord *InsertBefore);
-  /// Insert a DPValue after a DPValue contained within this marker.
+  /// Insert a DbgRecord after a DbgRecord contained within this marker.
   void insertDbgRecordAfter(DbgRecord *New, DbgRecord *InsertAfter);
   /// Clone all DPMarkers from \p From into this marker. There are numerous
   /// options to customise the source/destination, due to gnarliness, see class
   /// comment.
-  /// \p FromHere If non-null, copy from FromHere to the end of From's DPValues
-  /// \p InsertAtHead Place the cloned DPValues at the start of StoredDPValues
-  /// \returns Range over all the newly cloned DPValues
+  /// \p FromHere If non-null, copy from FromHere to the end of From's
+  /// DbgRecords
+  /// \p InsertAtHead Place the cloned DbgRecords at the start of
+  /// StoredDbgRecords
+  /// \returns Range over all the newly cloned DbgRecords
   iterator_range<simple_ilist<DbgRecord>::iterator>
   cloneDebugInfoFrom(DPMarker *From,
                      std::optional<simple_ilist<DbgRecord>::iterator> FromHere,
                      bool InsertAtHead = false);
-  /// Erase all DPValues in this DPMarker.
+  /// Erase all DbgRecords in this DPMarker.
   void dropDbgRecords();
   /// Erase a single DbgRecord from this marker. In an ideal future, we would
   /// never erase an assignment in this way, but it's the equivalent to
   /// erasing a debug intrinsic from a block.
   void dropOneDbgRecord(DbgRecord *DR);
 
-  /// We generally act like all llvm Instructions have a range of DPValues
+  /// We generally act like all llvm Instructions have a range of DbgRecords
   /// attached to them, but in reality sometimes we don't allocate the DPMarker
-  /// to save time and memory, but still have to return ranges of DPValues. When
-  /// we need to describe such an unallocated DPValue range, use this static
-  /// markers range instead. This will bite us if someone tries to insert a
-  /// DPValue in that range, but they should be using the Official (TM) API for
-  /// that.
+  /// to save time and memory, but still have to return ranges of DbgRecords.
+  /// When we need to describe such an unallocated DbgRecord range, use this
+  /// static markers range instead. This will bite us if someone tries to insert
+  /// a DbgRecord in that range, but they should be using the Official (TM) API
+  /// for that.
   static DPMarker EmptyDPMarker;
   static iterator_range<simple_ilist<DbgRecord>::iterator>
   getEmptyDbgRecordRange() {
-    return make_range(EmptyDPMarker.StoredDPValues.end(),
-                      EmptyDPMarker.StoredDPValues.end());
+    return make_range(EmptyDPMarker.StoredDbgRecords.end(),
+                      EmptyDPMarker.StoredDbgRecords.end());
   }
 };
 
@@ -632,7 +645,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DPMarker &Marker) {
   return OS;
 }
 
-/// Inline helper to return a range of DPValues attached to a marker. It needs
+/// Inline helper to return a range of DbgRecords attached to a marker. It needs
 /// to be inlined as it's frequently called, but also come after the declaration
 /// of DPMarker. Thus: it's pre-declared by users like Instruction, then an
 /// inlineable body defined here.
@@ -643,6 +656,8 @@ getDbgRecordRange(DPMarker *DbgMarker) {
   return DbgMarker->getDbgRecordRange();
 }
 
+DEFINE_ISA_CONVERSION_FUNCTIONS(DbgRecord, LLVMDbgRecordRef)
+
 } // namespace llvm
 
 #endif // LLVM_IR_DEBUGPROGRAMINSTRUCTION_H
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 324dc30d70f61..7f8503634de73 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -967,9 +967,9 @@ class IRBuilderBase {
 
   /// Create a call to intrinsic \p ID with 2 operands which is mangled on the
   /// first type.
-  CallInst *CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS,
-                                  Instruction *FMFSource = nullptr,
-                                  const Twine &Name = "");
+  Value *CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS,
+                               Instruction *FMFSource = nullptr,
+                               const Twine &Name = "");
 
   /// Create a call to intrinsic \p ID with \p Args, mangled using \p Types. If
   /// \p FMFSource is provided, copy fast-math-flags from that instruction to
@@ -988,7 +988,7 @@ class IRBuilderBase {
                             const Twine &Name = "");
 
   /// Create call to the minnum intrinsic.
-  CallInst *CreateMinNum(Value *LHS, Value *RHS, const Twine &Name = "") {
+  Value *CreateMinNum(Value *LHS, Value *RHS, const Twine &Name = "") {
     if (IsFPConstrained) {
       return CreateConstrainedFPUnroundedBinOp(
           Intrinsic::experimental_constrained_minnum, LHS, RHS, nullptr, Name);
@@ -998,7 +998,7 @@ class IRBuilderBase {
   }
 
   /// Create call to the maxnum intrinsic.
-  CallInst *CreateMaxNum(Value *LHS, Value *RHS, const Twine &Name = "") {
+  Value *CreateMaxNum(Value *LHS, Value *RHS, const Twine &Name = "") {
     if (IsFPConstrained) {
       return CreateConstrainedFPUnroundedBinOp(
           Intrinsic::experimental_constrained_maxnum, LHS, RHS, nullptr, Name);
@@ -1008,19 +1008,19 @@ class IRBuilderBase {
   }
 
   /// Create call to the minimum intrinsic.
-  CallInst *CreateMinimum(Value *LHS, Value *RHS, const Twine &Name = "") {
+  Value *CreateMinimum(Value *LHS, Value *RHS, const Twine &Name = "") {
     return CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS, nullptr, Name);
   }
 
   /// Create call to the maximum intrinsic.
-  CallInst *CreateMaximum(Value *LHS, Value *RHS, const Twine &Name = "") {
+  Value *CreateMaximum(Value *LHS, Value *RHS, const Twine &Name = "") {
     return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name);
   }
 
   /// Create call to the copysign intrinsic.
-  CallInst *CreateCopySign(Value *LHS, Value *RHS,
-                           Instruction *FMFSource = nullptr,
-                           const Twine &Name = "") {
+  Value *CreateCopySign(Value *LHS, Value *RHS,
+                        Instruction *FMFSource = nullptr,
+                        const Twine &Name = "") {
     return CreateBinaryIntrinsic(Intrinsic::copysign, LHS, RHS, FMFSource,
                                  Name);
   }
diff --git a/llvm/include/llvm/IR/IRBuilderFolder.h b/llvm/include/llvm/IR/IRBuilderFolder.h
index bd2324dfc5f1b..3020f2684ee45 100644
--- a/llvm/include/llvm/IR/IRBuilderFolder.h
+++ b/llvm/include/llvm/IR/IRBuilderFolder.h
@@ -73,6 +73,10 @@ class IRBuilderFolder {
   virtual Value *FoldCast(Instruction::CastOps Op, Value *V,
                           Type *DestTy) const = 0;
 
+  virtual Value *
+  FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                      Instruction *FMFSource = nullptr) const = 0;
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 817abd6afbcae..d6cf155775238 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -64,47 +64,48 @@ class Instruction : public User,
 
   /// Clone any debug-info attached to \p From onto this instruction. Used to
   /// copy debugging information from one block to another, when copying entire
-  /// blocks. \see DebugProgramInstruction.h , because the ordering of DPValues
-  /// is still important, fine grain control of which instructions are moved and
-  /// where they go is necessary.
+  /// blocks. \see DebugProgramInstruction.h , because the ordering of
+  /// DbgRecords is still important, fine grain control of which instructions
+  /// are moved and where they go is necessary.
   /// \p From The instruction to clone debug-info from.
-  /// \p from_here Optional iterator to limit DPValues cloned to be a range from
+  /// \p from_here Optional iterator to limit DbgRecords cloned to be a range
+  /// from
   ///    from_here to end().
-  /// \p InsertAtHead Whether the cloned DPValues should be placed at the end
-  ///    or the beginning of existing DPValues attached to this.
-  /// \returns A range over the newly cloned DPValues.
+  /// \p InsertAtHead Whether the cloned DbgRecords should be placed at the end
+  ///    or the beginning of existing DbgRecords attached to this.
+  /// \returns A range over the newly cloned DbgRecords.
   iterator_range<simple_ilist<DbgRecord>::iterator> cloneDebugInfoFrom(
       const Instruction *From,
       std::optional<simple_ilist<DbgRecord>::iterator> FromHere = std::nullopt,
       bool InsertAtHead = false);
 
-  /// Return a range over the DPValues attached to this instruction.
+  /// Return a range over the DbgRecords attached to this instruction.
   iterator_range<simple_ilist<DbgRecord>::iterator> getDbgRecordRange() const {
     return llvm::getDbgRecordRange(DbgMarker);
   }
 
-  /// Return an iterator to the position of the "Next" DPValue after this
+  /// Return an iterator to the position of the "Next" DbgRecord after this
   /// instruction, or std::nullopt. This is the position to pass to
   /// BasicBlock::reinsertInstInDbgRecords when re-inserting an instruction.
   std::optional<simple_ilist<DbgRecord>::iterator> getDbgReinsertionPosition();
 
-  /// Returns true if any DPValues are attached to this instruction.
+  /// Returns true if any DbgRecords are attached to this instruction.
   bool hasDbgRecords() const;
 
-  /// Transfer any DPValues on the position \p It onto this instruction,
-  /// by simply adopting the sequence of DPValues (which is efficient) if
+  /// Transfer any DbgRecords on the position \p It onto this instruction,
+  /// by simply adopting the sequence of DbgRecords (which is efficient) if
   /// possible, by merging two sequences otherwise.
   void adoptDbgRecords(BasicBlock *BB, InstListType::iterator It,
                        bool InsertAtHead);
 
-  /// Erase any DPValues attached to this instruction.
+  /// Erase any DbgRecords attached to this instruction.
   void dropDbgRecords();
 
-  /// Erase a single DPValue \p I that is attached to this instruction.
+  /// Erase a single DbgRecord \p I that is attached to this instruction.
   void dropOneDbgRecord(DbgRecord *I);
 
   /// Handle the debug-info implications of this instruction being removed. Any
-  /// attached DPValues need to "fall" down onto the next instruction.
+  /// attached DbgRecords need to "fall" down onto the next instruction.
   void handleMarkerRemoval();
 
 protected:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index f416db87f9944..9694fc5dbec3e 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1630,6 +1630,12 @@ def int_umax : DefaultAttrsIntrinsic<
 def int_umin : DefaultAttrsIntrinsic<
     [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
     [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+def int_scmp : DefaultAttrsIntrinsic<
+    [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>],
+    [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+def int_ucmp : DefaultAttrsIntrinsic<
+    [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>],
+    [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
 
 //===------------------------- Memory Use Markers -------------------------===//
 //
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 7229292e377a8..1164b241ba7b0 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -21,20 +21,33 @@ def int_dx_create_handle : ClangBuiltin<"__builtin_hlsl_create_handle">,
     Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>;
 
 def int_dx_any  : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>;
+def int_dx_clamp : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
+def int_dx_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; 
 
 def int_dx_dot : 
     Intrinsic<[LLVMVectorElementType<0>], 
-    [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+    [llvm_anyfloat_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+    [IntrNoMem, IntrWillReturn, Commutative] >;
+def int_dx_sdot : 
+    Intrinsic<[LLVMVectorElementType<0>], 
+    [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+    [IntrNoMem, IntrWillReturn, Commutative] >;
+def int_dx_udot : 
+    Intrinsic<[LLVMVectorElementType<0>], 
+    [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
     [IntrNoMem, IntrWillReturn, Commutative] >;
 
 def int_dx_frac  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
 
-def int_dx_lerp :
-    Intrinsic<[LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
-    [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>,LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+def int_dx_isinf :
+    DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+    [llvm_anyfloat_ty]>;
+
+def int_dx_lerp : Intrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], 
     [IntrNoMem, IntrWillReturn] >;
 
 def int_dx_imad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
 def int_dx_umad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
 def int_dx_rcp  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+def int_dx_rsqrt  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 766b3b542a9e0..0eb09b1699aff 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -40,6 +40,18 @@ let TargetPrefix = "spv" in {
   def int_spv_assume : Intrinsic<[], [llvm_i1_ty]>;
   def int_spv_expect : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
 
+  // Memory Use Markers
+  def int_spv_lifetime_start : Intrinsic<[],
+                                [llvm_i64_ty, llvm_anyptr_ty],
+                                [IntrArgMemOnly, IntrWillReturn,
+                                NoCapture<ArgIndex<1>>,
+                                ImmArg<ArgIndex<0>>]>;
+  def int_spv_lifetime_end   : Intrinsic<[],
+                                [llvm_i64_ty, llvm_anyptr_ty],
+                                [IntrArgMemOnly, IntrWillReturn,
+                                NoCapture<ArgIndex<1>>,
+                                ImmArg<ArgIndex<0>>]>;
+
   // The following intrinsic(s) are mirrored from IntrinsicsDirectX.td for HLSL support.
   def int_spv_thread_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>;
   def int_spv_create_handle : ClangBuiltin<"__builtin_hlsl_create_handle">,
diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index da6744fdd0916..22da54a1f03c5 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -43,7 +43,7 @@ namespace llvm {
 class Module;
 class ModuleSlotTracker;
 class raw_ostream;
-class DPValue;
+class DbgVariableRecord;
 template <typename T> class StringMapEntry;
 template <typename ValueTy> class StringMapEntryStorage;
 class Type;
@@ -205,23 +205,23 @@ class MetadataAsValue : public Value {
 /// Base class for tracking ValueAsMetadata/DIArgLists with user lookups and
 /// Owner callbacks outside of ValueAsMetadata.
 ///
-/// Currently only inherited by DPValue; if other classes need to use it, then
-/// a SubclassID will need to be added (either as a new field or by making
-/// DebugValue into a PointerIntUnion) to discriminate between the subclasses in
-/// lookup and callback handling.
+/// Currently only inherited by DbgVariableRecord; if other classes need to use
+/// it, then a SubclassID will need to be added (either as a new field or by
+/// making DebugValue into a PointerIntUnion) to discriminate between the
+/// subclasses in lookup and callback handling.
 class DebugValueUser {
 protected:
   // Capacity to store 3 debug values.
   // TODO: Not all DebugValueUser instances need all 3 elements, if we
-  // restructure the DPValue class then we can template parameterize this array
-  // size.
+  // restructure the DbgVariableRecord class then we can template parameterize
+  // this array size.
   std::array<Metadata *, 3> DebugValues;
 
   ArrayRef<Metadata *> getDebugValues() const { return DebugValues; }
 
 public:
-  DPValue *getUser();
-  const DPValue *getUser() const;
+  DbgVariableRecord *getUser();
+  const DbgVariableRecord *getUser() const;
   /// To be called by ReplaceableMetadataImpl::replaceAllUsesWith, where `Old`
   /// is a pointer to one of the pointers in `DebugValues` (so should be type
   /// Metadata**), and `NewDebugValue` is the new Metadata* that is replacing
@@ -407,8 +407,8 @@ class ReplaceableMetadataImpl {
   static void SalvageDebugInfo(const Constant &C); 
   /// Returns the list of all DIArgList users of this.
   SmallVector<Metadata *> getAllArgListUsers();
-  /// Returns the list of all DPValue users of this.
-  SmallVector<DPValue *> getAllDPValueUsers();
+  /// Returns the list of all DbgVariableRecord users of this.
+  SmallVector<DbgVariableRecord *> getAllDbgVariableRecordUsers();
 
   /// Resolve all uses of this.
   ///
@@ -494,8 +494,8 @@ class ValueAsMetadata : public Metadata, ReplaceableMetadataImpl {
   SmallVector<Metadata *> getAllArgListUsers() {
     return ReplaceableMetadataImpl::getAllArgListUsers();
   }
-  SmallVector<DPValue *> getAllDPValueUsers() {
-    return ReplaceableMetadataImpl::getAllDPValueUsers();
+  SmallVector<DbgVariableRecord *> getAllDbgVariableRecordUsers() {
+    return ReplaceableMetadataImpl::getAllDbgVariableRecordUsers();
   }
 
   static void handleDeletion(Value *V);
diff --git a/llvm/include/llvm/IR/NoFolder.h b/llvm/include/llvm/IR/NoFolder.h
index a612f98465aea..7bb5d5e696e9e 100644
--- a/llvm/include/llvm/IR/NoFolder.h
+++ b/llvm/include/llvm/IR/NoFolder.h
@@ -112,6 +112,11 @@ class NoFolder final : public IRBuilderFolder {
     return nullptr;
   }
 
+  Value *FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                             Instruction *FMFSource) const override {
+    return nullptr;
+  }
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h
index c03d49c3b7b97..ec8b809d40bfb 100644
--- a/llvm/include/llvm/IR/PassManager.h
+++ b/llvm/include/llvm/IR/PassManager.h
@@ -227,7 +227,7 @@ class PassManager : public PassInfoMixin<
         detail::getAnalysisResult<PassInstrumentationAnalysis>(
             AM, IR, std::tuple<ExtraArgTs...>(ExtraArgs...));
 
-    // RemoveDIs: if requested, convert debug-info to DPValue representation
+    // RemoveDIs: if requested, convert debug-info to DbgRecord representation
     // for duration of these passes.
     bool ShouldConvertDbgInfo = shouldConvertDbgInfo(IR);
     if (ShouldConvertDbgInfo)
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 487ae170210de..382009d9df785 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -68,6 +68,22 @@ template <typename T> inline OneUse_match<T> m_OneUse(const T &SubPattern) {
   return SubPattern;
 }
 
+template <typename SubPattern_t> struct AllowReassoc_match {
+  SubPattern_t SubPattern;
+
+  AllowReassoc_match(const SubPattern_t &SP) : SubPattern(SP) {}
+
+  template <typename OpTy> bool match(OpTy *V) {
+    auto *I = dyn_cast<FPMathOperator>(V);
+    return I && I->hasAllowReassoc() && SubPattern.match(I);
+  }
+};
+
+template <typename T>
+inline AllowReassoc_match<T> m_AllowReassoc(const T &SubPattern) {
+  return SubPattern;
+}
+
 template <typename Class> struct class_match {
   template <typename ITy> bool match(ITy *V) { return isa<Class>(V); }
 };
diff --git a/llvm/include/llvm/MC/MCInstBuilder.h b/llvm/include/llvm/MC/MCInstBuilder.h
index 6e5e9dd69018f..d06ed4c6c840a 100644
--- a/llvm/include/llvm/MC/MCInstBuilder.h
+++ b/llvm/include/llvm/MC/MCInstBuilder.h
@@ -27,6 +27,12 @@ class MCInstBuilder {
     Inst.setOpcode(Opcode);
   }
 
+  /// Set the location.
+  MCInstBuilder &setLoc(SMLoc SM) {
+    Inst.setLoc(SM);
+    return *this;
+  }
+
   /// Add a new register operand.
   MCInstBuilder &addReg(unsigned Reg) {
     Inst.addOperand(MCOperand::createReg(Reg));
diff --git a/llvm/include/llvm/MC/MCInstrAnalysis.h b/llvm/include/llvm/MC/MCInstrAnalysis.h
index e3ddf0b8b8939..87db57f74f520 100644
--- a/llvm/include/llvm/MC/MCInstrAnalysis.h
+++ b/llvm/include/llvm/MC/MCInstrAnalysis.h
@@ -123,14 +123,14 @@ class MCInstrAnalysis {
   /// broken. Each bit of the mask is associated with a specific input operand.
   /// Bits associated with explicit input operands are laid out first in the
   /// mask; implicit operands come after explicit operands.
-  /// 
+  ///
   /// Dependencies are broken only for operands that have their corresponding bit
   /// set. Operands that have their bit cleared, or that don't have a
   /// corresponding bit in the mask don't have their dependency broken.  Note
   /// that Mask may not be big enough to describe all operands.  The assumption
   /// for operands that don't have a correspondent bit in the mask is that those
   /// are still data dependent.
-  /// 
+  ///
   /// The only exception to the rule is for when Mask has all zeroes.
   /// A zero mask means: dependencies are broken for all explicit register
   /// operands.
diff --git a/llvm/include/llvm/MC/MCSymbolXCOFF.h b/llvm/include/llvm/MC/MCSymbolXCOFF.h
index 11c3b8831ba51..3bf4491e8fe2f 100644
--- a/llvm/include/llvm/MC/MCSymbolXCOFF.h
+++ b/llvm/include/llvm/MC/MCSymbolXCOFF.h
@@ -26,6 +26,8 @@ class MCSymbolXCOFF : public MCSymbol {
 
   static bool classof(const MCSymbol *S) { return S->isXCOFF(); }
 
+  enum CodeModel : uint8_t { CM_Small, CM_Large };
+
   static StringRef getUnqualifiedName(StringRef Name) {
     if (Name.back() == ']') {
       StringRef Lhs, Rhs;
@@ -72,8 +74,22 @@ class MCSymbolXCOFF : public MCSymbol {
 
   void setEHInfo() const { modifyFlags(SF_EHInfo, SF_EHInfo); }
 
+  bool hasPerSymbolCodeModel() const { return PerSymbolCodeModel.has_value(); }
+
+  CodeModel getPerSymbolCodeModel() const {
+    assert(hasPerSymbolCodeModel() &&
+           "Requested code model for symbol without one");
+    return *PerSymbolCodeModel;
+  }
+
+  void setPerSymbolCodeModel(MCSymbolXCOFF::CodeModel Model) {
+    PerSymbolCodeModel = Model;
+  }
+
 private:
   std::optional<XCOFF::StorageClass> StorageClass;
+  std::optional<CodeModel> PerSymbolCodeModel;
+
   MCSectionXCOFF *RepresentedCsect = nullptr;
   XCOFF::VisibilityType VisibilityType = XCOFF::SYM_V_UNSPECIFIED;
   StringRef SymbolTableName;
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index c9227da65708c..f57a7ab8882ad 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -60,6 +60,7 @@ class ELFObjectFileBase : public ObjectFile {
 
   SubtargetFeatures getMIPSFeatures() const;
   SubtargetFeatures getARMFeatures() const;
+  SubtargetFeatures getHexagonFeatures() const;
   Expected<SubtargetFeatures> getRISCVFeatures() const;
   SubtargetFeatures getLoongArchFeatures() const;
 
@@ -389,25 +390,38 @@ template <class ELFT> class ELFObjectFile : public ELFObjectFileBase {
   }
 
   Error getBuildAttributes(ELFAttributeParser &Attributes) const override {
+    uint32_t Type;
+    switch (getEMachine()) {
+    case ELF::EM_ARM:
+      Type = ELF::SHT_ARM_ATTRIBUTES;
+      break;
+    case ELF::EM_RISCV:
+      Type = ELF::SHT_RISCV_ATTRIBUTES;
+      break;
+    case ELF::EM_HEXAGON:
+      Type = ELF::SHT_HEXAGON_ATTRIBUTES;
+      break;
+    default:
+      return Error::success();
+    }
+
     auto SectionsOrErr = EF.sections();
     if (!SectionsOrErr)
       return SectionsOrErr.takeError();
-
     for (const Elf_Shdr &Sec : *SectionsOrErr) {
-      if (Sec.sh_type == ELF::SHT_ARM_ATTRIBUTES ||
-          Sec.sh_type == ELF::SHT_RISCV_ATTRIBUTES) {
-        auto ErrorOrContents = EF.getSectionContents(Sec);
-        if (!ErrorOrContents)
-          return ErrorOrContents.takeError();
-
-        auto Contents = ErrorOrContents.get();
-        if (Contents[0] != ELFAttrs::Format_Version || Contents.size() == 1)
-          return Error::success();
-
-        if (Error E = Attributes.parse(Contents, ELFT::TargetEndianness))
-          return E;
-        break;
-      }
+      if (Sec.sh_type != Type)
+        continue;
+      auto ErrorOrContents = EF.getSectionContents(Sec);
+      if (!ErrorOrContents)
+        return ErrorOrContents.takeError();
+
+      auto Contents = ErrorOrContents.get();
+      if (Contents[0] != ELFAttrs::Format_Version || Contents.size() == 1)
+        return Error::success();
+
+      if (Error E = Attributes.parse(Contents, ELFT::TargetEndianness))
+        return E;
+      break;
     }
     return Error::success();
   }
diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
index 497f82bbd0f32..f7f8d5e6bf472 100644
--- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
@@ -56,7 +56,7 @@ struct DXILProgram {
   std::optional<std::vector<llvm::yaml::Hex8>> DXIL;
 };
 
-#define SHADER_FEATURE_FLAG(Num, Val, Str) bool Val = false;
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str) bool Val = false;
 struct ShaderFeatureFlags {
   ShaderFeatureFlags() = default;
   ShaderFeatureFlags(uint64_t FlagData);
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 5cbde3fbda7e9..a356e42748669 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -634,6 +634,52 @@ class PassBuilder {
   void invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM,
                                                     OptimizationLevel Level);
 
+  static bool checkParametrizedPassName(StringRef Name, StringRef PassName) {
+    if (!Name.consume_front(PassName))
+      return false;
+    // normal pass name w/o parameters == default parameters
+    if (Name.empty())
+      return true;
+    return Name.starts_with("<") && Name.ends_with(">");
+  }
+
+  /// This performs customized parsing of pass name with parameters.
+  ///
+  /// We do not need parametrization of passes in textual pipeline very often,
+  /// yet on a rare occasion ability to specify parameters right there can be
+  /// useful.
+  ///
+  /// \p Name - parameterized specification of a pass from a textual pipeline
+  /// is a string in a form of :
+  ///      PassName '<' parameter-list '>'
+  ///
+  /// Parameter list is being parsed by the parser callable argument, \p Parser,
+  /// It takes a string-ref of parameters and returns either StringError or a
+  /// parameter list in a form of a custom parameters type, all wrapped into
+  /// Expected<> template class.
+  ///
+  template <typename ParametersParseCallableT>
+  static auto parsePassParameters(ParametersParseCallableT &&Parser,
+                                  StringRef Name, StringRef PassName)
+      -> decltype(Parser(StringRef{})) {
+    using ParametersT = typename decltype(Parser(StringRef{}))::value_type;
+
+    StringRef Params = Name;
+    if (!Params.consume_front(PassName)) {
+      llvm_unreachable(
+          "unable to strip pass name from parametrized pass specification");
+    }
+    if (!Params.empty() &&
+        (!Params.consume_front("<") || !Params.consume_back(">"))) {
+      llvm_unreachable("invalid format for parametrized pass name");
+    }
+
+    Expected<ParametersT> Result = Parser(Params);
+    assert((Result || Result.template errorIsA<StringError>()) &&
+           "Pass parameter parser can only return StringErrors.");
+    return Result;
+  }
+
 private:
   // O1 pass pipeline
   FunctionPassManager
diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index 84cac3ef700e0..a119b0724d677 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -46,9 +46,12 @@
 
 // Sets bits for specified bit mask in specified destination.
 #ifndef AMDHSA_BITS_SET
-#define AMDHSA_BITS_SET(DST, MSK, VAL)  \
-  DST &= ~MSK;                          \
-  DST |= ((VAL << MSK ## _SHIFT) & MSK)
+#define AMDHSA_BITS_SET(DST, MSK, VAL)                                         \
+  do {                                                                         \
+    auto local = VAL;                                                          \
+    DST &= ~MSK;                                                               \
+    DST |= ((local << MSK##_SHIFT) & MSK);                                     \
+  } while (0)
 #endif // AMDHSA_BITS_SET
 
 namespace llvm {
diff --git a/llvm/include/llvm/Support/HexagonAttributeParser.h b/llvm/include/llvm/Support/HexagonAttributeParser.h
new file mode 100644
index 0000000000000..1116dd42b1ad0
--- /dev/null
+++ b/llvm/include/llvm/Support/HexagonAttributeParser.h
@@ -0,0 +1,37 @@
+//===-- HexagonAttributeParser.h - Hexagon Attribute Parser -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_HEXAGONATTRIBUTEPARSER_H
+#define LLVM_SUPPORT_HEXAGONATTRIBUTEPARSER_H
+
+#include "llvm/Support/ELFAttributeParser.h"
+#include "llvm/Support/HexagonAttributes.h"
+
+namespace llvm {
+class HexagonAttributeParser : public ELFAttributeParser {
+  struct DisplayHandler {
+    HexagonAttrs::AttrType Attribute;
+    Error (HexagonAttributeParser::*Routine)(unsigned);
+  };
+
+  static const DisplayHandler DisplayRoutines[];
+
+  Error handler(uint64_t Tag, bool &Handled) override;
+
+public:
+  HexagonAttributeParser(ScopedPrinter *SP)
+      : ELFAttributeParser(SP, HexagonAttrs::getHexagonAttributeTags(),
+                           "hexagon") {}
+  HexagonAttributeParser()
+      : ELFAttributeParser(HexagonAttrs::getHexagonAttributeTags(), "hexagon") {
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/Support/HexagonAttributes.h b/llvm/include/llvm/Support/HexagonAttributes.h
new file mode 100644
index 0000000000000..8a50d8993e633
--- /dev/null
+++ b/llvm/include/llvm/Support/HexagonAttributes.h
@@ -0,0 +1,32 @@
+//===-- HexagonAttributes.h - Qualcomm Hexagon Attributes -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_HEXAGONATTRIBUTES_H
+#define LLVM_SUPPORT_HEXAGONATTRIBUTES_H
+
+#include "llvm/Support/ELFAttributes.h"
+
+namespace llvm {
+namespace HexagonAttrs {
+
+const TagNameMap &getHexagonAttributeTags();
+
+enum AttrType : unsigned {
+  ARCH = 4,
+  HVXARCH = 5,
+  HVXIEEEFP = 6,
+  HVXQFLOAT = 7,
+  ZREG = 8,
+  AUDIO = 9,
+  CABAC = 10
+};
+
+} // namespace HexagonAttrs
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index 7fc572b6ff06e..c4e741549f3ff 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -200,6 +200,20 @@ inline int64_t decodeSLEB128(const uint8_t *p, unsigned *n = nullptr,
   return Value;
 }
 
+inline uint64_t decodeULEB128AndInc(const uint8_t *&p) {
+  unsigned n;
+  auto ret = decodeULEB128(p, &n);
+  p += n;
+  return ret;
+}
+
+inline int64_t decodeSLEB128AndInc(const uint8_t *&p) {
+  unsigned n;
+  auto ret = decodeSLEB128(p, &n);
+  p += n;
+  return ret;
+}
+
 /// Utility function to get the size of the ULEB128-encoded value.
 extern unsigned getULEB128Size(uint64_t Value);
 
diff --git a/llvm/include/llvm/Support/TimeProfiler.h b/llvm/include/llvm/Support/TimeProfiler.h
index 454a65f70231f..31f7df10916db 100644
--- a/llvm/include/llvm/Support/TimeProfiler.h
+++ b/llvm/include/llvm/Support/TimeProfiler.h
@@ -86,6 +86,8 @@ class raw_pwrite_stream;
 struct TimeTraceProfiler;
 TimeTraceProfiler *getTimeTraceProfilerInstance();
 
+struct TimeTraceProfilerEntry;
+
 /// Initialize the time trace profiler.
 /// This sets up the global \p TimeTraceProfilerInstance
 /// variable to be the profiler instance.
@@ -120,19 +122,30 @@ Error timeTraceProfilerWrite(StringRef PreferredFileName,
 /// Profiler copies the string data, so the pointers can be given into
 /// temporaries. Time sections can be hierarchical; every Begin must have a
 /// matching End pair but they can nest.
-void timeTraceProfilerBegin(StringRef Name, StringRef Detail);
-void timeTraceProfilerBegin(StringRef Name,
-                            llvm::function_ref<std::string()> Detail);
+TimeTraceProfilerEntry *timeTraceProfilerBegin(StringRef Name,
+                                               StringRef Detail);
+TimeTraceProfilerEntry *
+timeTraceProfilerBegin(StringRef Name,
+                       llvm::function_ref<std::string()> Detail);
+
+/// Manually begin a time section, with the given \p Name and \p Detail.
+/// This starts Async Events having \p Name as a category which is shown
+/// separately from other traces. See
+/// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview#heading=h.jh64i9l3vwa1
+/// for more details.
+TimeTraceProfilerEntry *timeTraceAsyncProfilerBegin(StringRef Name,
+                                                    StringRef Detail);
 
 /// Manually end the last time section.
 void timeTraceProfilerEnd();
+void timeTraceProfilerEnd(TimeTraceProfilerEntry *E);
 
 /// The TimeTraceScope is a helper class to call the begin and end functions
 /// of the time trace profiler.  When the object is constructed, it begins
 /// the section; and when it is destroyed, it stops it. If the time profiler
 /// is not initialized, the overhead is a single branch.
-struct TimeTraceScope {
-
+class TimeTraceScope {
+public:
   TimeTraceScope() = delete;
   TimeTraceScope(const TimeTraceScope &) = delete;
   TimeTraceScope &operator=(const TimeTraceScope &) = delete;
@@ -141,20 +154,23 @@ struct TimeTraceScope {
 
   TimeTraceScope(StringRef Name) {
     if (getTimeTraceProfilerInstance() != nullptr)
-      timeTraceProfilerBegin(Name, StringRef(""));
+      Entry = timeTraceProfilerBegin(Name, StringRef(""));
   }
   TimeTraceScope(StringRef Name, StringRef Detail) {
     if (getTimeTraceProfilerInstance() != nullptr)
-      timeTraceProfilerBegin(Name, Detail);
+      Entry = timeTraceProfilerBegin(Name, Detail);
   }
   TimeTraceScope(StringRef Name, llvm::function_ref<std::string()> Detail) {
     if (getTimeTraceProfilerInstance() != nullptr)
-      timeTraceProfilerBegin(Name, Detail);
+      Entry = timeTraceProfilerBegin(Name, Detail);
   }
   ~TimeTraceScope() {
     if (getTimeTraceProfilerInstance() != nullptr)
-      timeTraceProfilerEnd();
+      timeTraceProfilerEnd(Entry);
   }
+
+private:
+  TimeTraceProfilerEntry *Entry = nullptr;
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h
index ef1fac92c2fa4..770ca8764426a 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -929,12 +929,12 @@ class RedirectingFileSystem
   /// Canonicalize path by removing ".", "..", "./", components. This is
   /// a VFS request, do not bother about symlinks in the path components
   /// but canonicalize in order to perform the correct entry search.
-  std::error_code makeCanonical(SmallVectorImpl<char> &Path) const;
+  std::error_code makeCanonicalForLookup(SmallVectorImpl<char> &Path) const;
 
   /// Get the File status, or error, from the underlying external file system.
   /// This returns the status with the originally requested name, while looking
-  /// up the entry using the canonical path.
-  ErrorOr<Status> getExternalStatus(const Twine &CanonicalPath,
+  /// up the entry using a potentially different path.
+  ErrorOr<Status> getExternalStatus(const Twine &LookupPath,
                                     const Twine &OriginalPath) const;
 
   /// Make \a Path an absolute path.
@@ -1022,7 +1022,7 @@ class RedirectingFileSystem
                  llvm::SmallVectorImpl<Entry *> &Entries) const;
 
   /// Get the status for a path with the provided \c LookupResult.
-  ErrorOr<Status> status(const Twine &CanonicalPath, const Twine &OriginalPath,
+  ErrorOr<Status> status(const Twine &LookupPath, const Twine &OriginalPath,
                          const LookupResult &Result);
 
 public:
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 18db7a819540a..6980cbd04aeb1 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -264,7 +264,7 @@ def combine_extracted_vector_load : GICombineRule<
   (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,
         [{ return Helper.matchCombineExtractedVectorLoad(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
-  
+
 def combine_indexed_load_store : GICombineRule<
   (defs root:$root, indexed_load_store_matchdata:$matchinfo),
   (match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root,
@@ -1090,12 +1090,6 @@ def mulo_by_0: GICombineRule<
          [{ return Helper.matchMulOBy0(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
-def addo_by_0: GICombineRule<
-  (defs root:$root, build_fn_matchinfo:$matchinfo),
-  (match (wip_match_opcode G_UADDO, G_SADDO):$root,
-         [{ return Helper.matchAddOBy0(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
-
 // Transform (uadde x, y, 0) -> (uaddo x, y)
 //           (sadde x, y, 0) -> (saddo x, y)
 //           (usube x, y, 0) -> (usubo x, y)
@@ -1291,6 +1285,12 @@ def match_ors : GICombineRule<
         [{ return Helper.matchOr(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+def match_addos : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SADDO, G_UADDO):$root,
+        [{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 // Combines concat operations
 def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
 def combine_concat_vector : GICombineRule<
@@ -1326,7 +1326,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
 
 def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,
                                      overlapping_and, mulo_by_2, mulo_by_0,
-                                     addo_by_0, adde_to_addo,
+                                     adde_to_addo,
                                      combine_minmax_nan]>;
 
 def known_bits_simplifications : GICombineGroup<[
@@ -1356,7 +1356,7 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
 
 def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     extract_vec_elt_combines, combines_for_extload, combine_extracted_vector_load,
-    undef_combines, identity_combines, phi_combines, 
+    undef_combines, identity_combines, phi_combines,
     simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big,
     reassocs, ptr_add_immed_chain,
     shl_ashr_to_sext_inreg, sext_inreg_of_load,
@@ -1373,8 +1373,8 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax, redundant_binop_in_equality,
-    fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, 
-    combine_concat_vector, double_icmp_zero_and_or_combine]>;
+    fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
+    combine_concat_vector, double_icmp_zero_and_or_combine, match_addos]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 0577c58f8da2d..1e40cc49040d6 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -947,8 +947,6 @@ class AsmOperandClass {
   /// instruction if it hasn't matched all the operands yet.  However, this
   /// error will be suppressed if all of the remaining unmatched operands are
   /// marked as IsOptional.
-  ///
-  /// Optional arguments must be at the end of the operand list.
   bit IsOptional = false;
 
   /// The name of the method on the target specific asm parser that returns the
@@ -1564,6 +1562,20 @@ class AsmParser {
   // method shall be called for all operands as opposed to only those
   // that have their own specified custom parsers.
   bit CallCustomParserForAllOperands = false;
+
+  // PreferSmallerInstructions - Should the assembly matcher prefer the smaller
+  // instructions.
+  // 
+  // This is useful for the ARM instructions set where smaller encodings must
+  // be preferentially selected.
+  //
+  // The preference order is: 
+  //    Instrution size (if this option is enabled, smallest first)
+  //    Number of Operands (least first), 
+  //    Operand Classes (lexicographically by operand), 
+  //    (Optional) Instruction id (see AsmMatcherEmitter.cpp for details), 
+  //    Number of required features (least first)
+  bit PreferSmallerInstructions = false;
 }
 def DefaultAsmParser : AsmParser;
 
@@ -1865,3 +1877,8 @@ include "llvm/Target/GlobalISel/SelectionDAGCompat.td"
 // Pull in the common support for Pfm Counters generation.
 //
 include "llvm/Target/TargetPfmCounters.td"
+
+//===----------------------------------------------------------------------===//
+// Pull in the common support for macro fusion.
+//
+include "llvm/Target/TargetMacroFusion.td"
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 37df9589e30d6..ceb371bdc7348 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -455,7 +455,7 @@ class LLVMTargetMachine : public TargetMachine {
 
   virtual Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &,
                                      raw_pwrite_stream *, CodeGenFileType,
-                                     CGPassBuilderOption,
+                                     const CGPassBuilderOption &,
                                      PassInstrumentationCallbacks *) {
     return make_error<StringError>("buildCodeGenPipeline is not overridden",
                                    inconvertibleErrorCode());
diff --git a/llvm/include/llvm/Target/TargetMacroFusion.td b/llvm/include/llvm/Target/TargetMacroFusion.td
new file mode 100644
index 0000000000000..0306794ef50d1
--- /dev/null
+++ b/llvm/include/llvm/Target/TargetMacroFusion.td
@@ -0,0 +1,153 @@
+//===-- TargetMacroFusion.td - Target Macro Fusion ---------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the TableGen-based macro fusion classes.
+
+// The target instruction that FusionPredicate will be evaluated on.
+class FusionTarget;
+def first_fusion_target : FusionTarget;
+def second_fusion_target : FusionTarget;
+def both_fusion_target : FusionTarget;
+
+// Base class of FusionPredicate, etc. The avaliable variables are:
+// * const TargetInstrInfo &TII
+// * const TargetSubtargetInfo &STI
+// * const MachineRegisterInfo &MRI
+// * const MachineInstr *FirstMI
+// * const MachineInstr &SecondMI
+class FusionPredicate<FusionTarget target> {
+  FusionTarget Target = target;
+}
+class FirstFusionPredicate: FusionPredicate<first_fusion_target>;
+class SecondFusionPredicate: FusionPredicate<second_fusion_target>;
+class BothFusionPredicate: FusionPredicate<both_fusion_target>;
+
+// FusionPredicate with raw code predicate.
+class FusionPredicateWithCode<code pred> : FusionPredicate<both_fusion_target> {
+  code Predicate = pred;
+}
+
+// FusionPredicate with MCInstPredicate.
+class FusionPredicateWithMCInstPredicate<FusionTarget target, MCInstPredicate pred>
+  : FusionPredicate<target> {
+  MCInstPredicate Predicate = pred;
+}
+class FirstFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
+  : FusionPredicateWithMCInstPredicate<first_fusion_target, pred>;
+class SecondFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
+  : FusionPredicateWithMCInstPredicate<second_fusion_target, pred>;
+// The pred will be applied on both firstMI and secondMI.
+class BothFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
+  : FusionPredicateWithMCInstPredicate<both_fusion_target, pred>;
+
+// Tie firstOpIdx and secondOpIdx. The operand of `FirstMI` at position
+// `firstOpIdx` should be the same as the operand of `SecondMI` at position
+// `secondOpIdx`.
+// If the fusion has `IsCommutable` being true and the operand at `secondOpIdx`
+// has commutable operand, then the commutable operand will be checked too.
+class TieReg<int firstOpIdx, int secondOpIdx> : BothFusionPredicate {
+  int FirstOpIdx = firstOpIdx;
+  int SecondOpIdx = secondOpIdx;
+}
+
+// The operand of `SecondMI` at position `firstOpIdx` should be the same as the
+// operand at position `secondOpIdx`.
+// If the fusion has `IsCommutable` being true and the operand at `secondOpIdx`
+// has commutable operand, then the commutable operand will be checked too.
+class SameReg<int firstOpIdx, int secondOpIdx> : SecondFusionPredicate {
+  int FirstOpIdx = firstOpIdx;
+  int SecondOpIdx = secondOpIdx;
+}
+
+// A predicate for wildcard. The generated code will be like:
+// ```
+// if (!FirstMI)
+//   return ReturnValue;
+// ```
+class WildcardPred<bit ret> : FirstFusionPredicate {
+  bit ReturnValue = ret;
+}
+def WildcardFalse : WildcardPred<0>;
+def WildcardTrue : WildcardPred<1>;
+
+// Indicates that the destination register of `FirstMI` should have one use if
+// it is a virtual register.
+class OneUsePred : FirstFusionPredicate;
+def OneUse : OneUsePred;
+
+// Handled by MacroFusionPredicatorEmitter backend.
+// The generated predicator will be like:
+// ```
+// bool isNAME(const TargetInstrInfo &TII,
+//             const TargetSubtargetInfo &STI,
+//             const MachineInstr *FirstMI,
+//             const MachineInstr &SecondMI) {
+//   auto &MRI = SecondMI.getMF()->getRegInfo();
+//   /* Predicates */
+//   return true;
+// }
+// ```
+//
+// `IsCommutable` means whether we should handle commutable operands.
+class Fusion<string name, string fieldName, string desc, list<FusionPredicate> predicates>
+  : SubtargetFeature<name, fieldName, "true", desc> {
+  list<FusionPredicate> Predicates = predicates;
+  bit IsCommutable = 0;
+}
+
+// The generated predicator will be like:
+// ```
+// bool isNAME(const TargetInstrInfo &TII,
+//             const TargetSubtargetInfo &STI,
+//             const MachineInstr *FirstMI,
+//             const MachineInstr &SecondMI) {
+//   auto &MRI = SecondMI.getMF()->getRegInfo();
+//   /* Prolog */
+//   /* Predicate for `SecondMI` */
+//   /* Wildcard */
+//   /* Predicate for `FirstMI` */
+//   /* Check same registers */
+//   /* Check One Use */
+//   /* Tie registers */
+//   /* Epilog */
+//   return true;
+// }
+// ```
+class SimpleFusion<string name, string fieldName, string desc,
+                   MCInstPredicate firstPred, MCInstPredicate secondPred,
+                   list<FusionPredicate> prolog = [],
+                   list<FusionPredicate> epilog = []>
+  : Fusion<name, fieldName, desc,
+           !listconcat(
+              prolog,
+              [
+                SecondFusionPredicateWithMCInstPredicate<secondPred>,
+                WildcardTrue,
+                FirstFusionPredicateWithMCInstPredicate<firstPred>,
+                SameReg<0, 1>,
+                OneUse,
+                TieReg<0, 1>,
+              ],
+              epilog)>;
+
+class SingleFusion<string name, string fieldName, string desc,
+                   Instruction firstInst, Instruction secondInst,
+                   MCInstPredicate firstInstPred = TruePred,
+                   MCInstPredicate secondInstPred = TruePred,
+                   list<FusionPredicate> prolog = [],
+                   list<FusionPredicate> epilog = []>
+  : SimpleFusion<name, fieldName, desc,
+                 CheckAll<!listconcat(
+                            [CheckOpcode<[firstInst]>],
+                            [firstInstPred])>,
+                 CheckAll<!listconcat(
+                            [CheckOpcode<[secondInst]>],
+                            [secondInstPred])>,
+                 prolog, epilog> {
+  let IsCommutable = secondInst.isCommutable;
+}
diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td
index 069eb2900bfe6..2562ed0901303 100644
--- a/llvm/include/llvm/Target/TargetSchedule.td
+++ b/llvm/include/llvm/Target/TargetSchedule.td
@@ -581,119 +581,3 @@ class MemoryQueue<ProcResourceKind PR> {
 
 class LoadQueue<ProcResourceKind LDQueue> : MemoryQueue<LDQueue>;
 class StoreQueue<ProcResourceKind STQueue> : MemoryQueue<STQueue>;
-
-// The target instruction that FusionPredicate will be evaluated on.
-class FusionTarget;
-def first_fusion_target : FusionTarget;
-def second_fusion_target : FusionTarget;
-def both_fusion_target : FusionTarget;
-
-// Base class of FusionPredicate, etc. The avaliable variables are:
-// * const TargetInstrInfo &TII
-// * const TargetSubtargetInfo &STI
-// * const MachineRegisterInfo &MRI
-// * const MachineInstr *FirstMI
-// * const MachineInstr &SecondMI
-class FusionPredicate<FusionTarget target> {
-  FusionTarget Target = target;
-}
-class FirstFusionPredicate: FusionPredicate<first_fusion_target>;
-class SecondFusionPredicate: FusionPredicate<second_fusion_target>;
-class BothFusionPredicate: FusionPredicate<both_fusion_target>;
-
-// FusionPredicate with raw code predicate.
-class FusionPredicateWithCode<code pred> : FusionPredicate<both_fusion_target> {
-  code Predicate = pred;
-}
-
-// FusionPredicate with MCInstPredicate.
-class FusionPredicateWithMCInstPredicate<FusionTarget target, MCInstPredicate pred>
-  : FusionPredicate<target> {
-  MCInstPredicate Predicate = pred;
-}
-class FirstFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
-  : FusionPredicateWithMCInstPredicate<first_fusion_target, pred>;
-class SecondFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
-  : FusionPredicateWithMCInstPredicate<second_fusion_target, pred>;
-// The pred will be applied on both firstMI and secondMI.
-class BothFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
-  : FusionPredicateWithMCInstPredicate<both_fusion_target, pred>;
-
-// Tie firstOpIdx and secondOpIdx. The operand of `FirstMI` at position
-// `firstOpIdx` should be the same as the operand of `SecondMI` at position
-// `secondOpIdx`.
-class TieReg<int firstOpIdx, int secondOpIdx> : BothFusionPredicate {
-  int FirstOpIdx = firstOpIdx;
-  int SecondOpIdx = secondOpIdx;
-}
-
-// A predicate for wildcard. The generated code will be like:
-// ```
-// if (!FirstMI)
-//   return ReturnValue;
-// ```
-class WildcardPred<bit ret> : FirstFusionPredicate {
-  bit ReturnValue = ret;
-}
-def WildcardFalse : WildcardPred<0>;
-def WildcardTrue : WildcardPred<1>;
-
-// Indicates that the destination register of `FirstMI` should have one use if
-// it is a virtual register.
-class OneUsePred : FirstFusionPredicate;
-def OneUse : OneUsePred;
-
-// Handled by MacroFusionPredicatorEmitter backend.
-// The generated predicator will be like:
-// ```
-// bool isNAME(const TargetInstrInfo &TII,
-//             const TargetSubtargetInfo &STI,
-//             const MachineInstr *FirstMI,
-//             const MachineInstr &SecondMI) {
-//   auto &MRI = SecondMI.getMF()->getRegInfo();
-//   /* Predicates */
-//   return true;
-// }
-// ```
-class Fusion<string name, string fieldName, string desc, list<FusionPredicate> predicates>
-  : SubtargetFeature<name, fieldName, "true", desc> {
-  list<FusionPredicate> Predicates = predicates;
-}
-
-// The generated predicator will be like:
-// ```
-// bool isNAME(const TargetInstrInfo &TII,
-//             const TargetSubtargetInfo &STI,
-//             const MachineInstr *FirstMI,
-//             const MachineInstr &SecondMI) {
-//   auto &MRI = SecondMI.getMF()->getRegInfo();
-//   /* Prolog */
-//   /* Predicate for `SecondMI` */
-//   /* Wildcard */
-//   /* Predicate for `FirstMI` */
-//   /* Check One Use */
-//   /* Tie registers */
-//   /* Epilog */
-//   return true;
-// }
-// ```
-class SimpleFusion<string name, string fieldName, string desc,
-                   MCInstPredicate firstPred, MCInstPredicate secondPred,
-                   list<FusionPredicate> prolog = [],
-                   list<FusionPredicate> epilog = []>
-  : Fusion<name, fieldName, desc,
-           !listconcat(
-              prolog,
-              [
-                SecondFusionPredicateWithMCInstPredicate<secondPred>,
-                WildcardTrue,
-                FirstFusionPredicateWithMCInstPredicate<firstPred>,
-                SecondFusionPredicateWithMCInstPredicate<
-                  CheckAny<[
-                    CheckIsVRegOperand<0>,
-                    CheckSameRegOperand<0, 1>
-                  ]>>,
-                OneUse,
-                TieReg<0, 1>,
-              ],
-              epilog)>;
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index d7bf8c35ee107..ea3520835fa07 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -318,7 +318,7 @@ def SDTAtomicStore : SDTypeProfile<0, 2, [
   SDTCisInt<0>, SDTCisPtrTy<1>
 ]>;
 def SDTAtomicLoad : SDTypeProfile<1, 1, [
-  SDTCisInt<0>, SDTCisPtrTy<1>
+  SDTCisPtrTy<1>
 ]>;
 
 class SDCallSeqStart<list<SDTypeConstraint> constraints> :
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index f4bb94f98bcd7..805b963a7a13c 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -554,6 +554,11 @@ inline constexpr CpuInfo CpuInfos[] = {
          {AArch64::AEK_SB, AArch64::AEK_SSBS, AArch64::AEK_MTE,
           AArch64::AEK_FP16FML, AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
           AArch64::AEK_FLAGM, AArch64::AEK_PERFMON, AArch64::AEK_PREDRES})},
+    {"cortex-a520ae", ARMV9_2A,
+     AArch64::ExtensionBitset(
+         {AArch64::AEK_SB, AArch64::AEK_SSBS, AArch64::AEK_MTE,
+          AArch64::AEK_FP16FML, AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
+          AArch64::AEK_FLAGM, AArch64::AEK_PERFMON, AArch64::AEK_PREDRES})},
     {"cortex-a57", ARMV8A,
      AArch64::ExtensionBitset(
          {AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_CRC})},
@@ -621,6 +626,12 @@ inline constexpr CpuInfo CpuInfos[] = {
                                AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
                                AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
                                AArch64::AEK_PREDRES, AArch64::AEK_PROFILE})},
+    {"cortex-a720ae", ARMV9_2A,
+     AArch64::ExtensionBitset({AArch64::AEK_SB, AArch64::AEK_SSBS,
+                               AArch64::AEK_MTE, AArch64::AEK_FP16FML,
+                               AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
+                               AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
+                               AArch64::AEK_PREDRES, AArch64::AEK_PROFILE})},
     {"cortex-r82", ARMV8R, AArch64::ExtensionBitset({AArch64::AEK_LSE})},
     {"cortex-x1", ARMV8_2A,
      AArch64::ExtensionBitset({AArch64::AEK_AES, AArch64::AEK_SHA2,
diff --git a/llvm/include/llvm/TextAPI/Record.h b/llvm/include/llvm/TextAPI/Record.h
index 98639b064eaad..ef152ce433877 100644
--- a/llvm/include/llvm/TextAPI/Record.h
+++ b/llvm/include/llvm/TextAPI/Record.h
@@ -51,7 +51,8 @@ class Record {
 public:
   Record() = default;
   Record(StringRef Name, RecordLinkage Linkage, SymbolFlags Flags)
-      : Name(Name), Linkage(Linkage), Flags(mergeFlags(Flags, Linkage)) {}
+      : Name(Name), Linkage(Linkage), Flags(mergeFlags(Flags, Linkage)),
+        Verified(false) {}
 
   bool isWeakDefined() const {
     return (Flags & SymbolFlags::WeakDefined) == SymbolFlags::WeakDefined;
@@ -79,6 +80,9 @@ class Record {
   bool isExported() const { return Linkage >= RecordLinkage::Rexported; }
   bool isRexported() const { return Linkage == RecordLinkage::Rexported; }
 
+  bool isVerified() const { return Verified; }
+  void setVerify(bool V = true) { Verified = V; }
+
   StringRef getName() const { return Name; }
   SymbolFlags getFlags() const { return Flags; }
 
@@ -89,6 +93,7 @@ class Record {
   StringRef Name;
   RecordLinkage Linkage;
   SymbolFlags Flags;
+  bool Verified;
 
   friend class RecordsSlice;
 };
diff --git a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
index a4d1653fdf4bc..48f8421c1813c 100644
--- a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
+++ b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
@@ -172,11 +172,12 @@ class ConstantHoistingPass : public PassInfoMixin<ConstantHoistingPass> {
 
   void collectMatInsertPts(
       const consthoist::RebasedConstantListType &RebasedConstants,
-      SmallVectorImpl<Instruction *> &MatInsertPts) const;
-  Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const;
-  SetVector<Instruction *>
-  findConstantInsertionPoint(const consthoist::ConstantInfo &ConstInfo,
-                             const ArrayRef<Instruction *> MatInsertPts) const;
+      SmallVectorImpl<BasicBlock::iterator> &MatInsertPts) const;
+  BasicBlock::iterator findMatInsertPt(Instruction *Inst,
+                                       unsigned Idx = ~0U) const;
+  SetVector<BasicBlock::iterator> findConstantInsertionPoint(
+      const consthoist::ConstantInfo &ConstInfo,
+      const ArrayRef<BasicBlock::iterator> MatInsertPts) const;
   void collectConstantCandidates(ConstCandMapType &ConstCandMap,
                                  Instruction *Inst, unsigned Idx,
                                  ConstantInt *ConstInt);
@@ -203,9 +204,9 @@ class ConstantHoistingPass : public PassInfoMixin<ConstantHoistingPass> {
   struct UserAdjustment {
     Constant *Offset;
     Type *Ty;
-    Instruction *MatInsertPt;
+    BasicBlock::iterator MatInsertPt;
     const consthoist::ConstantUser User;
-    UserAdjustment(Constant *O, Type *T, Instruction *I,
+    UserAdjustment(Constant *O, Type *T, BasicBlock::iterator I,
                    consthoist::ConstantUser U)
         : Offset(O), Ty(T), MatInsertPt(I), User(U) {}
   };
diff --git a/llvm/include/llvm/Transforms/Scalar/Float2Int.h b/llvm/include/llvm/Transforms/Scalar/Float2Int.h
index 337e229efcf37..83be329bed60b 100644
--- a/llvm/include/llvm/Transforms/Scalar/Float2Int.h
+++ b/llvm/include/llvm/Transforms/Scalar/Float2Int.h
@@ -44,7 +44,7 @@ class Float2IntPass : public PassInfoMixin<Float2IntPass> {
   std::optional<ConstantRange> calcRange(Instruction *I);
   void walkBackwards();
   void walkForwards();
-  bool validateAndTransform(const DataLayout &DL);
+  bool validateAndTransform();
   Value *convert(Instruction *I, Type *ToTy);
   void cleanup();
 
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index cc13362874362..9ae026fa95d21 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -262,21 +262,21 @@ CallInst *changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr);
 /// that has an associated llvm.dbg.declare intrinsic.
 void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
                                      StoreInst *SI, DIBuilder &Builder);
-void ConvertDebugDeclareToDebugValue(DPValue *DPV, StoreInst *SI,
+void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, StoreInst *SI,
                                      DIBuilder &Builder);
 
 /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
 /// that has an associated llvm.dbg.declare intrinsic.
 void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
                                      LoadInst *LI, DIBuilder &Builder);
-void ConvertDebugDeclareToDebugValue(DPValue *DPV, LoadInst *LI,
+void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, LoadInst *LI,
                                      DIBuilder &Builder);
 
 /// Inserts a llvm.dbg.value intrinsic after a phi that has an associated
 /// llvm.dbg.declare intrinsic.
 void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
                                      PHINode *LI, DIBuilder &Builder);
-void ConvertDebugDeclareToDebugValue(DPValue *DPV, PHINode *LI,
+void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, PHINode *LI,
                                      DIBuilder &Builder);
 
 /// Lowers llvm.dbg.declare intrinsics into appropriate set of
@@ -313,7 +313,7 @@ void salvageDebugInfo(Instruction &I);
 /// Mark undef if salvaging cannot be completed.
 void salvageDebugInfoForDbgValues(Instruction &I,
                                   ArrayRef<DbgVariableIntrinsic *> Insns,
-                                  ArrayRef<DPValue *> DPInsns);
+                                  ArrayRef<DbgVariableRecord *> DPInsns);
 
 /// Given an instruction \p I and DIExpression \p DIExpr operating on
 /// it, append the effects of \p I to the DIExpression operand list
diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
index eb00e6c4e856d..0a0e16d2a9e6e 100644
--- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
+++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/Alignment.h"
 
 namespace llvm {
@@ -53,7 +54,7 @@ struct AllocaInfo {
   SmallVector<IntrinsicInst *, 2> LifetimeEnd;
   SmallVector<DbgVariableIntrinsic *, 2> DbgVariableIntrinsics;
   // Non-intrinsic records of variable locations.
-  SmallVector<DPValue *, 2> DbgVariableRecords;
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
 };
 
 struct StackInfo {
@@ -78,6 +79,11 @@ class StackInfoBuilder {
 
 uint64_t getAllocaSizeInBytes(const AllocaInst &AI);
 void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Align);
+bool isLifetimeIntrinsic(Value *V);
+
+Value *readRegister(IRBuilder<> &IRB, StringRef Name);
+Value *getFP(IRBuilder<> &IRB);
+Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB);
 
 } // namespace memtag
 } // namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
index c79d436fc676b..29d96a0ab6bf5 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -23,7 +23,7 @@ class BasicBlock;
 class Instruction;
 class LoadInst;
 class PHINode;
-class DPValue;
+class DbgVariableRecord;
 template <typename T> class SmallVectorImpl;
 template <typename T> class SSAUpdaterTraits;
 class Type;
@@ -124,7 +124,8 @@ class SSAUpdater {
   void UpdateDebugValues(Instruction *I);
   void UpdateDebugValues(Instruction *I,
                          SmallVectorImpl<DbgValueInst *> &DbgValues);
-  void UpdateDebugValues(Instruction *I, SmallVectorImpl<DPValue *> &DbgValues);
+  void UpdateDebugValues(Instruction *I,
+                         SmallVectorImpl<DbgVariableRecord *> &DbgValues);
 
   /// Rewrite a use like \c RewriteUse but handling in-block definitions.
   ///
@@ -136,7 +137,7 @@ class SSAUpdater {
 private:
   Value *GetValueAtEndOfBlockInternal(BasicBlock *BB);
   void UpdateDebugValue(Instruction *I, DbgValueInst *DbgValue);
-  void UpdateDebugValue(Instruction *I, DPValue *DbgValue);
+  void UpdateDebugValue(Instruction *I, DbgVariableRecord *DbgValue);
 };
 
 /// Helper class for promoting a collection of loads and stores into SSA
diff --git a/llvm/include/llvm/Transforms/Utils/ValueMapper.h b/llvm/include/llvm/Transforms/Utils/ValueMapper.h
index dd183bc04f0c2..54e3e62dc3af5 100644
--- a/llvm/include/llvm/Transforms/Utils/ValueMapper.h
+++ b/llvm/include/llvm/Transforms/Utils/ValueMapper.h
@@ -180,8 +180,9 @@ class ValueMapper {
   Constant *mapConstant(const Constant &C);
 
   void remapInstruction(Instruction &I);
-  void remapDPValue(Module *M, DPValue &V);
-  void remapDPValueRange(Module *M, iterator_range<DbgRecordIterator> Range);
+  void remapDbgVariableRecord(Module *M, DbgVariableRecord &V);
+  void remapDbgVariableRecordRange(Module *M,
+                                   iterator_range<DbgRecordIterator> Range);
   void remapFunction(Function &F);
   void remapGlobalObjectMetadata(GlobalObject &GO);
 
@@ -267,21 +268,26 @@ inline void RemapInstruction(Instruction *I, ValueToValueMapTy &VM,
   ValueMapper(VM, Flags, TypeMapper, Materializer).remapInstruction(*I);
 }
 
-/// Remap the Values used in the DPValue \a V using the value map \a VM.
-inline void RemapDPValue(Module *M, DPValue *V, ValueToValueMapTy &VM,
-                         RemapFlags Flags = RF_None,
-                         ValueMapTypeRemapper *TypeMapper = nullptr,
-                         ValueMaterializer *Materializer = nullptr) {
-  ValueMapper(VM, Flags, TypeMapper, Materializer).remapDPValue(M, *V);
+/// Remap the Values used in the DbgVariableRecord \a V using the value map \a
+/// VM.
+inline void RemapDbgVariableRecord(Module *M, DbgVariableRecord *V,
+                                   ValueToValueMapTy &VM,
+                                   RemapFlags Flags = RF_None,
+                                   ValueMapTypeRemapper *TypeMapper = nullptr,
+                                   ValueMaterializer *Materializer = nullptr) {
+  ValueMapper(VM, Flags, TypeMapper, Materializer)
+      .remapDbgVariableRecord(M, *V);
 }
 
-/// Remap the Values used in the DPValue \a V using the value map \a VM.
-inline void RemapDPValueRange(Module *M,
-                              iterator_range<DbgRecordIterator> Range,
-                              ValueToValueMapTy &VM, RemapFlags Flags = RF_None,
-                              ValueMapTypeRemapper *TypeMapper = nullptr,
-                              ValueMaterializer *Materializer = nullptr) {
-  ValueMapper(VM, Flags, TypeMapper, Materializer).remapDPValueRange(M, Range);
+/// Remap the Values used in the DbgVariableRecord \a V using the value map \a
+/// VM.
+inline void
+RemapDbgVariableRecordRange(Module *M, iterator_range<DbgRecordIterator> Range,
+                            ValueToValueMapTy &VM, RemapFlags Flags = RF_None,
+                            ValueMapTypeRemapper *TypeMapper = nullptr,
+                            ValueMaterializer *Materializer = nullptr) {
+  ValueMapper(VM, Flags, TypeMapper, Materializer)
+      .remapDbgVariableRecordRange(M, Range);
 }
 
 /// Remap the operands, metadata, arguments, and instructions of a function.
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 8b7031e7fe4a6..6b2e88d06b18c 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2534,12 +2534,73 @@ static Constant *evaluateCompare(const APFloat &Op1, const APFloat &Op2,
   return nullptr;
 }
 
-static Constant *ConstantFoldScalarCall2(StringRef Name,
-                                         Intrinsic::ID IntrinsicID,
-                                         Type *Ty,
-                                         ArrayRef<Constant *> Operands,
-                                         const TargetLibraryInfo *TLI,
-                                         const CallBase *Call) {
+static Constant *ConstantFoldLibCall2(StringRef Name, Type *Ty,
+                                      ArrayRef<Constant *> Operands,
+                                      const TargetLibraryInfo *TLI) {
+  if (!TLI)
+    return nullptr;
+
+  LibFunc Func = NotLibFunc;
+  if (!TLI->getLibFunc(Name, Func))
+    return nullptr;
+
+  const auto *Op1 = dyn_cast<ConstantFP>(Operands[0]);
+  if (!Op1)
+    return nullptr;
+
+  const auto *Op2 = dyn_cast<ConstantFP>(Operands[1]);
+  if (!Op2)
+    return nullptr;
+
+  const APFloat &Op1V = Op1->getValueAPF();
+  const APFloat &Op2V = Op2->getValueAPF();
+
+  switch (Func) {
+  default:
+    break;
+  case LibFunc_pow:
+  case LibFunc_powf:
+  case LibFunc_pow_finite:
+  case LibFunc_powf_finite:
+    if (TLI->has(Func))
+      return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
+    break;
+  case LibFunc_fmod:
+  case LibFunc_fmodf:
+    if (TLI->has(Func)) {
+      APFloat V = Op1->getValueAPF();
+      if (APFloat::opStatus::opOK == V.mod(Op2->getValueAPF()))
+        return ConstantFP::get(Ty->getContext(), V);
+    }
+    break;
+  case LibFunc_remainder:
+  case LibFunc_remainderf:
+    if (TLI->has(Func)) {
+      APFloat V = Op1->getValueAPF();
+      if (APFloat::opStatus::opOK == V.remainder(Op2->getValueAPF()))
+        return ConstantFP::get(Ty->getContext(), V);
+    }
+    break;
+  case LibFunc_atan2:
+  case LibFunc_atan2f:
+    // atan2(+/-0.0, +/-0.0) is known to raise an exception on some libm
+    // (Solaris), so we do not assume a known result for that.
+    if (Op1V.isZero() && Op2V.isZero())
+      return nullptr;
+    [[fallthrough]];
+  case LibFunc_atan2_finite:
+  case LibFunc_atan2f_finite:
+    if (TLI->has(Func))
+      return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
+    break;
+  }
+
+  return nullptr;
+}
+
+static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty,
+                                            ArrayRef<Constant *> Operands,
+                                            const CallBase *Call) {
   assert(Operands.size() == 2 && "Wrong number of operands.");
 
   if (Ty->isFloatingPointTy()) {
@@ -2569,7 +2630,8 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
         return nullptr;
       const APFloat &Op2V = Op2->getValueAPF();
 
-      if (const auto *ConstrIntr = dyn_cast<ConstrainedFPIntrinsic>(Call)) {
+      if (const auto *ConstrIntr =
+              dyn_cast_if_present<ConstrainedFPIntrinsic>(Call)) {
         RoundingMode RM = getEvaluationRoundingMode(ConstrIntr);
         APFloat Res = Op1V;
         APFloat::opStatus St;
@@ -2632,52 +2694,6 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
         return ConstantFP::get(Ty->getContext(), Op1V * Op2V);
       }
 
-      if (!TLI)
-        return nullptr;
-
-      LibFunc Func = NotLibFunc;
-      if (!TLI->getLibFunc(Name, Func))
-        return nullptr;
-
-      switch (Func) {
-      default:
-        break;
-      case LibFunc_pow:
-      case LibFunc_powf:
-      case LibFunc_pow_finite:
-      case LibFunc_powf_finite:
-        if (TLI->has(Func))
-          return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
-        break;
-      case LibFunc_fmod:
-      case LibFunc_fmodf:
-        if (TLI->has(Func)) {
-          APFloat V = Op1->getValueAPF();
-          if (APFloat::opStatus::opOK == V.mod(Op2->getValueAPF()))
-            return ConstantFP::get(Ty->getContext(), V);
-        }
-        break;
-      case LibFunc_remainder:
-      case LibFunc_remainderf:
-        if (TLI->has(Func)) {
-          APFloat V = Op1->getValueAPF();
-          if (APFloat::opStatus::opOK == V.remainder(Op2->getValueAPF()))
-            return ConstantFP::get(Ty->getContext(), V);
-        }
-        break;
-      case LibFunc_atan2:
-      case LibFunc_atan2f:
-        // atan2(+/-0.0, +/-0.0) is known to raise an exception on some libm
-        // (Solaris), so we do not assume a known result for that.
-        if (Op1V.isZero() && Op2V.isZero())
-          return nullptr;
-        [[fallthrough]];
-      case LibFunc_atan2_finite:
-      case LibFunc_atan2f_finite:
-        if (TLI->has(Func))
-          return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
-        break;
-      }
     } else if (auto *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
       switch (IntrinsicID) {
       case Intrinsic::ldexp: {
@@ -3168,8 +3184,13 @@ static Constant *ConstantFoldScalarCall(StringRef Name,
   if (Operands.size() == 1)
     return ConstantFoldScalarCall1(Name, IntrinsicID, Ty, Operands, TLI, Call);
 
-  if (Operands.size() == 2)
-    return ConstantFoldScalarCall2(Name, IntrinsicID, Ty, Operands, TLI, Call);
+  if (Operands.size() == 2) {
+    if (Constant *FoldedLibCall =
+            ConstantFoldLibCall2(Name, Ty, Operands, TLI)) {
+      return FoldedLibCall;
+    }
+    return ConstantFoldIntrinsicCall2(IntrinsicID, Ty, Operands, Call);
+  }
 
   if (Operands.size() == 3)
     return ConstantFoldScalarCall3(Name, IntrinsicID, Ty, Operands, TLI, Call);
@@ -3376,6 +3397,13 @@ ConstantFoldStructCall(StringRef Name, Intrinsic::ID IntrinsicID,
 
 } // end anonymous namespace
 
+Constant *llvm::ConstantFoldBinaryIntrinsic(Intrinsic::ID ID, Constant *LHS,
+                                            Constant *RHS, Type *Ty,
+                                            Instruction *FMFSource) {
+  return ConstantFoldIntrinsicCall2(ID, Ty, {LHS, RHS},
+                                    dyn_cast_if_present<CallBase>(FMFSource));
+}
+
 Constant *llvm::ConstantFoldCall(const CallBase *Call, Function *F,
                                  ArrayRef<Constant *> Operands,
                                  const TargetLibraryInfo *TLI) {
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index ce651783caf16..06283cb4bb934 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -5767,11 +5767,16 @@ static Value *simplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF,
     if (FMF.noNaNs() && FMF.noSignedZeros())
       return ConstantFP::getZero(Op0->getType());
 
-    // +normal number * (-)0.0 --> (-)0.0
     KnownFPClass Known =
         computeKnownFPClass(Op0, FMF, fcInf | fcNan, /*Depth=*/0, Q);
-    if (Known.SignBit == false && Known.isKnownNever(fcInf | fcNan))
-      return Op1;
+    if (Known.isKnownNever(fcInf | fcNan)) {
+      // +normal number * (-)0.0 --> (-)0.0
+      if (Known.SignBit == false)
+        return Op1;
+      // -normal number * (-)0.0 --> -(-)0.0
+      if (Known.SignBit == true)
+        return foldConstant(Instruction::FNeg, Op1, Q);
+    }
   }
 
   // sqrt(X) * sqrt(X) --> X, if we can:
@@ -6384,11 +6389,10 @@ static Value *foldMinimumMaximumSharedOp(Intrinsic::ID IID, Value *Op0,
   return nullptr;
 }
 
-static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
-                                      const SimplifyQuery &Q,
-                                      const CallBase *Call) {
-  Intrinsic::ID IID = F->getIntrinsicID();
-  Type *ReturnType = F->getReturnType();
+Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType,
+                                     Value *Op0, Value *Op1,
+                                     const SimplifyQuery &Q,
+                                     const CallBase *Call) {
   unsigned BitWidth = ReturnType->getScalarSizeInBits();
   switch (IID) {
   case Intrinsic::abs:
@@ -6646,19 +6650,21 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
     // float, if the ninf flag is set.
     const APFloat *C;
     if (match(Op1, m_APFloat(C)) &&
-        (C->isInfinity() || (Call->hasNoInfs() && C->isLargest()))) {
+        (C->isInfinity() || (Call && Call->hasNoInfs() && C->isLargest()))) {
       // minnum(X, -inf) -> -inf
       // maxnum(X, +inf) -> +inf
       // minimum(X, -inf) -> -inf if nnan
       // maximum(X, +inf) -> +inf if nnan
-      if (C->isNegative() == IsMin && (!PropagateNaN || Call->hasNoNaNs()))
+      if (C->isNegative() == IsMin &&
+          (!PropagateNaN || (Call && Call->hasNoNaNs())))
         return ConstantFP::get(ReturnType, *C);
 
       // minnum(X, +inf) -> X if nnan
       // maxnum(X, -inf) -> X if nnan
       // minimum(X, +inf) -> X
       // maximum(X, -inf) -> X
-      if (C->isNegative() != IsMin && (PropagateNaN || Call->hasNoNaNs()))
+      if (C->isNegative() != IsMin &&
+          (PropagateNaN || (Call && Call->hasNoNaNs())))
         return Op0;
     }
 
@@ -6672,8 +6678,6 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
     break;
   }
   case Intrinsic::vector_extract: {
-    Type *ReturnType = F->getReturnType();
-
     // (extract_vector (insert_vector _, X, 0), 0) -> X
     unsigned IdxN = cast<ConstantInt>(Op1)->getZExtValue();
     Value *X = nullptr;
@@ -6720,7 +6724,8 @@ static Value *simplifyIntrinsic(CallBase *Call, Value *Callee,
     return simplifyUnaryIntrinsic(F, Args[0], Q, Call);
 
   if (NumOperands == 2)
-    return simplifyBinaryIntrinsic(F, Args[0], Args[1], Q, Call);
+    return simplifyBinaryIntrinsic(IID, F->getReturnType(), Args[0], Args[1], Q,
+                                   Call);
 
   // Handle intrinsics with 3 or more arguments.
   switch (IID) {
diff --git a/llvm/lib/Analysis/PHITransAddr.cpp b/llvm/lib/Analysis/PHITransAddr.cpp
index 5700fd664a4cd..d16dec0d42927 100644
--- a/llvm/lib/Analysis/PHITransAddr.cpp
+++ b/llvm/lib/Analysis/PHITransAddr.cpp
@@ -369,7 +369,7 @@ Value *PHITransAddr::insertTranslatedSubExpr(
     // Otherwise insert a cast at the end of PredBB.
     CastInst *New = CastInst::Create(Cast->getOpcode(), OpVal, InVal->getType(),
                                      InVal->getName() + ".phi.trans.insert",
-                                     PredBB->getTerminator());
+                                     PredBB->getTerminator()->getIterator());
     New->setDebugLoc(Inst->getDebugLoc());
     NewInsts.push_back(New);
     return New;
@@ -387,7 +387,8 @@ Value *PHITransAddr::insertTranslatedSubExpr(
 
     GetElementPtrInst *Result = GetElementPtrInst::Create(
         GEP->getSourceElementType(), GEPOps[0], ArrayRef(GEPOps).slice(1),
-        InVal->getName() + ".phi.trans.insert", PredBB->getTerminator());
+        InVal->getName() + ".phi.trans.insert",
+        PredBB->getTerminator()->getIterator());
     Result->setDebugLoc(Inst->getDebugLoc());
     Result->setIsInBounds(GEP->isInBounds());
     NewInsts.push_back(Result);
@@ -408,9 +409,9 @@ Value *PHITransAddr::insertTranslatedSubExpr(
     if (OpVal == nullptr)
       return nullptr;
 
-    BinaryOperator *Res = BinaryOperator::CreateAdd(OpVal, Inst->getOperand(1),
-                                           InVal->getName()+".phi.trans.insert",
-                                                    PredBB->getTerminator());
+    BinaryOperator *Res = BinaryOperator::CreateAdd(
+        OpVal, Inst->getOperand(1), InVal->getName() + ".phi.trans.insert",
+        PredBB->getTerminator()->getIterator());
     Res->setHasNoSignedWrap(cast<BinaryOperator>(Inst)->hasNoSignedWrap());
     Res->setHasNoUnsignedWrap(cast<BinaryOperator>(Inst)->hasNoUnsignedWrap());
     NewInsts.push_back(Res);
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 4babfa292e166..3926524fd05e8 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -571,16 +571,12 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
     break;
   case Triple::Linux:
     // exp10, exp10f, exp10l is available on Linux (GLIBC) but are extremely
-    // buggy prior to glibc version 2.18. Until this version is widely deployed
-    // or we have a reasonable detection strategy, we cannot use exp10 reliably
-    // on Linux.
-    //
-    // Fall through to disable all of them.
-    [[fallthrough]];
-  default:
-    TLI.setUnavailable(LibFunc_exp10);
-    TLI.setUnavailable(LibFunc_exp10f);
-    TLI.setUnavailable(LibFunc_exp10l);
+    // buggy prior to glibc version 2.18. As this version is so old, we
+    // don't really need to worry about using exp10 on Linux.
+    TLI.setAvailableWithName(LibFunc_exp10, "__exp10");
+    TLI.setAvailableWithName(LibFunc_exp10f, "__exp10f");
+    TLI.setAvailableWithName(LibFunc_exp10l, "__exp10l");
+    break;
   }
 
   // ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and
@@ -852,6 +848,9 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc_strndup);
     TLI.setUnavailable(LibFunc_strnlen);
     TLI.setUnavailable(LibFunc_toascii);
+    TLI.setUnavailable(LibFunc_exp10);
+    TLI.setUnavailable(LibFunc_exp10f);
+    TLI.setUnavailable(LibFunc_exp10l);
   }
 
   // As currently implemented in clang, NVPTX code has no standard library to
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 15311be4dba27..2e0bd84339659 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -9,6 +9,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfoImpl.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Dominators.h"
@@ -874,7 +875,22 @@ TargetTransformInfo::getOperandInfo(const Value *V) {
 InstructionCost TargetTransformInfo::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     OperandValueInfo Op1Info, OperandValueInfo Op2Info,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
+    ArrayRef<const Value *> Args, const Instruction *CxtI,
+    const TargetLibraryInfo *TLibInfo) const {
+
+  // Use call cost for frem intructions that have platform specific vector math
+  // functions, as those will be replaced with calls later by SelectionDAG or
+  // ReplaceWithVecLib pass.
+  if (TLibInfo && Opcode == Instruction::FRem) {
+    VectorType *VecTy = dyn_cast<VectorType>(Ty);
+    LibFunc Func;
+    if (VecTy &&
+        TLibInfo->getLibFunc(Instruction::FRem, Ty->getScalarType(), Func) &&
+        TLibInfo->isFunctionVectorizable(TLibInfo->getName(Func),
+                                         VecTy->getElementCount()))
+      return getCallInstrCost(nullptr, VecTy, {VecTy, VecTy}, CostKind);
+  }
+
   InstructionCost Cost =
       TTIImpl->getArithmeticInstrCost(Opcode, Ty, CostKind,
                                       Op1Info, Op2Info,
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 8a4a2c4f92a0d..58bc68b576b0b 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1023,11 +1023,44 @@ static void computeKnownBitsFromOperator(const Operator *I,
     break;
   }
   case Instruction::Select: {
-    computeKnownBits(I->getOperand(2), Known, Depth + 1, Q);
-    computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
-
+    auto ComputeForArm = [&](Value *Arm, bool Invert) {
+      KnownBits Res(Known.getBitWidth());
+      computeKnownBits(Arm, Res, Depth + 1, Q);
+      // If we have a constant arm, we are done.
+      if (Res.isConstant())
+        return Res;
+
+      // See what condition implies about the bits of the two select arms.
+      KnownBits CondRes(Res.getBitWidth());
+      computeKnownBitsFromCond(Arm, I->getOperand(0), CondRes, Depth + 1, Q,
+                               Invert);
+      // If we don't get any information from the condition, no reason to
+      // proceed.
+      if (CondRes.isUnknown())
+        return Res;
+
+      // We can have conflict if the condition is dead. I.e if we have
+      // (x | 64) < 32 ? (x | 64) : y
+      // we will have conflict at bit 6 from the condition/the `or`.
+      // In that case just return. Its not particularly important
+      // what we do, as this select is going to be simplified soon.
+      CondRes = CondRes.unionWith(Res);
+      if (CondRes.hasConflict())
+        return Res;
+
+      // Finally make sure the information we found is valid. This is relatively
+      // expensive so it's left for the very end.
+      if (!isGuaranteedNotToBeUndef(Arm, Q.AC, Q.CxtI, Q.DT, Depth + 1))
+        return Res;
+
+      // Finally, we know we get information from the condition and its valid,
+      // so return it.
+      return CondRes;
+    };
     // Only known if known in both the LHS and RHS.
-    Known = Known.intersectWith(Known2);
+    Known =
+        ComputeForArm(I->getOperand(1), /*Invert=*/false)
+            .intersectWith(ComputeForArm(I->getOperand(2), /*Invert=*/true));
     break;
   }
   case Instruction::FPTrunc:
@@ -2726,27 +2759,33 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
     auto *LI = cast<LoadInst>(I);
     // A Load tagged with nonnull or dereferenceable with null pointer undefined
     // is never null.
-    if (auto *PtrT = dyn_cast<PointerType>(I->getType()))
+    if (auto *PtrT = dyn_cast<PointerType>(I->getType())) {
       if (Q.IIQ.getMetadata(LI, LLVMContext::MD_nonnull) ||
           (Q.IIQ.getMetadata(LI, LLVMContext::MD_dereferenceable) &&
            !NullPointerIsDefined(LI->getFunction(), PtrT->getAddressSpace())))
         return true;
+    } else if (MDNode *Ranges = Q.IIQ.getMetadata(LI, LLVMContext::MD_range)) {
+      return rangeMetadataExcludesValue(Ranges, APInt::getZero(BitWidth));
+    }
 
     // No need to fall through to computeKnownBits as range metadata is already
     // handled in isKnownNonZero.
     return false;
   }
   case Instruction::Call:
-  case Instruction::Invoke:
+  case Instruction::Invoke: {
+    const auto *Call = cast<CallBase>(I);
     if (I->getType()->isPointerTy()) {
-      const auto *Call = cast<CallBase>(I);
       if (Call->isReturnNonNull())
         return true;
       if (const auto *RP = getArgumentAliasingToReturnedPointer(Call, true))
         return isKnownNonZero(RP, Depth, Q);
-    } else if (const Value *RV = cast<CallBase>(I)->getReturnedArgOperand()) {
-      if (RV->getType() == I->getType() && isKnownNonZero(RV, Depth, Q))
-        return true;
+    } else {
+      if (MDNode *Ranges = Q.IIQ.getMetadata(Call, LLVMContext::MD_range))
+        return rangeMetadataExcludesValue(Ranges, APInt::getZero(BitWidth));
+      if (const Value *RV = Call->getReturnedArgOperand())
+        if (RV->getType() == I->getType() && isKnownNonZero(RV, Depth, Q))
+          return true;
     }
 
     if (auto *II = dyn_cast<IntrinsicInst>(I)) {
@@ -2816,6 +2855,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
 
     return false;
   }
+  }
 
   KnownBits Known(BitWidth);
   computeKnownBits(I, DemandedElts, Known, Depth, Q);
@@ -2830,9 +2870,9 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
 /// Supports values with integer or pointer type and vectors of integers.
 bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
                     const SimplifyQuery &Q) {
+  Type *Ty = V->getType();
 
 #ifndef NDEBUG
-  Type *Ty = V->getType();
   assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
 
   if (auto *FVTy = dyn_cast<FixedVectorType>(Ty)) {
@@ -2854,7 +2894,7 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
 
     // For constant vectors, check that all elements are undefined or known
     // non-zero to determine that the whole vector is known non-zero.
-    if (auto *VecTy = dyn_cast<FixedVectorType>(C->getType())) {
+    if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
       for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) {
         if (!DemandedElts[i])
           continue;
@@ -2881,18 +2921,6 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
       return false;
   }
 
-  if (auto *I = dyn_cast<Instruction>(V)) {
-    if (MDNode *Ranges = Q.IIQ.getMetadata(I, LLVMContext::MD_range)) {
-      // If the possible ranges don't contain zero, then the value is
-      // definitely non-zero.
-      if (auto *Ty = dyn_cast<IntegerType>(V->getType())) {
-        const APInt ZeroValue(Ty->getBitWidth(), 0);
-        if (rangeMetadataExcludesValue(Ranges, ZeroValue))
-          return true;
-      }
-    }
-  }
-
   if (!isa<Constant>(V) && isKnownNonZeroFromAssume(V, Q))
     return true;
 
@@ -2902,7 +2930,7 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
 
   // Check for pointer simplifications.
 
-  if (PointerType *PtrTy = dyn_cast<PointerType>(V->getType())) {
+  if (PointerType *PtrTy = dyn_cast<PointerType>(Ty)) {
     // A byval, inalloca may not be null in a non-default addres space. A
     // nonnull argument is assumed never 0.
     if (const Argument *A = dyn_cast<Argument>(V)) {
@@ -3939,10 +3967,16 @@ std::tuple<Value *, FPClassTest, FPClassTest>
 llvm::fcmpImpliesClass(CmpInst::Predicate Pred, const Function &F, Value *LHS,
                        FPClassTest RHSClass, bool LookThroughSrc) {
   assert(RHSClass != fcNone);
+  Value *Src = LHS;
+
+  if (Pred == FCmpInst::FCMP_TRUE)
+    return exactClass(Src, fcAllFlags);
+
+  if (Pred == FCmpInst::FCMP_FALSE)
+    return exactClass(Src, fcNone);
 
   const FPClassTest OrigClass = RHSClass;
 
-  Value *Src = LHS;
   const bool IsNegativeRHS = (RHSClass & fcNegative) == RHSClass;
   const bool IsPositiveRHS = (RHSClass & fcPositive) == RHSClass;
   const bool IsNaN = (RHSClass & ~fcNan) == fcNone;
@@ -3961,12 +3995,6 @@ llvm::fcmpImpliesClass(CmpInst::Predicate Pred, const Function &F, Value *LHS,
   if (Pred == FCmpInst::FCMP_UNO)
     return exactClass(Src, fcNan);
 
-  if (Pred == FCmpInst::FCMP_TRUE)
-    return exactClass(Src, fcAllFlags);
-
-  if (Pred == FCmpInst::FCMP_FALSE)
-    return exactClass(Src, fcNone);
-
   const bool IsFabs = LookThroughSrc && match(LHS, m_FAbs(m_Value(Src)));
   if (IsFabs)
     RHSClass = llvm::inverse_fabs(RHSClass);
@@ -8438,26 +8466,12 @@ isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS,
   }
 }
 
-/// Return true if the operands of two compares (expanded as "L0 pred L1" and
-/// "R0 pred R1") match. IsSwappedOps is true when the operands match, but are
-/// swapped.
-static bool areMatchingOperands(const Value *L0, const Value *L1, const Value *R0,
-                           const Value *R1, bool &AreSwappedOps) {
-  bool AreMatchingOps = (L0 == R0 && L1 == R1);
-  AreSwappedOps = (L0 == R1 && L1 == R0);
-  return AreMatchingOps || AreSwappedOps;
-}
-
 /// Return true if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is true.
 /// Return false if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is false.
 /// Otherwise, return std::nullopt if we can't infer anything.
 static std::optional<bool>
 isImpliedCondMatchingOperands(CmpInst::Predicate LPred,
-                              CmpInst::Predicate RPred, bool AreSwappedOps) {
-  // Canonicalize the predicate as if the operands were not commuted.
-  if (AreSwappedOps)
-    RPred = ICmpInst::getSwappedPredicate(RPred);
-
+                              CmpInst::Predicate RPred) {
   if (CmpInst::isImpliedTrueByMatchingCmp(LPred, RPred))
     return true;
   if (CmpInst::isImpliedFalseByMatchingCmp(LPred, RPred))
@@ -8499,6 +8513,26 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
   CmpInst::Predicate LPred =
       LHSIsTrue ? LHS->getPredicate() : LHS->getInversePredicate();
 
+  // We can have non-canonical operands, so try to normalize any common operand
+  // to L0/R0.
+  if (L0 == R1) {
+    std::swap(R0, R1);
+    RPred = ICmpInst::getSwappedPredicate(RPred);
+  }
+  if (R0 == L1) {
+    std::swap(L0, L1);
+    LPred = ICmpInst::getSwappedPredicate(LPred);
+  }
+  if (L1 == R1) {
+    // If we have L0 == R0 and L1 == R1, then make L1/R1 the constants.
+    if (L0 != R0 || match(L0, m_ImmConstant())) {
+      std::swap(L0, L1);
+      LPred = ICmpInst::getSwappedPredicate(LPred);
+      std::swap(R0, R1);
+      RPred = ICmpInst::getSwappedPredicate(RPred);
+    }
+  }
+
   // Can we infer anything when the 0-operands match and the 1-operands are
   // constants (not necessarily matching)?
   const APInt *LC, *RC;
@@ -8506,32 +8540,15 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
     return isImpliedCondCommonOperandWithConstants(LPred, *LC, RPred, *RC);
 
   // Can we infer anything when the two compares have matching operands?
-  bool AreSwappedOps;
-  if (areMatchingOperands(L0, L1, R0, R1, AreSwappedOps))
-    return isImpliedCondMatchingOperands(LPred, RPred, AreSwappedOps);
+  if (L0 == R0 && L1 == R1)
+    return isImpliedCondMatchingOperands(LPred, RPred);
 
   // L0 = R0 = L1 + R1, L0 >=u L1 implies R0 >=u R1, L0 <u L1 implies R0 <u R1
-  if (ICmpInst::isUnsigned(LPred) && ICmpInst::isUnsigned(RPred)) {
-    if (L0 == R1) {
-      std::swap(R0, R1);
-      RPred = ICmpInst::getSwappedPredicate(RPred);
-    }
-    if (L1 == R0) {
-      std::swap(L0, L1);
-      LPred = ICmpInst::getSwappedPredicate(LPred);
-    }
-    if (L1 == R1) {
-      std::swap(L0, L1);
-      LPred = ICmpInst::getSwappedPredicate(LPred);
-      std::swap(R0, R1);
-      RPred = ICmpInst::getSwappedPredicate(RPred);
-    }
-    if (L0 == R0 &&
-        (LPred == ICmpInst::ICMP_ULT || LPred == ICmpInst::ICMP_UGE) &&
-        (RPred == ICmpInst::ICMP_ULT || RPred == ICmpInst::ICMP_UGE) &&
-        match(L0, m_c_Add(m_Specific(L1), m_Specific(R1))))
-      return LPred == RPred;
-  }
+  if (L0 == R0 &&
+      (LPred == ICmpInst::ICMP_ULT || LPred == ICmpInst::ICMP_UGE) &&
+      (RPred == ICmpInst::ICMP_ULT || RPred == ICmpInst::ICMP_UGE) &&
+      match(L0, m_c_Add(m_Specific(L1), m_Specific(R1))))
+    return LPred == RPred;
 
   if (LPred == RPred)
     return isImpliedCondOperands(LPred, L0, L1, R0, R1, DL, Depth);
@@ -8589,6 +8606,10 @@ llvm::isImpliedCondition(const Value *LHS, CmpInst::Predicate RHSPred,
   assert(LHS->getType()->isIntOrIntVectorTy(1) &&
          "Expected integer type only!");
 
+  // Match not
+  if (match(LHS, m_Not(m_Value(LHS))))
+    LHSIsTrue = !LHSIsTrue;
+
   // Both LHS and RHS are icmps.
   const ICmpInst *LHSCmp = dyn_cast<ICmpInst>(LHS);
   if (LHSCmp)
@@ -8615,10 +8636,21 @@ std::optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
   if (LHS == RHS)
     return LHSIsTrue;
 
-  if (const ICmpInst *RHSCmp = dyn_cast<ICmpInst>(RHS))
-    return isImpliedCondition(LHS, RHSCmp->getPredicate(),
-                              RHSCmp->getOperand(0), RHSCmp->getOperand(1), DL,
-                              LHSIsTrue, Depth);
+  // Match not
+  bool InvertRHS = false;
+  if (match(RHS, m_Not(m_Value(RHS)))) {
+    if (LHS == RHS)
+      return !LHSIsTrue;
+    InvertRHS = true;
+  }
+
+  if (const ICmpInst *RHSCmp = dyn_cast<ICmpInst>(RHS)) {
+    if (auto Implied = isImpliedCondition(
+            LHS, RHSCmp->getPredicate(), RHSCmp->getOperand(0),
+            RHSCmp->getOperand(1), DL, LHSIsTrue, Depth))
+      return InvertRHS ? !*Implied : *Implied;
+    return std::nullopt;
+  }
 
   if (Depth == MaxAnalysisRecursionDepth)
     return std::nullopt;
@@ -8630,21 +8662,21 @@ std::optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
     if (std::optional<bool> Imp =
             isImpliedCondition(LHS, RHS1, DL, LHSIsTrue, Depth + 1))
       if (*Imp == true)
-        return true;
+        return !InvertRHS;
     if (std::optional<bool> Imp =
             isImpliedCondition(LHS, RHS2, DL, LHSIsTrue, Depth + 1))
       if (*Imp == true)
-        return true;
+        return !InvertRHS;
   }
   if (match(RHS, m_LogicalAnd(m_Value(RHS1), m_Value(RHS2)))) {
     if (std::optional<bool> Imp =
             isImpliedCondition(LHS, RHS1, DL, LHSIsTrue, Depth + 1))
       if (*Imp == false)
-        return false;
+        return InvertRHS;
     if (std::optional<bool> Imp =
             isImpliedCondition(LHS, RHS2, DL, LHSIsTrue, Depth + 1))
       if (*Imp == false)
-        return false;
+        return InvertRHS;
   }
 
   return std::nullopt;
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 2e0f5ba82220c..2c352be74931a 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -5184,7 +5184,11 @@ bool LLParser::parseDIStringType(MDNode *&Result, bool IsDistinct) {
 ///   ::= !DIDerivedType(tag: DW_TAG_pointer_type, name: "int", file: !0,
 ///                      line: 7, scope: !1, baseType: !2, size: 32,
 ///                      align: 32, offset: 0, flags: 0, extraData: !3,
-///                      dwarfAddressSpace: 3)
+///                      dwarfAddressSpace: 3, ptrAuthKey: 1,
+///                      ptrAuthIsAddressDiscriminated: true,
+///                      ptrAuthExtraDiscriminator: 0x1234,
+///                      ptrAuthIsaPointer: 1, ptrAuthAuthenticatesNullValues:1
+///                      )
 bool LLParser::parseDIDerivedType(MDNode *&Result, bool IsDistinct) {
 #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
   REQUIRED(tag, DwarfTagField, );                                              \
@@ -5199,19 +5203,30 @@ bool LLParser::parseDIDerivedType(MDNode *&Result, bool IsDistinct) {
   OPTIONAL(flags, DIFlagField, );                                              \
   OPTIONAL(extraData, MDField, );                                              \
   OPTIONAL(dwarfAddressSpace, MDUnsignedField, (UINT32_MAX, UINT32_MAX));      \
-  OPTIONAL(annotations, MDField, );
+  OPTIONAL(annotations, MDField, );                                            \
+  OPTIONAL(ptrAuthKey, MDUnsignedField, (0, 7));                               \
+  OPTIONAL(ptrAuthIsAddressDiscriminated, MDBoolField, );                      \
+  OPTIONAL(ptrAuthExtraDiscriminator, MDUnsignedField, (0, 0xffff));           \
+  OPTIONAL(ptrAuthIsaPointer, MDBoolField, );                                  \
+  OPTIONAL(ptrAuthAuthenticatesNullValues, MDBoolField, );
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
   std::optional<unsigned> DWARFAddressSpace;
   if (dwarfAddressSpace.Val != UINT32_MAX)
     DWARFAddressSpace = dwarfAddressSpace.Val;
+  std::optional<DIDerivedType::PtrAuthData> PtrAuthData;
+  if (ptrAuthKey.Val)
+    PtrAuthData.emplace(
+        (unsigned)ptrAuthKey.Val, ptrAuthIsAddressDiscriminated.Val,
+        (unsigned)ptrAuthExtraDiscriminator.Val, ptrAuthIsaPointer.Val,
+        ptrAuthAuthenticatesNullValues.Val);
 
   Result = GET_OR_DISTINCT(DIDerivedType,
                            (Context, tag.Val, name.Val, file.Val, line.Val,
                             scope.Val, baseType.Val, size.Val, align.Val,
-                            offset.Val, DWARFAddressSpace, flags.Val,
-                            extraData.Val, annotations.Val));
+                            offset.Val, DWARFAddressSpace, PtrAuthData,
+                            flags.Val, extraData.Val, annotations.Val));
   return false;
 }
 
@@ -6543,10 +6558,10 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
 ///                 (MDNode ',' Metadata ',' Metadata ',')? MDNode ')'
 bool LLParser::parseDebugRecord(DbgRecord *&DR, PerFunctionState &PFS) {
   using RecordKind = DbgRecord::Kind;
-  using LocType = DPValue::LocationType;
-  LocTy DPVLoc = Lex.getLoc();
+  using LocType = DbgVariableRecord::LocationType;
+  LocTy DVRLoc = Lex.getLoc();
   if (Lex.getKind() != lltok::DbgRecordType)
-    return error(DPVLoc, "expected debug record type here");
+    return error(DVRLoc, "expected debug record type here");
   RecordKind RecordType = StringSwitch<RecordKind>(Lex.getStrVal())
                               .Case("declare", RecordKind::ValueKind)
                               .Case("value", RecordKind::ValueKind)
@@ -6554,7 +6569,7 @@ bool LLParser::parseDebugRecord(DbgRecord *&DR, PerFunctionState &PFS) {
                               .Case("label", RecordKind::LabelKind);
 
   // Parsing labels is trivial; parse here and early exit, otherwise go into the
-  // full DPValue processing stage.
+  // full DbgVariableRecord processing stage.
   if (RecordType == RecordKind::LabelKind) {
     Lex.Lex();
     if (parseToken(lltok::lparen, "Expected '(' here"))
@@ -6634,9 +6649,9 @@ bool LLParser::parseDebugRecord(DbgRecord *&DR, PerFunctionState &PFS) {
 
   if (parseToken(lltok::rparen, "Expected ')' here"))
     return true;
-  DR = DPValue::createUnresolvedDPValue(ValueType, ValLocMD, Variable,
-                                        Expression, AssignID, AddressLocation,
-                                        AddressExpression, DebugLoc);
+  DR = DbgVariableRecord::createUnresolvedDbgVariableRecord(
+      ValueType, ValLocMD, Variable, Expression, AssignID, AddressLocation,
+      AddressExpression, DebugLoc);
   return false;
 }
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 7005011980ebc..c085c715179ba 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -270,6 +270,11 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FUNC_CODE, INST_CMPXCHG)
       STRINGIFY_CODE(FUNC_CODE, INST_CALLBR)
       STRINGIFY_CODE(FUNC_CODE, BLOCKADDR_USERS)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_DECLARE)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_VALUE)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_ASSIGN)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_VALUE_SIMPLE)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_LABEL)
     }
   case bitc::VALUE_SYMTAB_BLOCK_ID:
     switch (CodeID) {
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 9c63116114f3c..8261084323834 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -100,8 +100,14 @@ static cl::opt<bool> ExpandConstantExprs(
     cl::desc(
         "Expand constant expressions to instructions for testing purposes"));
 
-// Declare external flag for whether we're using the new debug-info format.
-extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+/// Load bitcode directly into RemoveDIs format (use debug records instead
+/// of debug intrinsics). UNSET is treated as FALSE, so the default action
+/// is to do nothing. Individual tools can override this to incrementally add
+/// support for the RemoveDIs format.
+cl::opt<cl::boolOrDefault> LoadBitcodeIntoNewDbgInforFormat(
+    "load-bitcode-into-experimental-debuginfo-iterators", cl::Hidden,
+    cl::desc("Load bitcode directly into the new debug info format (regardless "
+             "of input format)"));
 
 namespace {
 
@@ -4279,6 +4285,12 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
 Error BitcodeReader::parseModule(uint64_t ResumeBit,
                                  bool ShouldLazyLoadMetadata,
                                  ParserCallbacks Callbacks) {
+  // Load directly into RemoveDIs format if LoadBitcodeIntoNewDbgInforFormat
+  // has been set to true (default action: load into the old debug format).
+  TheModule->IsNewDbgInfoFormat =
+      UseNewDbgInfoFormat &&
+      LoadBitcodeIntoNewDbgInforFormat == cl::boolOrDefault::BOU_TRUE;
+
   this->ValueTypeCallback = std::move(Callbacks.ValueType);
   if (ResumeBit) {
     if (Error JumpFailed = Stream.JumpToBit(ResumeBit))
@@ -6398,6 +6410,91 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       InstructionList.push_back(I);
       break;
     }
+    case bitc::FUNC_CODE_DEBUG_RECORD_LABEL: {
+      // DPLabels are placed after the Instructions that they are attached to.
+      Instruction *Inst = getLastInstruction();
+      if (!Inst)
+        return error("Invalid dbg record: missing instruction");
+      DILocation *DIL = cast<DILocation>(getFnMetadataByID(Record[0]));
+      DILabel *Label = cast<DILabel>(getFnMetadataByID(Record[1]));
+      Inst->getParent()->insertDbgRecordBefore(
+          new DPLabel(Label, DebugLoc(DIL)), Inst->getIterator());
+      continue; // This isn't an instruction.
+    }
+    case bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE:
+    case bitc::FUNC_CODE_DEBUG_RECORD_VALUE:
+    case bitc::FUNC_CODE_DEBUG_RECORD_DECLARE:
+    case bitc::FUNC_CODE_DEBUG_RECORD_ASSIGN: {
+      // DbgVariableRecords are placed after the Instructions that they are
+      // attached to.
+      Instruction *Inst = getLastInstruction();
+      if (!Inst)
+        return error("Invalid dbg record: missing instruction");
+
+      // First 3 fields are common to all kinds:
+      //   DILocation, DILocalVariable, DIExpression
+      // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE)
+      //   ..., LocationMetadata
+      // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE - abbrev'd)
+      //   ..., Value
+      // dbg_declare (FUNC_CODE_DEBUG_RECORD_DECLARE)
+      //   ..., LocationMetadata
+      // dbg_assign (FUNC_CODE_DEBUG_RECORD_ASSIGN)
+      //   ..., LocationMetadata, DIAssignID, DIExpression, LocationMetadata
+      unsigned Slot = 0;
+      // Common fields (0-2).
+      DILocation *DIL = cast<DILocation>(getFnMetadataByID(Record[Slot++]));
+      DILocalVariable *Var =
+          cast<DILocalVariable>(getFnMetadataByID(Record[Slot++]));
+      DIExpression *Expr =
+          cast<DIExpression>(getFnMetadataByID(Record[Slot++]));
+
+      // Union field (3: LocationMetadata | Value).
+      Metadata *RawLocation = nullptr;
+      if (BitCode == bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE) {
+        Value *V = nullptr;
+        unsigned TyID = 0;
+        // We never expect to see a fwd reference value here because
+        // use-before-defs are encoded with the standard non-abbrev record
+        // type (they'd require encoding the type too, and they're rare). As a
+        // result, getValueTypePair only ever increments Slot by one here (once
+        // for the value, never twice for value and type).
+        unsigned SlotBefore = Slot;
+        if (getValueTypePair(Record, Slot, NextValueNo, V, TyID, CurBB))
+          return error("Invalid dbg record: invalid value");
+        (void)SlotBefore;
+        assert((SlotBefore == Slot - 1) && "unexpected fwd ref");
+        RawLocation = ValueAsMetadata::get(V);
+      } else {
+        RawLocation = getFnMetadataByID(Record[Slot++]);
+      }
+
+      DbgVariableRecord *DVR = nullptr;
+      switch (BitCode) {
+      case bitc::FUNC_CODE_DEBUG_RECORD_VALUE:
+      case bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE:
+        DVR = new DbgVariableRecord(RawLocation, Var, Expr, DIL,
+                                    DbgVariableRecord::LocationType::Value);
+        break;
+      case bitc::FUNC_CODE_DEBUG_RECORD_DECLARE:
+        DVR = new DbgVariableRecord(RawLocation, Var, Expr, DIL,
+                                    DbgVariableRecord::LocationType::Declare);
+        break;
+      case bitc::FUNC_CODE_DEBUG_RECORD_ASSIGN: {
+        DIAssignID *ID = cast<DIAssignID>(getFnMetadataByID(Record[Slot++]));
+        DIExpression *AddrExpr =
+            cast<DIExpression>(getFnMetadataByID(Record[Slot++]));
+        Metadata *Addr = getFnMetadataByID(Record[Slot++]);
+        DVR = new DbgVariableRecord(RawLocation, Var, Expr, ID, Addr, AddrExpr,
+                                    DIL);
+        break;
+      }
+      default:
+        llvm_unreachable("Unknown DbgVariableRecord bitcode");
+      }
+      Inst->getParent()->insertDbgRecordBefore(DVR, Inst->getIterator());
+      continue; // This isn't an instruction.
+    }
     case bitc::FUNC_CODE_INST_CALL: {
       // CALL: [paramattrs, cc, fmf, fnty, fnid, arg0, arg1...]
       if (Record.size() < 3)
@@ -6677,10 +6774,22 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
   // Move the bit stream to the saved position of the deferred function body.
   if (Error JumpFailed = Stream.JumpToBit(DFII->second))
     return JumpFailed;
+
+  // Set the debug info mode to "new", possibly creating a mismatch between
+  // module and function debug modes. This is okay because we'll convert
+  // everything back to the old mode after parsing if needed.
+  // FIXME: Remove this once all tools support RemoveDIs.
+  F->IsNewDbgInfoFormat = true;
+
   if (Error Err = parseFunctionBody(F))
     return Err;
   F->setIsMaterializable(false);
 
+  // Convert new debug info records into intrinsics.
+  // FIXME: Remove this once all tools support RemoveDIs.
+  if (!F->getParent()->IsNewDbgInfoFormat)
+    F->convertFromNewDbgValues();
+
   if (StripDebugInfo)
     stripDebugInfo(*F);
 
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 770eb83af17f9..9102f3a60cffc 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -609,17 +609,25 @@ class MetadataLoader::MetadataLoaderImpl {
     if (!NeedDeclareExpressionUpgrade)
       return;
 
+    auto UpdateDeclareIfNeeded = [&](auto *Declare) {
+      auto *DIExpr = Declare->getExpression();
+      if (!DIExpr || !DIExpr->startsWithDeref() ||
+          !isa_and_nonnull<Argument>(Declare->getAddress()))
+        return;
+      SmallVector<uint64_t, 8> Ops;
+      Ops.append(std::next(DIExpr->elements_begin()), DIExpr->elements_end());
+      Declare->setExpression(DIExpression::get(Context, Ops));
+    };
+
     for (auto &BB : F)
-      for (auto &I : BB)
+      for (auto &I : BB) {
+        for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+          if (DVR.isDbgDeclare())
+            UpdateDeclareIfNeeded(&DVR);
+        }
         if (auto *DDI = dyn_cast<DbgDeclareInst>(&I))
-          if (auto *DIExpr = DDI->getExpression())
-            if (DIExpr->startsWithDeref() &&
-                isa_and_nonnull<Argument>(DDI->getAddress())) {
-              SmallVector<uint64_t, 8> Ops;
-              Ops.append(std::next(DIExpr->elements_begin()),
-                         DIExpr->elements_end());
-              DDI->setExpression(DIExpression::get(Context, Ops));
-            }
+          UpdateDeclareIfNeeded(DDI);
+      }
   }
 
   /// Upgrade the expression from previous versions.
@@ -1556,7 +1564,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     break;
   }
   case bitc::METADATA_DERIVED_TYPE: {
-    if (Record.size() < 12 || Record.size() > 14)
+    if (Record.size() < 12 || Record.size() > 15)
       return error("Invalid record");
 
     // DWARF address space is encoded as N->getDWARFAddressSpace() + 1. 0 means
@@ -1566,8 +1574,17 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
       DWARFAddressSpace = Record[12] - 1;
 
     Metadata *Annotations = nullptr;
-    if (Record.size() > 13 && Record[13])
-      Annotations = getMDOrNull(Record[13]);
+    std::optional<DIDerivedType::PtrAuthData> PtrAuthData;
+
+    // Only look for annotations/ptrauth if both are allocated.
+    // If not, we can't tell which was intended to be embedded, as both ptrauth
+    // and annotations have been expected at Record[13] at various times.
+    if (Record.size() > 14) {
+      if (Record[13])
+        Annotations = getMDOrNull(Record[13]);
+      if (Record[14])
+        PtrAuthData.emplace(Record[14]);
+    }
 
     IsDistinct = Record[0];
     DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
@@ -1577,7 +1594,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                          getMDOrNull(Record[3]), Record[4],
                          getDITypeRefOrNull(Record[5]),
                          getDITypeRefOrNull(Record[6]), Record[7], Record[8],
-                         Record[9], DWARFAddressSpace, Flags,
+                         Record[9], DWARFAddressSpace, PtrAuthData, Flags,
                          getDITypeRefOrNull(Record[11]), Annotations)),
         NextMetadataNo);
     NextMetadataNo++;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 597f49332fad2..fd211f74f07c5 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -99,6 +99,9 @@ namespace llvm {
 extern FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold;
 }
 
+extern bool WriteNewDbgInfoFormatToBitcode;
+extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+
 namespace {
 
 /// These are manifest constants used by the bitcode writer. They do not need to
@@ -128,6 +131,7 @@ enum {
   FUNCTION_INST_RET_VAL_ABBREV,
   FUNCTION_INST_UNREACHABLE_ABBREV,
   FUNCTION_INST_GEP_ABBREV,
+  FUNCTION_DEBUG_RECORD_VALUE_ABBREV,
 };
 
 /// Abstract class to manage the bitcode writing, subclassed for each bitcode
@@ -1825,6 +1829,11 @@ void ModuleBitcodeWriter::writeDIDerivedType(const DIDerivedType *N,
 
   Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get()));
 
+  if (auto PtrAuthData = N->getPtrAuthData())
+    Record.push_back(PtrAuthData->RawData);
+  else
+    Record.push_back(0);
+
   Stream.EmitRecord(bitc::METADATA_DERIVED_TYPE, Record, Abbrev);
   Record.clear();
 }
@@ -3512,25 +3521,96 @@ void ModuleBitcodeWriter::writeFunction(
       NeedsMetadataAttachment |= I.hasMetadataOtherThanDebugLoc();
 
       // If the instruction has a debug location, emit it.
-      DILocation *DL = I.getDebugLoc();
-      if (!DL)
-        continue;
-
-      if (DL == LastDL) {
-        // Just repeat the same debug loc as last time.
-        Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC_AGAIN, Vals);
-        continue;
+      if (DILocation *DL = I.getDebugLoc()) {
+        if (DL == LastDL) {
+          // Just repeat the same debug loc as last time.
+          Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC_AGAIN, Vals);
+        } else {
+          Vals.push_back(DL->getLine());
+          Vals.push_back(DL->getColumn());
+          Vals.push_back(VE.getMetadataOrNullID(DL->getScope()));
+          Vals.push_back(VE.getMetadataOrNullID(DL->getInlinedAt()));
+          Vals.push_back(DL->isImplicitCode());
+          Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals);
+          Vals.clear();
+          LastDL = DL;
+        }
       }
 
-      Vals.push_back(DL->getLine());
-      Vals.push_back(DL->getColumn());
-      Vals.push_back(VE.getMetadataOrNullID(DL->getScope()));
-      Vals.push_back(VE.getMetadataOrNullID(DL->getInlinedAt()));
-      Vals.push_back(DL->isImplicitCode());
-      Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals);
-      Vals.clear();
-
-      LastDL = DL;
+      // If the instruction has DbgRecords attached to it, emit them. Note that
+      // they come after the instruction so that it's easy to attach them again
+      // when reading the bitcode, even though conceptually the debug locations
+      // start "before" the instruction.
+      if (I.hasDbgRecords() && WriteNewDbgInfoFormatToBitcode) {
+        /// Try to push the value only (unwrapped), otherwise push the
+        /// metadata wrapped value. Returns true if the value was pushed
+        /// without the ValueAsMetadata wrapper.
+        auto PushValueOrMetadata = [&Vals, InstID,
+                                    this](Metadata *RawLocation) {
+          assert(RawLocation &&
+                 "RawLocation unexpectedly null in DbgVariableRecord");
+          if (ValueAsMetadata *VAM = dyn_cast<ValueAsMetadata>(RawLocation)) {
+            SmallVector<unsigned, 2> ValAndType;
+            // If the value is a fwd-ref the type is also pushed. We don't
+            // want the type, so fwd-refs are kept wrapped (pushValueAndType
+            // returns false if the value is pushed without type).
+            if (!pushValueAndType(VAM->getValue(), InstID, ValAndType)) {
+              Vals.push_back(ValAndType[0]);
+              return true;
+            }
+          }
+          // The metadata is a DIArgList, or ValueAsMetadata wrapping a
+          // fwd-ref. Push the metadata ID.
+          Vals.push_back(VE.getMetadataID(RawLocation));
+          return false;
+        };
+
+        // Write out non-instruction debug information attached to this
+        // instruction. Write it after the instruction so that it's easy to
+        // re-attach to the instruction reading the records in.
+        for (DbgRecord &DR : I.DbgMarker->getDbgRecordRange()) {
+          if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+            Vals.push_back(VE.getMetadataID(&*DPL->getDebugLoc()));
+            Vals.push_back(VE.getMetadataID(DPL->getLabel()));
+            Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_LABEL, Vals);
+            Vals.clear();
+            continue;
+          }
+
+          // First 3 fields are common to all kinds:
+          //   DILocation, DILocalVariable, DIExpression
+          // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE)
+          //   ..., LocationMetadata
+          // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE - abbrev'd)
+          //   ..., Value
+          // dbg_declare (FUNC_CODE_DEBUG_RECORD_DECLARE)
+          //   ..., LocationMetadata
+          // dbg_assign (FUNC_CODE_DEBUG_RECORD_ASSIGN)
+          //   ..., LocationMetadata, DIAssignID, DIExpression, LocationMetadata
+          DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
+          Vals.push_back(VE.getMetadataID(&*DVR.getDebugLoc()));
+          Vals.push_back(VE.getMetadataID(DVR.getVariable()));
+          Vals.push_back(VE.getMetadataID(DVR.getExpression()));
+          if (DVR.isDbgValue()) {
+            if (PushValueOrMetadata(DVR.getRawLocation()))
+              Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE, Vals,
+                                FUNCTION_DEBUG_RECORD_VALUE_ABBREV);
+            else
+              Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_VALUE, Vals);
+          } else if (DVR.isDbgDeclare()) {
+            Vals.push_back(VE.getMetadataID(DVR.getRawLocation()));
+            Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_DECLARE, Vals);
+          } else {
+            assert(DVR.isDbgAssign() && "Unexpected DbgRecord kind");
+            Vals.push_back(VE.getMetadataID(DVR.getRawLocation()));
+            Vals.push_back(VE.getMetadataID(DVR.getAssignID()));
+            Vals.push_back(VE.getMetadataID(DVR.getAddressExpression()));
+            Vals.push_back(VE.getMetadataID(DVR.getRawAddress()));
+            Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_ASSIGN, Vals);
+          }
+          Vals.clear();
+        }
+      }
     }
 
     if (BlockAddress *BA = BlockAddress::lookup(&BB)) {
@@ -3771,7 +3851,17 @@ void ModuleBitcodeWriter::writeBlockInfo() {
         FUNCTION_INST_GEP_ABBREV)
       llvm_unreachable("Unexpected abbrev ordering!");
   }
-
+  {
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // dbgloc
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // var
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // expr
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // val
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+        FUNCTION_DEBUG_RECORD_VALUE_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering! 1");
+  }
   Stream.ExitBlock();
 }
 
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index 93fb2a821dee7..de2396f31f666 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -18,11 +18,12 @@
 #include "llvm/Pass.h"
 using namespace llvm;
 
+extern bool WriteNewDbgInfoFormatToBitcode;
+
 PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
-  // RemoveDIs: there's no bitcode representation of the DPValue debug-info,
-  // convert to dbg.values before writing out.
-  bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
-  if (IsNewDbgInfoFormat)
+  bool ConvertToOldDbgFormatForWrite =
+      M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
+  if (ConvertToOldDbgFormatForWrite)
     M.convertFromNewDbgValues();
 
   const ModuleSummaryIndex *Index =
@@ -30,7 +31,7 @@ PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
                        : nullptr;
   WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, Index, EmitModuleHash);
 
-  if (IsNewDbgInfoFormat)
+  if (ConvertToOldDbgFormatForWrite)
     M.convertToNewDbgValues();
 
   return PreservedAnalyses::all();
@@ -56,16 +57,15 @@ namespace {
     StringRef getPassName() const override { return "Bitcode Writer"; }
 
     bool runOnModule(Module &M) override {
-      // RemoveDIs: there's no bitcode representation of the DPValue debug-info,
-      // convert to dbg.values before writing out.
-      bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
-      if (IsNewDbgInfoFormat)
+      bool ConvertToOldDbgFormatForWrite =
+          M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
+      if (ConvertToOldDbgFormatForWrite)
         M.convertFromNewDbgValues();
 
       WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, /*Index=*/nullptr,
                          /*EmitModuleHash=*/false);
 
-      if (IsNewDbgInfoFormat)
+      if (ConvertToOldDbgFormatForWrite)
         M.convertToNewDbgValues();
       return false;
     }
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index fccb2a606f7ed..3209dca253feb 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -134,20 +134,28 @@ static OrderMap orderModule(const Module &M) {
     // Metadata used by instructions is decoded before the actual instructions,
     // so visit any constants used by it beforehand.
     for (const BasicBlock &BB : F)
-      for (const Instruction &I : BB)
-        for (const Value *V : I.operands()) {
-          if (const auto *MAV = dyn_cast<MetadataAsValue>(V)) {
-            if (const auto *VAM =
-                    dyn_cast<ValueAsMetadata>(MAV->getMetadata())) {
+      for (const Instruction &I : BB) {
+        auto OrderConstantFromMetadata = [&](Metadata *MD) {
+          if (const auto *VAM = dyn_cast<ValueAsMetadata>(MD)) {
+            orderConstantValue(VAM->getValue());
+          } else if (const auto *AL = dyn_cast<DIArgList>(MD)) {
+            for (const auto *VAM : AL->getArgs())
               orderConstantValue(VAM->getValue());
-            } else if (const auto *AL =
-                           dyn_cast<DIArgList>(MAV->getMetadata())) {
-              for (const auto *VAM : AL->getArgs())
-                orderConstantValue(VAM->getValue());
-            }
           }
+        };
+
+        for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+          OrderConstantFromMetadata(DVR.getRawLocation());
+          if (DVR.isDbgAssign())
+            OrderConstantFromMetadata(DVR.getRawAddress());
         }
 
+        for (const Value *V : I.operands()) {
+          if (const auto *MAV = dyn_cast<MetadataAsValue>(V))
+            OrderConstantFromMetadata(MAV->getMetadata());
+        }
+      }
+
     for (const Argument &A : F.args())
       orderValue(&A, OM);
     for (const BasicBlock &BB : F)
@@ -261,33 +269,39 @@ static UseListOrderStack predictUseListOrder(const Module &M) {
   // constants in the last Function they're used in.  Module-level constants
   // have already been visited above.
   for (const Function &F : llvm::reverse(M)) {
+    auto PredictValueOrderFromMetadata = [&](Metadata *MD) {
+      if (const auto *VAM = dyn_cast<ValueAsMetadata>(MD)) {
+        predictValueUseListOrder(VAM->getValue(), &F, OM, Stack);
+      } else if (const auto *AL = dyn_cast<DIArgList>(MD)) {
+        for (const auto *VAM : AL->getArgs())
+          predictValueUseListOrder(VAM->getValue(), &F, OM, Stack);
+      }
+    };
     if (F.isDeclaration())
       continue;
     for (const BasicBlock &BB : F)
       predictValueUseListOrder(&BB, &F, OM, Stack);
     for (const Argument &A : F.args())
       predictValueUseListOrder(&A, &F, OM, Stack);
-    for (const BasicBlock &BB : F)
+    for (const BasicBlock &BB : F) {
       for (const Instruction &I : BB) {
+        for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+          PredictValueOrderFromMetadata(DVR.getRawLocation());
+          if (DVR.isDbgAssign())
+            PredictValueOrderFromMetadata(DVR.getRawAddress());
+        }
         for (const Value *Op : I.operands()) {
           if (isa<Constant>(*Op) || isa<InlineAsm>(*Op)) // Visit GlobalValues.
             predictValueUseListOrder(Op, &F, OM, Stack);
-          if (const auto *MAV = dyn_cast<MetadataAsValue>(Op)) {
-            if (const auto *VAM =
-                    dyn_cast<ValueAsMetadata>(MAV->getMetadata())) {
-              predictValueUseListOrder(VAM->getValue(), &F, OM, Stack);
-            } else if (const auto *AL =
-                           dyn_cast<DIArgList>(MAV->getMetadata())) {
-              for (const auto *VAM : AL->getArgs())
-                predictValueUseListOrder(VAM->getValue(), &F, OM, Stack);
-            }
-          }
+          if (const auto *MAV = dyn_cast<MetadataAsValue>(Op))
+            PredictValueOrderFromMetadata(MAV->getMetadata());
         }
         if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
           predictValueUseListOrder(SVI->getShuffleMaskForBitcode(), &F, OM,
                                    Stack);
         predictValueUseListOrder(&I, &F, OM, Stack);
       }
+    }
   }
 
   // Visit globals last, since the module-level use-list block will be seen
@@ -409,6 +423,41 @@ ValueEnumerator::ValueEnumerator(const Module &M,
 
     for (const BasicBlock &BB : F)
       for (const Instruction &I : BB) {
+        // Local metadata is enumerated during function-incorporation, but
+        // any ConstantAsMetadata arguments in a DIArgList should be examined
+        // now.
+        auto EnumerateNonLocalValuesFromMetadata = [&](Metadata *MD) {
+          assert(MD && "Metadata unexpectedly null");
+          if (const auto *AL = dyn_cast<DIArgList>(MD)) {
+            for (const auto *VAM : AL->getArgs()) {
+              if (isa<ConstantAsMetadata>(VAM))
+                EnumerateMetadata(&F, VAM);
+            }
+            return;
+          }
+
+          if (!isa<LocalAsMetadata>(MD))
+            EnumerateMetadata(&F, MD);
+        };
+
+        for (DbgRecord &DR : I.getDbgRecordRange()) {
+          if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+            EnumerateMetadata(&F, DPL->getLabel());
+            EnumerateMetadata(&F, &*DPL->getDebugLoc());
+            continue;
+          }
+          // Enumerate non-local location metadata.
+          DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
+          EnumerateNonLocalValuesFromMetadata(DVR.getRawLocation());
+          EnumerateMetadata(&F, DVR.getExpression());
+          EnumerateMetadata(&F, DVR.getVariable());
+          EnumerateMetadata(&F, &*DVR.getDebugLoc());
+          if (DVR.isDbgAssign()) {
+            EnumerateNonLocalValuesFromMetadata(DVR.getRawAddress());
+            EnumerateMetadata(&F, DVR.getAssignID());
+            EnumerateMetadata(&F, DVR.getAddressExpression());
+          }
+        }
         for (const Use &Op : I.operands()) {
           auto *MD = dyn_cast<MetadataAsValue>(&Op);
           if (!MD) {
@@ -416,19 +465,7 @@ ValueEnumerator::ValueEnumerator(const Module &M,
             continue;
           }
 
-          // Local metadata is enumerated during function-incorporation, but
-          // any ConstantAsMetadata arguments in a DIArgList should be examined
-          // now.
-          if (isa<LocalAsMetadata>(MD->getMetadata()))
-            continue;
-          if (auto *AL = dyn_cast<DIArgList>(MD->getMetadata())) {
-            for (auto *VAM : AL->getArgs())
-              if (isa<ConstantAsMetadata>(VAM))
-                EnumerateMetadata(&F, VAM);
-            continue;
-          }
-
-          EnumerateMetadata(&F, MD->getMetadata());
+          EnumerateNonLocalValuesFromMetadata(MD->getMetadata());
         }
         if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
           EnumerateType(SVI->getShuffleMaskForBitcode()->getType());
@@ -1064,27 +1101,43 @@ void ValueEnumerator::incorporateFunction(const Function &F) {
 
   SmallVector<LocalAsMetadata *, 8> FnLocalMDVector;
   SmallVector<DIArgList *, 8> ArgListMDVector;
+
+  auto AddFnLocalMetadata = [&](Metadata *MD) {
+    if (!MD)
+      return;
+    if (auto *Local = dyn_cast<LocalAsMetadata>(MD)) {
+      // Enumerate metadata after the instructions they might refer to.
+      FnLocalMDVector.push_back(Local);
+    } else if (auto *ArgList = dyn_cast<DIArgList>(MD)) {
+      ArgListMDVector.push_back(ArgList);
+      for (ValueAsMetadata *VMD : ArgList->getArgs()) {
+        if (auto *Local = dyn_cast<LocalAsMetadata>(VMD)) {
+          // Enumerate metadata after the instructions they might refer
+          // to.
+          FnLocalMDVector.push_back(Local);
+        }
+      }
+    }
+  };
+
   // Add all of the instructions.
   for (const BasicBlock &BB : F) {
     for (const Instruction &I : BB) {
       for (const Use &OI : I.operands()) {
-        if (auto *MD = dyn_cast<MetadataAsValue>(&OI)) {
-          if (auto *Local = dyn_cast<LocalAsMetadata>(MD->getMetadata())) {
-            // Enumerate metadata after the instructions they might refer to.
-            FnLocalMDVector.push_back(Local);
-          } else if (auto *ArgList = dyn_cast<DIArgList>(MD->getMetadata())) {
-            ArgListMDVector.push_back(ArgList);
-            for (ValueAsMetadata *VMD : ArgList->getArgs()) {
-              if (auto *Local = dyn_cast<LocalAsMetadata>(VMD)) {
-                // Enumerate metadata after the instructions they might refer
-                // to.
-                FnLocalMDVector.push_back(Local);
-              }
-            }
-          }
+        if (auto *MD = dyn_cast<MetadataAsValue>(&OI))
+          AddFnLocalMetadata(MD->getMetadata());
+      }
+      /// RemoveDIs: Add non-instruction function-local metadata uses.
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        assert(DVR.getRawLocation() &&
+               "DbgVariableRecord location unexpectedly null");
+        AddFnLocalMetadata(DVR.getRawLocation());
+        if (DVR.isDbgAssign()) {
+          assert(DVR.getRawAddress() &&
+                 "DbgVariableRecord location unexpectedly null");
+          AddFnLocalMetadata(DVR.getRawAddress());
         }
       }
-
       if (!I.getType()->isVoidTy())
         EnumerateValue(&I);
     }
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 0efe7a0e73367..a15538755d73b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -538,8 +538,10 @@ bool AsmPrinter::doInitialization(Module &M) {
   if (!M.getModuleInlineAsm().empty()) {
     OutStreamer->AddComment("Start of file scope inline assembly");
     OutStreamer->addBlankLine();
-    emitInlineAsm(M.getModuleInlineAsm() + "\n", *TM.getMCSubtargetInfo(),
-                  TM.Options.MCOptions);
+    emitInlineAsm(
+        M.getModuleInlineAsm() + "\n", *TM.getMCSubtargetInfo(),
+        TM.Options.MCOptions, nullptr,
+        InlineAsm::AsmDialect(TM.getMCAsmInfo()->getAssemblerDialect()));
     OutStreamer->AddComment("End of file scope inline assembly");
     OutStreamer->addBlankLine();
   }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index d462859e48946..c40beeeb925e0 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -803,6 +803,18 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
   if (DTy->getDWARFAddressSpace())
     addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4,
             *DTy->getDWARFAddressSpace());
+  if (auto PtrAuthData = DTy->getPtrAuthData()) {
+    addUInt(Buffer, dwarf::DW_AT_LLVM_ptrauth_key, dwarf::DW_FORM_data1,
+            PtrAuthData->key());
+    if (PtrAuthData->isAddressDiscriminated())
+      addFlag(Buffer, dwarf::DW_AT_LLVM_ptrauth_address_discriminated);
+    addUInt(Buffer, dwarf::DW_AT_LLVM_ptrauth_extra_discriminator,
+            dwarf::DW_FORM_data2, PtrAuthData->extraDiscriminator());
+    if (PtrAuthData->isaPointer())
+      addFlag(Buffer, dwarf::DW_AT_LLVM_ptrauth_isa_pointer);
+    if (PtrAuthData->authenticatesNullValues())
+      addFlag(Buffer, dwarf::DW_AT_LLVM_ptrauth_authenticates_null_values);
+  }
 }
 
 void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index a4b819a735c64..09177950fc824 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -215,22 +215,24 @@ void FunctionVarLocs::init(FunctionVarLocsBuilder &Builder) {
 
   // Insert a contiguous block of VarLocInfos for each instruction, mapping it
   // to the start and end position in the vector with VarLocsBeforeInst. This
-  // block includes VarLocs for any DPValues attached to that instruction.
+  // block includes VarLocs for any DbgVariableRecords attached to that
+  // instruction.
   for (auto &P : Builder.VarLocsBeforeInst) {
-    // Process VarLocs attached to a DPValue alongside their marker Instruction.
+    // Process VarLocs attached to a DbgRecord alongside their marker
+    // Instruction.
     if (isa<const DbgRecord *>(P.first))
       continue;
     const Instruction *I = cast<const Instruction *>(P.first);
     unsigned BlockStart = VarLocRecords.size();
-    // Any VarLocInfos attached to a DPValue should now be remapped to their
-    // marker Instruction, in order of DPValue appearance and prior to any
+    // Any VarLocInfos attached to a DbgRecord should now be remapped to their
+    // marker Instruction, in order of DbgRecord appearance and prior to any
     // VarLocInfos attached directly to that instruction.
-    for (const DPValue &DPV : DPValue::filter(I->getDbgRecordRange())) {
-      // Even though DPV defines a variable location, VarLocsBeforeInst can
+    for (const DbgVariableRecord &DVR : filterDbgVars(I->getDbgRecordRange())) {
+      // Even though DVR defines a variable location, VarLocsBeforeInst can
       // still be empty if that VarLoc was redundant.
-      if (!Builder.VarLocsBeforeInst.count(&DPV))
+      if (!Builder.VarLocsBeforeInst.count(&DVR))
         continue;
-      for (const VarLocInfo &VarLoc : Builder.VarLocsBeforeInst[&DPV])
+      for (const VarLocInfo &VarLoc : Builder.VarLocsBeforeInst[&DVR])
         VarLocRecords.emplace_back(VarLoc);
     }
     for (const VarLocInfo &VarLoc : P.second)
@@ -829,10 +831,10 @@ class MemLocFragmentFill {
   void process(BasicBlock &BB, VarFragMap &LiveSet) {
     BBInsertBeforeMap[&BB].clear();
     for (auto &I : BB) {
-      for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange())) {
-        if (const auto *Locs = FnVarLocs->getWedge(&DPV)) {
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        if (const auto *Locs = FnVarLocs->getWedge(&DVR)) {
           for (const VarLocInfo &Loc : *Locs) {
-            addDef(Loc, &DPV, *I.getParent(), LiveSet);
+            addDef(Loc, &DVR, *I.getParent(), LiveSet);
           }
         }
       }
@@ -1026,7 +1028,7 @@ class AssignmentTrackingLowering {
   /// i.e. for all values x and y where x != y:
   /// join(x, x) = x
   /// join(x, y) = NoneOrPhi
-  using AssignRecord = PointerUnion<DbgAssignIntrinsic *, DPValue *>;
+  using AssignRecord = PointerUnion<DbgAssignIntrinsic *, DbgVariableRecord *>;
   struct Assignment {
     enum S { Known, NoneOrPhi } Status;
     /// ID of the assignment. nullptr if Status is not Known.
@@ -1053,16 +1055,16 @@ class AssignmentTrackingLowering {
       else if (isa<DbgAssignIntrinsic *>(Source))
         OS << Source.get<DbgAssignIntrinsic *>();
       else
-        OS << Source.get<DPValue *>();
+        OS << Source.get<DbgVariableRecord *>();
       OS << ")";
     }
 
     static Assignment make(DIAssignID *ID, DbgAssignIntrinsic *Source) {
       return Assignment(Known, ID, Source);
     }
-    static Assignment make(DIAssignID *ID, DPValue *Source) {
+    static Assignment make(DIAssignID *ID, DbgVariableRecord *Source) {
       assert(Source->isDbgAssign() &&
-             "Cannot make an assignment from a non-assign DPValue");
+             "Cannot make an assignment from a non-assign DbgVariableRecord");
       return Assignment(Known, ID, Source);
     }
     static Assignment make(DIAssignID *ID, AssignRecord Source) {
@@ -1083,7 +1085,7 @@ class AssignmentTrackingLowering {
       // If the Status is Known then we expect there to be an assignment ID.
       assert(Status == NoneOrPhi || ID);
     }
-    Assignment(S Status, DIAssignID *ID, DPValue *Source)
+    Assignment(S Status, DIAssignID *ID, DbgVariableRecord *Source)
         : Status(Status), ID(ID), Source(Source) {
       // If the Status is Known then we expect there to be an assignment ID.
       assert(Status == NoneOrPhi || ID);
@@ -1118,10 +1120,10 @@ class AssignmentTrackingLowering {
   /// Clear the location definitions currently cached for insertion after /p
   /// After.
   void resetInsertionPoint(Instruction &After);
-  void resetInsertionPoint(DPValue &After);
+  void resetInsertionPoint(DbgVariableRecord &After);
 
   // emitDbgValue can be called with:
-  //   Source=[AssignRecord|DbgValueInst*|DbgAssignIntrinsic*|DPValue*]
+  //   Source=[AssignRecord|DbgValueInst*|DbgAssignIntrinsic*|DbgVariableRecord*]
   // Since AssignRecord can be cast to one of the latter two types, and all
   // other types have a shared interface, we use a template to handle the latter
   // three types, and an explicit overload for AssignRecord that forwards to
@@ -1354,9 +1356,10 @@ class AssignmentTrackingLowering {
   /// attachment, \p I.
   void processUntaggedInstruction(Instruction &I, BlockInfo *LiveSet);
   void processDbgAssign(AssignRecord Assign, BlockInfo *LiveSet);
-  void processDPValue(DPValue &DPV, BlockInfo *LiveSet);
-  void processDbgValue(PointerUnion<DbgValueInst *, DPValue *> DbgValueRecord,
-                       BlockInfo *LiveSet);
+  void processDbgVariableRecord(DbgVariableRecord &DVR, BlockInfo *LiveSet);
+  void processDbgValue(
+      PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgValueRecord,
+      BlockInfo *LiveSet);
   /// Add an assignment to memory for the variable /p Var.
   void addMemDef(BlockInfo *LiveSet, VariableID Var, const Assignment &AV);
   /// Add an assignment to the variable /p Var.
@@ -1456,10 +1459,10 @@ static DIAssignID *getIDFromMarker(const DbgAssignIntrinsic &DAI) {
   return cast<DIAssignID>(DAI.getAssignID());
 }
 
-static DIAssignID *getIDFromMarker(const DPValue &DPV) {
-  assert(DPV.isDbgAssign() &&
-         "Cannot get a DIAssignID from a non-assign DPValue!");
-  return DPV.getAssignID();
+static DIAssignID *getIDFromMarker(const DbgVariableRecord &DVR) {
+  assert(DVR.isDbgAssign() &&
+         "Cannot get a DIAssignID from a non-assign DbgVariableRecord!");
+  return DVR.getAssignID();
 }
 
 /// Return true if \p Var has an assignment in \p M matching \p AV.
@@ -1492,10 +1495,10 @@ const char *locStr(AssignmentTrackingLowering::LocKind Loc) {
 }
 #endif
 
-VarLocInsertPt getNextNode(const DbgRecord *DPV) {
-  auto NextIt = ++(DPV->getIterator());
-  if (NextIt == DPV->getMarker()->getDbgRecordRange().end())
-    return DPV->getMarker()->MarkedInstr;
+VarLocInsertPt getNextNode(const DbgRecord *DVR) {
+  auto NextIt = ++(DVR->getIterator());
+  if (NextIt == DVR->getMarker()->getDbgRecordRange().end())
+    return DVR->getMarker()->MarkedInstr;
   return &*NextIt;
 }
 VarLocInsertPt getNextNode(const Instruction *Inst) {
@@ -1514,10 +1517,10 @@ DbgAssignIntrinsic *CastToDbgAssign(DbgVariableIntrinsic *DVI) {
   return cast<DbgAssignIntrinsic>(DVI);
 }
 
-DPValue *CastToDbgAssign(DPValue *DPV) {
-  assert(DPV->isDbgAssign() &&
-         "Attempted to cast non-assign DPValue to DPVAssign.");
-  return DPV;
+DbgVariableRecord *CastToDbgAssign(DbgVariableRecord *DVR) {
+  assert(DVR->isDbgAssign() &&
+         "Attempted to cast non-assign DbgVariableRecord to DVRAssign.");
+  return DVR;
 }
 
 void AssignmentTrackingLowering::emitDbgValue(
@@ -1526,7 +1529,7 @@ void AssignmentTrackingLowering::emitDbgValue(
   if (isa<DbgAssignIntrinsic *>(Source))
     emitDbgValue(Kind, cast<DbgAssignIntrinsic *>(Source), After);
   else
-    emitDbgValue(Kind, cast<DPValue *>(Source), After);
+    emitDbgValue(Kind, cast<DbgVariableRecord *>(Source), After);
 }
 template <typename T>
 void AssignmentTrackingLowering::emitDbgValue(
@@ -1649,7 +1652,7 @@ void AssignmentTrackingLowering::processUntaggedInstruction(
     Ops.push_back(dwarf::DW_OP_deref);
     DIE = DIExpression::prependOpcodes(DIE, Ops, /*StackValue=*/false,
                                        /*EntryValue=*/false);
-    // Find a suitable insert point, before the next instruction or DPValue
+    // Find a suitable insert point, before the next instruction or DbgRecord
     // after I.
     auto InsertBefore = getNextNode(&I);
     assert(InsertBefore && "Shouldn't be inserting after a terminator");
@@ -1673,7 +1676,7 @@ void AssignmentTrackingLowering::processUntaggedInstruction(
 void AssignmentTrackingLowering::processTaggedInstruction(
     Instruction &I, AssignmentTrackingLowering::BlockInfo *LiveSet) {
   auto Linked = at::getAssignmentMarkers(&I);
-  auto LinkedDPAssigns = at::getDPVAssignmentMarkers(&I);
+  auto LinkedDPAssigns = at::getDVRAssignmentMarkers(&I);
   // No dbg.assign intrinsics linked.
   // FIXME: All vars that have a stack slot this store modifies that don't have
   // a dbg.assign linked to it should probably treat this like an untagged
@@ -1756,8 +1759,8 @@ void AssignmentTrackingLowering::processTaggedInstruction(
   };
   for (DbgAssignIntrinsic *DAI : Linked)
     ProcessLinkedAssign(DAI);
-  for (DPValue *DPV : LinkedDPAssigns)
-    ProcessLinkedAssign(DPV);
+  for (DbgVariableRecord *DVR : LinkedDPAssigns)
+    ProcessLinkedAssign(DVR);
 }
 
 void AssignmentTrackingLowering::processDbgAssign(AssignRecord Assign,
@@ -1802,13 +1805,13 @@ void AssignmentTrackingLowering::processDbgAssign(AssignRecord Assign,
       emitDbgValue(LocKind::Val, DbgAssign, DbgAssign);
     }
   };
-  if (isa<DPValue *>(Assign))
-    return ProcessDbgAssignImpl(cast<DPValue *>(Assign));
+  if (isa<DbgVariableRecord *>(Assign))
+    return ProcessDbgAssignImpl(cast<DbgVariableRecord *>(Assign));
   return ProcessDbgAssignImpl(cast<DbgAssignIntrinsic *>(Assign));
 }
 
 void AssignmentTrackingLowering::processDbgValue(
-    PointerUnion<DbgValueInst *, DPValue *> DbgValueRecord,
+    PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgValueRecord,
     BlockInfo *LiveSet) {
   auto ProcessDbgValueImpl = [&](auto *DbgValue) {
     // Only other tracking variables that are at some point stack homed.
@@ -1833,8 +1836,8 @@ void AssignmentTrackingLowering::processDbgValue(
     setLocKind(LiveSet, Var, LocKind::Val);
     emitDbgValue(LocKind::Val, DbgValue, DbgValue);
   };
-  if (isa<DPValue *>(DbgValueRecord))
-    return ProcessDbgValueImpl(cast<DPValue *>(DbgValueRecord));
+  if (isa<DbgVariableRecord *>(DbgValueRecord))
+    return ProcessDbgValueImpl(cast<DbgVariableRecord *>(DbgValueRecord));
   return ProcessDbgValueImpl(cast<DbgValueInst *>(DbgValueRecord));
 }
 
@@ -1859,16 +1862,16 @@ void AssignmentTrackingLowering::processDbgInstruction(
   else if (auto *DVI = dyn_cast<DbgValueInst>(&I))
     processDbgValue(DVI, LiveSet);
 }
-void AssignmentTrackingLowering::processDPValue(
-    DPValue &DPV, AssignmentTrackingLowering::BlockInfo *LiveSet) {
+void AssignmentTrackingLowering::processDbgVariableRecord(
+    DbgVariableRecord &DVR, AssignmentTrackingLowering::BlockInfo *LiveSet) {
   // Ignore assignments to zero bits of the variable.
-  if (hasZeroSizedFragment(DPV))
+  if (hasZeroSizedFragment(DVR))
     return;
 
-  if (DPV.isDbgAssign())
-    processDbgAssign(&DPV, LiveSet);
-  else if (DPV.isDbgValue())
-    processDbgValue(&DPV, LiveSet);
+  if (DVR.isDbgAssign())
+    processDbgAssign(&DVR, LiveSet);
+  else if (DVR.isDbgValue())
+    processDbgValue(&DVR, LiveSet);
 }
 
 void AssignmentTrackingLowering::resetInsertionPoint(Instruction &After) {
@@ -1878,7 +1881,7 @@ void AssignmentTrackingLowering::resetInsertionPoint(Instruction &After) {
     return;
   R->second.clear();
 }
-void AssignmentTrackingLowering::resetInsertionPoint(DPValue &After) {
+void AssignmentTrackingLowering::resetInsertionPoint(DbgVariableRecord &After) {
   auto *R = InsertBeforeMap.find(getNextNode(&After));
   if (R == InsertBeforeMap.end())
     return;
@@ -1886,21 +1889,21 @@ void AssignmentTrackingLowering::resetInsertionPoint(DPValue &After) {
 }
 
 void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
-  // If the block starts with DPValues, we need to process those DPValues as
+  // If the block starts with DbgRecords, we need to process those DbgRecords as
   // their own frame without processing any instructions first.
-  bool ProcessedLeadingDPValues = !BB.begin()->hasDbgRecords();
+  bool ProcessedLeadingDbgRecords = !BB.begin()->hasDbgRecords();
   for (auto II = BB.begin(), EI = BB.end(); II != EI;) {
     assert(VarsTouchedThisFrame.empty());
     // Process the instructions in "frames". A "frame" includes a single
     // non-debug instruction followed any debug instructions before the
     // next non-debug instruction.
 
-    // Skip the current instruction if it has unprocessed DPValues attached (see
-    // comment above `ProcessedLeadingDPValues`).
-    if (ProcessedLeadingDPValues) {
+    // Skip the current instruction if it has unprocessed DbgRecords attached
+    // (see comment above `ProcessedLeadingDbgRecords`).
+    if (ProcessedLeadingDbgRecords) {
       // II is now either a debug intrinsic, a non-debug instruction with no
-      // attached DPValues, or a non-debug instruction with attached processed
-      // DPValues.
+      // attached DbgRecords, or a non-debug instruction with attached processed
+      // DbgRecords.
       // II has not been processed.
       if (!isa<DbgInfoIntrinsic>(&*II)) {
         if (II->isTerminator())
@@ -1912,19 +1915,19 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
       }
     }
     // II is now either a debug intrinsic, a non-debug instruction with no
-    // attached DPValues, or a non-debug instruction with attached unprocessed
-    // DPValues.
+    // attached DbgRecords, or a non-debug instruction with attached unprocessed
+    // DbgRecords.
     if (II != EI && II->hasDbgRecords()) {
       // Skip over non-variable debug records (i.e., labels). They're going to
       // be read from IR (possibly re-ordering them within the debug record
       // range) rather than from the analysis results.
-      for (DPValue &DPV : DPValue::filter(II->getDbgRecordRange())) {
-        resetInsertionPoint(DPV);
-        processDPValue(DPV, LiveSet);
+      for (DbgVariableRecord &DVR : filterDbgVars(II->getDbgRecordRange())) {
+        resetInsertionPoint(DVR);
+        processDbgVariableRecord(DVR, LiveSet);
         assert(LiveSet->isValid());
       }
     }
-    ProcessedLeadingDPValues = true;
+    ProcessedLeadingDbgRecords = true;
     while (II != EI) {
       auto *Dbg = dyn_cast<DbgInfoIntrinsic>(&*II);
       if (!Dbg)
@@ -1934,9 +1937,9 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
       assert(LiveSet->isValid());
       ++II;
     }
-    // II is now a non-debug instruction either with no attached DPValues, or
-    // with attached processed DPValues. II has not been processed, and all
-    // debug instructions or DPValues in the frame preceding II have been
+    // II is now a non-debug instruction either with no attached DbgRecords, or
+    // with attached processed DbgRecords. II has not been processed, and all
+    // debug instructions or DbgRecords in the frame preceding II have been
     // processed.
 
     // We've processed everything in the "frame". Now determine which variables
@@ -1999,9 +2002,11 @@ AssignmentTrackingLowering::joinAssignment(const Assignment &A,
       return A.Source;
     if (!A.Source || !B.Source)
       return AssignRecord();
-    assert(isa<DPValue *>(A.Source) == isa<DPValue *>(B.Source));
-    if (isa<DPValue *>(A.Source) &&
-        cast<DPValue *>(A.Source)->isEquivalentTo(*cast<DPValue *>(B.Source)))
+    assert(isa<DbgVariableRecord *>(A.Source) ==
+           isa<DbgVariableRecord *>(B.Source));
+    if (isa<DbgVariableRecord *>(A.Source) &&
+        cast<DbgVariableRecord *>(A.Source)->isEquivalentTo(
+            *cast<DbgVariableRecord *>(B.Source)))
       return A.Source;
     if (isa<DbgAssignIntrinsic *>(A.Source) &&
         cast<DbgAssignIntrinsic *>(A.Source)->isIdenticalTo(
@@ -2122,8 +2127,8 @@ DbgDeclareInst *DynCastToDbgDeclare(DbgVariableIntrinsic *DVI) {
   return dyn_cast<DbgDeclareInst>(DVI);
 }
 
-DPValue *DynCastToDbgDeclare(DPValue *DPV) {
-  return DPV->isDbgDeclare() ? DPV : nullptr;
+DbgVariableRecord *DynCastToDbgDeclare(DbgVariableRecord *DVR) {
+  return DVR->isDbgDeclare() ? DVR : nullptr;
 }
 
 /// Build a map of {Variable x: Variables y} where all variable fragments
@@ -2160,7 +2165,7 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
   // We need to add fragments for untagged stores too so that we can correctly
   // clobber overlapped fragment locations later.
   SmallVector<DbgDeclareInst *> InstDeclares;
-  SmallVector<DPValue *> DPDeclares;
+  SmallVector<DbgVariableRecord *> DPDeclares;
   auto ProcessDbgRecord = [&](auto *Record, auto &DeclareList) {
     if (auto *Declare = DynCastToDbgDeclare(Record)) {
       DeclareList.push_back(Declare);
@@ -2175,8 +2180,8 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
   };
   for (auto &BB : Fn) {
     for (auto &I : BB) {
-      for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange()))
-        ProcessDbgRecord(&DPV, DPDeclares);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+        ProcessDbgRecord(&DVR, DPDeclares);
       if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
         ProcessDbgRecord(DII, InstDeclares);
       } else if (auto Info = getUntaggedStoreAssignmentInfo(
@@ -2217,8 +2222,8 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
         };
         for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(Info->Base))
           HandleDbgAssignForStore(DAI);
-        for (DPValue *DPV : at::getDPVAssignmentMarkers(Info->Base))
-          HandleDbgAssignForStore(DPV);
+        for (DbgVariableRecord *DVR : at::getDVRAssignmentMarkers(Info->Base))
+          HandleDbgAssignForStore(DVR);
       }
     }
   }
@@ -2268,10 +2273,10 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
   for (auto *DDI : InstDeclares)
     FnVarLocs->addSingleLocVar(DebugVariable(DDI), DDI->getExpression(),
                                DDI->getDebugLoc(), DDI->getWrappedLocation());
-  for (auto *DPV : DPDeclares)
-    FnVarLocs->addSingleLocVar(DebugVariable(DPV), DPV->getExpression(),
-                               DPV->getDebugLoc(),
-                               RawLocationWrapper(DPV->getRawLocation()));
+  for (auto *DVR : DPDeclares)
+    FnVarLocs->addSingleLocVar(DebugVariable(DVR), DVR->getExpression(),
+                               DVR->getDebugLoc(),
+                               RawLocationWrapper(DVR->getRawLocation()));
   return Map;
 }
 
@@ -2465,9 +2470,9 @@ bool AssignmentTrackingLowering::emitPromotedVarLocs(
   for (auto &BB : Fn) {
     for (auto &I : BB) {
       // Skip instructions other than dbg.values and dbg.assigns.
-      for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange()))
-        if (DPV.isDbgValue() || DPV.isDbgAssign())
-          TranslateDbgRecord(&DPV);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+        if (DVR.isDbgValue() || DVR.isDbgAssign())
+          TranslateDbgRecord(&DVR);
       auto *DVI = dyn_cast<DbgValueInst>(&I);
       if (DVI)
         TranslateDbgRecord(DVI);
@@ -2567,8 +2572,8 @@ removeRedundantDbgLocsUsingBackwardScan(const BasicBlock *BB,
       }
     };
     HandleLocsForWedge(&I);
-    for (DPValue &DPV : reverse(DPValue::filter(I.getDbgRecordRange())))
-      HandleLocsForWedge(&DPV);
+    for (DbgVariableRecord &DVR : reverse(filterDbgVars(I.getDbgRecordRange())))
+      HandleLocsForWedge(&DVR);
   }
 
   return Changed;
@@ -2632,8 +2637,8 @@ removeRedundantDbgLocsUsingForwardScan(const BasicBlock *BB,
       }
     };
 
-    for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange()))
-      HandleLocsForWedge(&DPV);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      HandleLocsForWedge(&DVR);
     HandleLocsForWedge(&I);
   }
 
@@ -2718,8 +2723,8 @@ removeUndefDbgLocsFromEntryBlock(const BasicBlock *BB,
         Changed = true;
       }
     };
-    for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange()))
-      HandleLocsForWedge(&DPV);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      HandleLocsForWedge(&DVR);
     HandleLocsForWedge(&I);
   }
 
@@ -2752,8 +2757,8 @@ static DenseSet<DebugAggregate> findVarsWithStackSlot(Function &Fn) {
       for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(&I)) {
         Result.insert({DAI->getVariable(), DAI->getDebugLoc().getInlinedAt()});
       }
-      for (DPValue *DPV : at::getDPVAssignmentMarkers(&I)) {
-        Result.insert({DPV->getVariable(), DPV->getDebugLoc().getInlinedAt()});
+      for (DbgVariableRecord *DVR : at::getDVRAssignmentMarkers(&I)) {
+        Result.insert({DVR->getVariable(), DVR->getDebugLoc().getInlinedAt()});
       }
     }
   }
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 59a0c64d3c9f2..9f99bb7e693f7 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -445,8 +445,8 @@ class CodeGenPrepare {
   bool optimizeExtractElementInst(Instruction *Inst);
   bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT);
   bool fixupDbgValue(Instruction *I);
-  bool fixupDPValue(DPValue &I);
-  bool fixupDPValuesOnInst(Instruction &I);
+  bool fixupDbgVariableRecord(DbgVariableRecord &I);
+  bool fixupDbgVariableRecordsOnInst(Instruction &I);
   bool placeDbgValues(Function &F);
   bool placePseudoProbes(Function &F);
   bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
@@ -1943,6 +1943,39 @@ static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) {
   return false;
 }
 
+static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
+                                  const DataLayout &DL) {
+  FCmpInst *FCmp = dyn_cast<FCmpInst>(Cmp);
+  if (!FCmp)
+    return false;
+
+  // Don't fold if the target offers free fabs and the predicate is legal.
+  EVT VT = TLI.getValueType(DL, Cmp->getOperand(0)->getType());
+  if (TLI.isFAbsFree(VT) &&
+      TLI.isCondCodeLegal(getFCmpCondCode(FCmp->getPredicate()),
+                          VT.getSimpleVT()))
+    return false;
+
+  // Reverse the canonicalization if it is a FP class test
+  auto ShouldReverseTransform = [](FPClassTest ClassTest) {
+    return ClassTest == fcInf || ClassTest == (fcInf | fcNan);
+  };
+  auto [ClassVal, ClassTest] =
+      fcmpToClassTest(FCmp->getPredicate(), *FCmp->getParent()->getParent(),
+                      FCmp->getOperand(0), FCmp->getOperand(1));
+  if (!ClassVal)
+    return false;
+
+  if (!ShouldReverseTransform(ClassTest) && !ShouldReverseTransform(~ClassTest))
+    return false;
+
+  IRBuilder<> Builder(Cmp);
+  Value *IsFPClass = Builder.createIsFPClass(ClassVal, ClassTest);
+  Cmp->replaceAllUsesWith(IsFPClass);
+  RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+  return true;
+}
+
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
@@ -1959,6 +1992,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
   if (swapICmpOperandsToExposeCSEOpportunities(Cmp))
     return true;
 
+  if (foldFCmpToFPClassTest(Cmp, *TLI, *DL))
+    return true;
+
   return false;
 }
 
@@ -2021,9 +2057,9 @@ static bool sinkAndCmp0Expression(Instruction *AndI, const TargetLowering &TLI,
     // Keep the 'and' in the same place if the use is already in the same block.
     Instruction *InsertPt =
         User->getParent() == AndI->getParent() ? AndI : User;
-    Instruction *InsertedAnd =
-        BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
-                               AndI->getOperand(1), "", InsertPt);
+    Instruction *InsertedAnd = BinaryOperator::Create(
+        Instruction::And, AndI->getOperand(0), AndI->getOperand(1), "",
+        InsertPt->getIterator());
     // Propagate the debug info.
     InsertedAnd->setDebugLoc(AndI->getDebugLoc());
 
@@ -2946,7 +2982,7 @@ class TypePromotionTransaction {
       Instruction *PrevInst;
       BasicBlock *BB;
     } Point;
-    std::optional<DPValue::self_iterator> BeforeDPValue = std::nullopt;
+    std::optional<DbgRecord::self_iterator> BeforeDbgRecord = std::nullopt;
 
     /// Remember whether or not the instruction had a previous instruction.
     bool HasPrevInstruction;
@@ -2958,9 +2994,9 @@ class TypePromotionTransaction {
       BasicBlock *BB = Inst->getParent();
 
       // Record where we would have to re-insert the instruction in the sequence
-      // of DPValues, if we ended up reinserting.
+      // of DbgRecords, if we ended up reinserting.
       if (BB->IsNewDbgInfoFormat)
-        BeforeDPValue = Inst->getDbgReinsertionPosition();
+        BeforeDbgRecord = Inst->getDbgReinsertionPosition();
 
       if (HasPrevInstruction) {
         Point.PrevInst = &*std::prev(Inst->getIterator());
@@ -2983,7 +3019,7 @@ class TypePromotionTransaction {
           Inst->insertBefore(*Point.BB, Position);
       }
 
-      Inst->getParent()->reinsertInstInDbgRecords(Inst, BeforeDPValue);
+      Inst->getParent()->reinsertInstInDbgRecords(Inst, BeforeDbgRecord);
     }
   };
 
@@ -3187,7 +3223,7 @@ class TypePromotionTransaction {
     /// Keep track of the debug users.
     SmallVector<DbgValueInst *, 1> DbgValues;
     /// And non-instruction debug-users too.
-    SmallVector<DPValue *, 1> DPValues;
+    SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
 
     /// Keep track of the new value so that we can undo it by replacing
     /// instances of the new value with the original value.
@@ -3208,7 +3244,7 @@ class TypePromotionTransaction {
       }
       // Record the debug uses separately. They are not in the instruction's
       // use list, but they are replaced by RAUW.
-      findDbgValues(DbgValues, Inst, &DPValues);
+      findDbgValues(DbgValues, Inst, &DbgVariableRecords);
 
       // Now, we can replace the uses.
       Inst->replaceAllUsesWith(New);
@@ -3225,10 +3261,10 @@ class TypePromotionTransaction {
       // correctness and utility of debug value instructions.
       for (auto *DVI : DbgValues)
         DVI->replaceVariableLocationOp(New, Inst);
-      // Similar story with DPValues, the non-instruction representation of
-      // dbg.values.
-      for (DPValue *DPV : DPValues) // tested by transaction-test I'm adding
-        DPV->replaceVariableLocationOp(New, Inst);
+      // Similar story with DbgVariableRecords, the non-instruction
+      // representation of dbg.values.
+      for (DbgVariableRecord *DVR : DbgVariableRecords)
+        DVR->replaceVariableLocationOp(New, Inst);
     }
   };
 
@@ -4117,9 +4153,10 @@ class AddressingModeCombiner {
       if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) {
         // Is it OK to get metadata from OrigSelect?!
         // Create a Select placeholder with dummy value.
-        SelectInst *Select = SelectInst::Create(
-            CurrentSelect->getCondition(), Dummy, Dummy,
-            CurrentSelect->getName(), CurrentSelect, CurrentSelect);
+        SelectInst *Select =
+            SelectInst::Create(CurrentSelect->getCondition(), Dummy, Dummy,
+                               CurrentSelect->getName(),
+                               CurrentSelect->getIterator(), CurrentSelect);
         Map[Current] = Select;
         ST.insertNewSelect(Select);
         // We are interested in True and False values.
@@ -6430,8 +6467,8 @@ bool CodeGenPrepare::optimizePhiType(
       ValMap[D] = D->getOperand(0);
       DeletedInstrs.insert(D);
     } else {
-      ValMap[D] =
-          new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode());
+      BasicBlock::iterator insertPt = std::next(D->getIterator());
+      ValMap[D] = new BitCastInst(D, ConvertTy, D->getName() + ".bc", insertPt);
     }
   }
   for (PHINode *Phi : PhiNodes)
@@ -6451,8 +6488,8 @@ bool CodeGenPrepare::optimizePhiType(
       DeletedInstrs.insert(U);
       replaceAllUsesWith(U, ValMap[U->getOperand(0)], FreshBBs, IsHugeFunc);
     } else {
-      U->setOperand(0,
-                    new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U));
+      U->setOperand(0, new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc",
+                                       U->getIterator()));
     }
   }
 
@@ -7080,9 +7117,9 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
   CurInstIterator = std::next(LastSI->getIterator());
   // Examine debug-info attached to the consecutive select instructions. They
   // won't be individually optimised by optimizeInst, so we need to perform
-  // DPValue maintenence here instead.
+  // DbgVariableRecord maintenence here instead.
   for (SelectInst *SI : ArrayRef(ASI).drop_front())
-    fixupDPValuesOnInst(*SI);
+    fixupDbgVariableRecordsOnInst(*SI);
 
   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
 
@@ -8238,7 +8275,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
 
 bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
   bool AnyChange = false;
-  AnyChange = fixupDPValuesOnInst(*I);
+  AnyChange = fixupDbgVariableRecordsOnInst(*I);
 
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -8348,7 +8385,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (GEPI->hasAllZeroIndices()) {
       /// The GEP operand must be a pointer, so must its result -> BitCast
       Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
-                                        GEPI->getName(), GEPI);
+                                        GEPI->getName(), GEPI->getIterator());
       NC->setDebugLoc(GEPI->getDebugLoc());
       replaceAllUsesWith(GEPI, NC, FreshBBs, IsHugeFunc);
       RecursivelyDeleteTriviallyDeadInstructions(
@@ -8380,7 +8417,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
                     isa<ConstantPointerNull>(Op1);
       if (Const0 || Const1) {
         if (!Const0 || !Const1) {
-          auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI);
+          auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI->getIterator());
           F->takeName(FI);
           CmpI->setOperand(Const0 ? 1 : 0, F);
         }
@@ -8504,24 +8541,24 @@ bool CodeGenPrepare::fixupDbgValue(Instruction *I) {
   return AnyChange;
 }
 
-bool CodeGenPrepare::fixupDPValuesOnInst(Instruction &I) {
+bool CodeGenPrepare::fixupDbgVariableRecordsOnInst(Instruction &I) {
   bool AnyChange = false;
-  for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange()))
-    AnyChange |= fixupDPValue(DPV);
+  for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+    AnyChange |= fixupDbgVariableRecord(DVR);
   return AnyChange;
 }
 
 // FIXME: should updating debug-info really cause the "changed" flag to fire,
 // which can cause a function to be reprocessed?
-bool CodeGenPrepare::fixupDPValue(DPValue &DPV) {
-  if (DPV.Type != DPValue::LocationType::Value &&
-      DPV.Type != DPValue::LocationType::Assign)
+bool CodeGenPrepare::fixupDbgVariableRecord(DbgVariableRecord &DVR) {
+  if (DVR.Type != DbgVariableRecord::LocationType::Value &&
+      DVR.Type != DbgVariableRecord::LocationType::Assign)
     return false;
 
-  // Does this DPValue refer to a sunk address calculation?
+  // Does this DbgVariableRecord refer to a sunk address calculation?
   bool AnyChange = false;
-  SmallDenseSet<Value *> LocationOps(DPV.location_ops().begin(),
-                                     DPV.location_ops().end());
+  SmallDenseSet<Value *> LocationOps(DVR.location_ops().begin(),
+                                     DVR.location_ops().end());
   for (Value *Location : LocationOps) {
     WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
     Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
@@ -8531,7 +8568,7 @@ bool CodeGenPrepare::fixupDPValue(DPValue &DPV) {
       // of pointer being referred to; however this makes no difference to
       // debugging information, and we can't generate bitcasts that may affect
       // codegen.
-      DPV.replaceVariableLocationOp(Location, SunkAddr);
+      DVR.replaceVariableLocationOp(Location, SunkAddr);
       AnyChange = true;
     }
   }
@@ -8546,13 +8583,13 @@ static void DbgInserterHelper(DbgValueInst *DVI, Instruction *VI) {
     DVI->insertAfter(VI);
 }
 
-static void DbgInserterHelper(DPValue *DPV, Instruction *VI) {
-  DPV->removeFromParent();
+static void DbgInserterHelper(DbgVariableRecord *DVR, Instruction *VI) {
+  DVR->removeFromParent();
   BasicBlock *VIBB = VI->getParent();
   if (isa<PHINode>(VI))
-    VIBB->insertDbgRecordBefore(DPV, VIBB->getFirstInsertionPt());
+    VIBB->insertDbgRecordBefore(DVR, VIBB->getFirstInsertionPt());
   else
-    VIBB->insertDbgRecordAfter(DPV, VI);
+    VIBB->insertDbgRecordAfter(DVR, VI);
 }
 
 // A llvm.dbg.value may be using a value before its definition, due to
@@ -8617,13 +8654,13 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
         continue;
       }
 
-      // If this isn't a dbg.value, process any attached DPValue records
-      // attached to this instruction.
-      for (DPValue &DPV : llvm::make_early_inc_range(
-               DPValue::filter(Insn.getDbgRecordRange()))) {
-        if (DPV.Type != DPValue::LocationType::Value)
+      // If this isn't a dbg.value, process any attached DbgVariableRecord
+      // records attached to this instruction.
+      for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
+               filterDbgVars(Insn.getDbgRecordRange()))) {
+        if (DVR.Type != DbgVariableRecord::LocationType::Value)
           continue;
-        DbgProcessor(&DPV, &Insn);
+        DbgProcessor(&DVR, &Insn);
       }
     }
   }
diff --git a/llvm/lib/CodeGen/DFAPacketizer.cpp b/llvm/lib/CodeGen/DFAPacketizer.cpp
index 48bb4a07662e1..c16166a1d5e1c 100644
--- a/llvm/lib/CodeGen/DFAPacketizer.cpp
+++ b/llvm/lib/CodeGen/DFAPacketizer.cpp
@@ -252,12 +252,13 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
 bool VLIWPacketizerList::alias(const MachineMemOperand &Op1,
                                const MachineMemOperand &Op2,
                                bool UseTBAA) const {
-  if (!Op1.getValue() || !Op2.getValue())
+  if (!Op1.getValue() || !Op2.getValue() || !Op1.getSize().hasValue() ||
+      !Op2.getSize().hasValue())
     return true;
 
   int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
-  int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
-  int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
+  int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
+  int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;
 
   AliasResult AAResult =
       AA->alias(MemoryLocation(Op1.getValue(), Overlapa,
diff --git a/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
index e7eb34d8e6518..09e7cfb12bdba 100644
--- a/llvm/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -111,7 +111,8 @@ Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
   }
 
   if (!ExnObj)
-    ExnObj = ExtractValueInst::Create(RI->getOperand(0), 0, "exn.obj", RI);
+    ExnObj = ExtractValueInst::Create(RI->getOperand(0), 0, "exn.obj",
+                                      RI->getIterator());
 
   RI->eraseFromParent();
 
@@ -158,7 +159,7 @@ size_t DwarfEHPrepare::pruneUnreachableResumes(
       Resumes[ResumesLeft++] = RI;
     } else {
       BasicBlock *BB = RI->getParent();
-      new UnreachableInst(Ctx, RI);
+      new UnreachableInst(Ctx, RI->getIterator());
       RI->eraseFromParent();
       simplifyCFG(BB, *TTI, DTU);
     }
diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index 78ad2a25d0e47..308f13c19f756 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -375,7 +375,7 @@ static void expandIToFP(Instruction *IToFP) {
   Value *Sub2 = Builder.CreateSub(Builder.getIntN(BitWidthNew, BitWidth - 1),
                                   FloatWidth == 128 ? Call : Cast);
   Value *Cmp3 = Builder.CreateICmpSGT(
-      Sub2, Builder.getIntN(BitWidthNew, FPMantissaWidth + 1));
+      Sub1, Builder.getIntN(BitWidthNew, FPMantissaWidth + 1));
   Builder.CreateCondBr(Cmp3, IfThen4, IfElse);
 
   // if.then4:
diff --git a/llvm/lib/CodeGen/FinalizeISel.cpp b/llvm/lib/CodeGen/FinalizeISel.cpp
index 329c9587e3212..978355f8eb1bb 100644
--- a/llvm/lib/CodeGen/FinalizeISel.cpp
+++ b/llvm/lib/CodeGen/FinalizeISel.cpp
@@ -14,8 +14,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/InitializePasses.h"
@@ -45,6 +47,7 @@ INITIALIZE_PASS(FinalizeISel, DEBUG_TYPE,
 
 bool FinalizeISel::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
 
   // Iterate through each instruction in the function, looking for pseudos.
@@ -54,6 +57,12 @@ bool FinalizeISel::runOnMachineFunction(MachineFunction &MF) {
          MBBI != MBBE; ) {
       MachineInstr &MI = *MBBI++;
 
+      // Set AdjustsStack to true if the instruction selector emits a stack
+      // frame setup instruction or a stack aligning inlineasm.
+      if (MI.getOpcode() == TII->getCallFrameSetupOpcode() ||
+          MI.isStackAligningInlineAsm())
+        MF.getFrameInfo().setAdjustsStack(true);
+
       // If MI is a pseudo, expand it.
       if (MI.usesCustomInsertionHook()) {
         Changed = true;
diff --git a/llvm/lib/CodeGen/GCRootLowering.cpp b/llvm/lib/CodeGen/GCRootLowering.cpp
index 894ab9a0486a7..700714d539847 100644
--- a/llvm/lib/CodeGen/GCRootLowering.cpp
+++ b/llvm/lib/CodeGen/GCRootLowering.cpp
@@ -81,6 +81,9 @@ class GCMachineCodeAnalysis : public MachineFunctionPass {
 
 PreservedAnalyses GCLoweringPass::run(Function &F,
                                       FunctionAnalysisManager &FAM) {
+  if (!F.hasGC())
+    return PreservedAnalyses::all();
+
   auto &Info = FAM.getResult<GCFunctionAnalysis>(F);
 
   bool Changed = DoLowering(F, Info.getStrategy());
@@ -178,7 +181,7 @@ static bool InsertRootInitializers(Function &F, ArrayRef<AllocaInst *> Roots) {
     if (!InitedRoots.count(Root)) {
       new StoreInst(
           ConstantPointerNull::get(cast<PointerType>(Root->getAllocatedType())),
-          Root, Root->getNextNode());
+          Root, std::next(Root->getIterator()));
       MadeChange = true;
     }
 
@@ -213,8 +216,8 @@ bool DoLowering(Function &F, GCStrategy &S) {
       default: break;
       case Intrinsic::gcwrite: {
         // Replace a write barrier with a simple store.
-        Value *St = new StoreInst(CI->getArgOperand(0),
-                                  CI->getArgOperand(2), CI);
+        Value *St = new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2),
+                                  CI->getIterator());
         CI->replaceAllUsesWith(St);
         CI->eraseFromParent();
         MadeChange = true;
@@ -222,7 +225,8 @@ bool DoLowering(Function &F, GCStrategy &S) {
       }
       case Intrinsic::gcread: {
         // Replace a read barrier with a simple load.
-        Value *Ld = new LoadInst(CI->getType(), CI->getArgOperand(1), "", CI);
+        Value *Ld = new LoadInst(CI->getType(), CI->getArgOperand(1), "",
+                                 CI->getIterator());
         Ld->takeName(CI);
         CI->replaceAllUsesWith(Ld);
         CI->eraseFromParent();
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 3a37bf3cd7a8a..363fad53b76c3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
@@ -91,6 +92,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
                              ArrayRef<Register> ResRegs,
                              ArrayRef<ArrayRef<Register>> ArgRegs,
                              Register SwiftErrorVReg,
+                             Register ConvergenceCtrlToken,
                              std::function<unsigned()> GetCalleeReg) const {
   CallLoweringInfo Info;
   const DataLayout &DL = MIRBuilder.getDataLayout();
@@ -121,7 +123,6 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
     CanBeTailCalled = false;
   }
 
-
   // First step is to marshall all the function's parameters into the correct
   // physregs and memory locations. Gather the sequence of argument types that
   // we'll pass to the assigner function.
@@ -187,6 +188,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
   Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees);
   Info.CallConv = CallConv;
   Info.SwiftErrorVReg = SwiftErrorVReg;
+  Info.ConvergenceCtrlToken = ConvergenceCtrlToken;
   Info.IsMustTailCall = CB.isMustTailCall();
   Info.IsTailCall = CanBeTailCalled;
   Info.IsVarArg = IsVarArg;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index ab055b723dbb1..d3f86af1e2908 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -770,12 +770,12 @@ bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI,
   LLT RegTy = MRI.getType(LoadReg);
   Register PtrReg = LoadMI->getPointerReg();
   unsigned RegSize = RegTy.getSizeInBits();
-  uint64_t LoadSizeBits = LoadMI->getMemSizeInBits();
+  LocationSize LoadSizeBits = LoadMI->getMemSizeInBits();
   unsigned MaskSizeBits = MaskVal.countr_one();
 
   // The mask may not be larger than the in-memory type, as it might cover sign
   // extended bits
-  if (MaskSizeBits > LoadSizeBits)
+  if (MaskSizeBits > LoadSizeBits.getValue())
     return false;
 
   // If the mask covers the whole destination register, there's nothing to
@@ -795,7 +795,8 @@ bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI,
   // still adjust the opcode to indicate the high bit behavior.
   if (LoadMI->isSimple())
     MemDesc.MemoryTy = LLT::scalar(MaskSizeBits);
-  else if (LoadSizeBits > MaskSizeBits || LoadSizeBits == RegSize)
+  else if (LoadSizeBits.getValue() > MaskSizeBits ||
+           LoadSizeBits.getValue() == RegSize)
     return false;
 
   // TODO: Could check if it's legal with the reduced or original memory size.
@@ -860,7 +861,8 @@ bool CombinerHelper::matchSextTruncSextLoad(MachineInstr &MI) {
   if (auto *LoadMI = getOpcodeDef<GSExtLoad>(LoadUser, MRI)) {
     // If truncating more than the original extended value, abort.
     auto LoadSizeBits = LoadMI->getMemSizeInBits();
-    if (TruncSrc && MRI.getType(TruncSrc).getSizeInBits() < LoadSizeBits)
+    if (TruncSrc &&
+        MRI.getType(TruncSrc).getSizeInBits() < LoadSizeBits.getValue())
       return false;
     if (LoadSizeBits == SizeInBits)
       return true;
@@ -891,7 +893,7 @@ bool CombinerHelper::matchSextInRegOfLoad(
   if (!LoadDef || !MRI.hasOneNonDBGUse(DstReg))
     return false;
 
-  uint64_t MemBits = LoadDef->getMemSizeInBits();
+  uint64_t MemBits = LoadDef->getMemSizeInBits().getValue();
 
   // If the sign extend extends from a narrower width than the load's width,
   // then we can narrow the load width when we combine to a G_SEXTLOAD.
@@ -1490,7 +1492,7 @@ void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI,
   Observer.changedInstr(*BrCond);
 }
 
- 
+
 bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI) {
   MachineIRBuilder HelperBuilder(MI);
   GISelObserverWrapper DummyObserver;
@@ -4936,24 +4938,6 @@ bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
   return true;
 }
 
-bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
-  // (G_*ADDO x, 0) -> x + no carry out
-  assert(MI.getOpcode() == TargetOpcode::G_UADDO ||
-         MI.getOpcode() == TargetOpcode::G_SADDO);
-  if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
-    return false;
-  Register Carry = MI.getOperand(1).getReg();
-  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
-    return false;
-  Register Dst = MI.getOperand(0).getReg();
-  Register LHS = MI.getOperand(2).getReg();
-  MatchInfo = [=](MachineIRBuilder &B) {
-    B.buildCopy(Dst, LHS);
-    B.buildConstant(Carry, 0);
-  };
-  return true;
-}
-
 bool CombinerHelper::matchAddEToAddO(MachineInstr &MI, BuildFnTy &MatchInfo) {
   // (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
   // (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -6354,6 +6338,26 @@ CombinerHelper::getConstantOrConstantSplatVector(Register Src) {
   return Value;
 }
 
+// FIXME G_SPLAT_VECTOR
+bool CombinerHelper::isConstantOrConstantVectorI(Register Src) const {
+  auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+  if (IConstant)
+    return true;
+
+  GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+  if (!BuildVector)
+    return false;
+
+  unsigned NumSources = BuildVector->getNumSources();
+  for (unsigned I = 0; I < NumSources; ++I) {
+    std::optional<ValueAndVReg> IConstant =
+        getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+    if (!IConstant)
+      return false;
+  }
+  return true;
+}
+
 // TODO: use knownbits to determine zeros
 bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
                                               BuildFnTy &MatchInfo) {
@@ -6928,3 +6932,178 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) {
 
   return false;
 }
+
+bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  GAddCarryOut *Add = cast<GAddCarryOut>(&MI);
+
+  // Addo has no flags
+  Register Dst = Add->getReg(0);
+  Register Carry = Add->getReg(1);
+  Register LHS = Add->getLHSReg();
+  Register RHS = Add->getRHSReg();
+  bool IsSigned = Add->isSigned();
+  LLT DstTy = MRI.getType(Dst);
+  LLT CarryTy = MRI.getType(Carry);
+
+  // We want do fold the [u|s]addo.
+  if (!MRI.hasOneNonDBGUse(Dst))
+    return false;
+
+  // Fold addo, if the carry is dead -> add, undef.
+  if (MRI.use_nodbg_empty(Carry) &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildAdd(Dst, LHS, RHS);
+      B.buildUndef(Carry);
+    };
+    return true;
+  }
+
+  // We want do fold the [u|s]addo.
+  if (!MRI.hasOneNonDBGUse(Carry))
+    return false;
+
+  // Canonicalize constant to RHS.
+  if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) {
+    if (IsSigned) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSAddo(Dst, Carry, RHS, LHS);
+      };
+      return true;
+    }
+    // !IsSigned
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildUAddo(Dst, Carry, RHS, LHS);
+    };
+    return true;
+  }
+
+  std::optional<APInt> MaybeLHS = getConstantOrConstantSplatVector(LHS);
+  std::optional<APInt> MaybeRHS = getConstantOrConstantSplatVector(RHS);
+
+  // Fold addo(c1, c2) -> c3, carry.
+  if (MaybeLHS && MaybeRHS && isConstantLegalOrBeforeLegalizer(DstTy) &&
+      isConstantLegalOrBeforeLegalizer(CarryTy)) {
+    bool Overflow;
+    APInt Result = IsSigned ? MaybeLHS->sadd_ov(*MaybeRHS, Overflow)
+                            : MaybeLHS->uadd_ov(*MaybeRHS, Overflow);
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildConstant(Dst, Result);
+      B.buildConstant(Carry, Overflow);
+    };
+    return true;
+  }
+
+  // Fold (addo x, 0) -> x, no borrow
+  if (MaybeRHS && *MaybeRHS == 0 && isConstantLegalOrBeforeLegalizer(CarryTy)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildCopy(Dst, LHS);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+
+  // Given 2 constant operands whose sum does not overflow:
+  // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
+  // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
+  GAdd *AddLHS = getOpcodeDef<GAdd>(LHS, MRI);
+  if (MaybeRHS && AddLHS && MRI.hasOneNonDBGUse(Add->getReg(0)) &&
+      ((IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoSWrap)) ||
+       (!IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoUWrap)))) {
+    std::optional<APInt> MaybeAddRHS =
+        getConstantOrConstantSplatVector(AddLHS->getRHSReg());
+    if (MaybeAddRHS) {
+      bool Overflow;
+      APInt NewC = IsSigned ? MaybeAddRHS->sadd_ov(*MaybeRHS, Overflow)
+                            : MaybeAddRHS->uadd_ov(*MaybeRHS, Overflow);
+      if (!Overflow && isConstantLegalOrBeforeLegalizer(DstTy)) {
+        if (IsSigned) {
+          MatchInfo = [=](MachineIRBuilder &B) {
+            auto ConstRHS = B.buildConstant(DstTy, NewC);
+            B.buildSAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+          };
+          return true;
+        }
+        // !IsSigned
+        MatchInfo = [=](MachineIRBuilder &B) {
+          auto ConstRHS = B.buildConstant(DstTy, NewC);
+          B.buildUAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+        };
+        return true;
+      }
+    }
+  };
+
+  // We try to combine addo to non-overflowing add.
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) ||
+      !isConstantLegalOrBeforeLegalizer(CarryTy))
+    return false;
+
+  // We try to combine uaddo to non-overflowing add.
+  if (!IsSigned) {
+    ConstantRange CRLHS =
+        ConstantRange::fromKnownBits(KB->getKnownBits(LHS), /*IsSigned=*/false);
+    ConstantRange CRRHS =
+        ConstantRange::fromKnownBits(KB->getKnownBits(RHS), /*IsSigned=*/false);
+
+    switch (CRLHS.unsignedAddMayOverflow(CRRHS)) {
+    case ConstantRange::OverflowResult::MayOverflow:
+      return false;
+    case ConstantRange::OverflowResult::NeverOverflows: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+        B.buildConstant(Carry, 0);
+      };
+      return true;
+    }
+    case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+    case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildAdd(Dst, LHS, RHS);
+        B.buildConstant(Carry, 1);
+      };
+      return true;
+    }
+    }
+    return false;
+  }
+
+  // We try to combine saddo to non-overflowing add.
+
+  // If LHS and RHS each have at least two sign bits, then there is no signed
+  // overflow.
+  if (KB->computeNumSignBits(RHS) > 1 && KB->computeNumSignBits(LHS) > 1) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+
+  ConstantRange CRLHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(LHS), /*IsSigned=*/true);
+  ConstantRange CRRHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(RHS), /*IsSigned=*/true);
+
+  switch (CRLHS.signedAddMayOverflow(CRRHS)) {
+  case ConstantRange::OverflowResult::MayOverflow:
+    return false;
+  case ConstantRange::OverflowResult::NeverOverflows: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+  case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+  case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildAdd(Dst, LHS, RHS);
+      B.buildConstant(Carry, 1);
+    };
+    return true;
+  }
+  }
+
+  return false;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 099bf45b2734c..2e2cc9a95bd95 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -415,7 +415,8 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
     if (DstTy.isVector())
       break;
     // Everything above the retrieved bits is zero
-    Known.Zero.setBitsFrom((*MI.memoperands_begin())->getSizeInBits());
+    Known.Zero.setBitsFrom(
+        (*MI.memoperands_begin())->getSizeInBits().getValue());
     break;
   }
   case TargetOpcode::G_ASHR: {
@@ -666,7 +667,7 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
 
     // e.g. i16->i32 = '17' bits known.
     const MachineMemOperand *MMO = *MI.memoperands_begin();
-    return TyBits - MMO->getSizeInBits() + 1;
+    return TyBits - MMO->getSizeInBits().getValue() + 1;
   }
   case TargetOpcode::G_ZEXTLOAD: {
     // FIXME: We need an in-memory type representation.
@@ -675,7 +676,7 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
 
     // e.g. i16->i32 = '16' bits known.
     const MachineMemOperand *MMO = *MI.memoperands_begin();
-    return TyBits - MMO->getSizeInBits();
+    return TyBits - MMO->getSizeInBits().getValue();
   }
   case TargetOpcode::G_TRUNC: {
     Register Src = MI.getOperand(1).getReg();
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 94fdb37e283bb..c18574071bfb6 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
@@ -213,8 +214,9 @@ ArrayRef<Register> IRTranslator::getOrCreateVRegs(const Value &Val) {
   auto *VRegs = VMap.getVRegs(Val);
   auto *Offsets = VMap.getOffsets(Val);
 
-  assert(Val.getType()->isSized() &&
-         "Don't know how to create an empty vreg");
+  if (!Val.getType()->isTokenTy())
+    assert(Val.getType()->isSized() &&
+           "Don't know how to create an empty vreg");
 
   SmallVector<LLT, 4> SplitTys;
   computeValueLLTs(*DL, *Val.getType(), SplitTys,
@@ -1361,9 +1363,8 @@ static bool isSwiftError(const Value *V) {
 
 bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
   const LoadInst &LI = cast<LoadInst>(U);
-
-  unsigned StoreSize = DL->getTypeStoreSize(LI.getType());
-  if (StoreSize == 0)
+  TypeSize StoreSize = DL->getTypeStoreSize(LI.getType());
+  if (StoreSize.isZero())
     return true;
 
   ArrayRef<Register> Regs = getOrCreateVRegs(LI);
@@ -1770,6 +1771,41 @@ bool IRTranslator::translateMemFunc(const CallInst &CI,
   return true;
 }
 
+bool IRTranslator::translateVectorInterleave2Intrinsic(
+    const CallInst &CI, MachineIRBuilder &MIRBuilder) {
+  assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2 &&
+         "This function can only be called on the interleave2 intrinsic!");
+  // Canonicalize interleave2 to G_SHUFFLE_VECTOR (similar to SelectionDAG).
+  Register Op0 = getOrCreateVReg(*CI.getOperand(0));
+  Register Op1 = getOrCreateVReg(*CI.getOperand(1));
+  Register Res = getOrCreateVReg(CI);
+
+  LLT OpTy = MRI->getType(Op0);
+  MIRBuilder.buildShuffleVector(Res, Op0, Op1,
+                                createInterleaveMask(OpTy.getNumElements(), 2));
+
+  return true;
+}
+
+bool IRTranslator::translateVectorDeinterleave2Intrinsic(
+    const CallInst &CI, MachineIRBuilder &MIRBuilder) {
+  assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2 &&
+         "This function can only be called on the deinterleave2 intrinsic!");
+  // Canonicalize deinterleave2 to shuffles that extract sub-vectors (similar to
+  // SelectionDAG).
+  Register Op = getOrCreateVReg(*CI.getOperand(0));
+  auto Undef = MIRBuilder.buildUndef(MRI->getType(Op));
+  ArrayRef<Register> Res = getOrCreateVRegs(CI);
+
+  LLT ResTy = MRI->getType(Res[0]);
+  MIRBuilder.buildShuffleVector(Res[0], Op, Undef,
+                                createStrideMask(0, 2, ResTy.getNumElements()));
+  MIRBuilder.buildShuffleVector(Res[1], Op, Undef,
+                                createStrideMask(1, 2, ResTy.getNumElements()));
+
+  return true;
+}
+
 void IRTranslator::getStackGuard(Register DstReg,
                                  MachineIRBuilder &MIRBuilder) {
   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
@@ -2038,6 +2074,36 @@ bool IRTranslator::translateIfEntryValueArgument(bool isDeclare, Value *Val,
   return true;
 }
 
+static unsigned getConvOpcode(Intrinsic::ID ID) {
+  switch (ID) {
+  default:
+    llvm_unreachable("Unexpected intrinsic");
+  case Intrinsic::experimental_convergence_anchor:
+    return TargetOpcode::CONVERGENCECTRL_ANCHOR;
+  case Intrinsic::experimental_convergence_entry:
+    return TargetOpcode::CONVERGENCECTRL_ENTRY;
+  case Intrinsic::experimental_convergence_loop:
+    return TargetOpcode::CONVERGENCECTRL_LOOP;
+  }
+}
+
+bool IRTranslator::translateConvergenceControlIntrinsic(
+    const CallInst &CI, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder) {
+  MachineInstrBuilder MIB = MIRBuilder.buildInstr(getConvOpcode(ID));
+  Register OutputReg = getOrCreateConvergenceTokenVReg(CI);
+  MIB.addDef(OutputReg);
+
+  if (ID == Intrinsic::experimental_convergence_loop) {
+    auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl);
+    assert(Bundle && "Expected a convergence control token.");
+    Register InputReg =
+        getOrCreateConvergenceTokenVReg(*Bundle->Inputs[0].get());
+    MIB.addUse(InputReg);
+  }
+
+  return true;
+}
+
 bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
                                            MachineIRBuilder &MIRBuilder) {
   if (auto *MI = dyn_cast<AnyMemIntrinsic>(&CI)) {
@@ -2474,12 +2540,30 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
 
     return true;
   }
+
+  case Intrinsic::experimental_vector_interleave2:
+  case Intrinsic::experimental_vector_deinterleave2: {
+    // Both intrinsics have at least one operand.
+    Value *Op0 = CI.getOperand(0);
+    LLT ResTy = getLLTForType(*Op0->getType(), MIRBuilder.getDataLayout());
+    if (!ResTy.isFixedVector())
+      return false;
+
+    if (CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+      return translateVectorInterleave2Intrinsic(CI, MIRBuilder);
+
+    return translateVectorDeinterleave2Intrinsic(CI, MIRBuilder);
+  }
+
 #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)  \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
     return translateConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(CI),
                                            MIRBuilder);
-
+  case Intrinsic::experimental_convergence_anchor:
+  case Intrinsic::experimental_convergence_entry:
+  case Intrinsic::experimental_convergence_loop:
+    return translateConvergenceControlIntrinsic(CI, ID, MIRBuilder);
   }
   return false;
 }
@@ -2530,12 +2614,18 @@ bool IRTranslator::translateCallBase(const CallBase &CB,
     }
   }
 
+  Register ConvergenceCtrlToken = 0;
+  if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+    const auto &Token = *Bundle->Inputs[0].get();
+    ConvergenceCtrlToken = getOrCreateConvergenceTokenVReg(Token);
+  }
+
   // We don't set HasCalls on MFI here yet because call lowering may decide to
   // optimize into tail calls. Instead, we defer that to selection where a final
   // scan is done to check if any instructions are calls.
-  bool Success =
-      CLI->lowerCall(MIRBuilder, CB, Res, Args, SwiftErrorVReg,
-                     [&]() { return getOrCreateVReg(*CB.getCalledOperand()); });
+  bool Success = CLI->lowerCall(
+      MIRBuilder, CB, Res, Args, SwiftErrorVReg, ConvergenceCtrlToken,
+      [&]() { return getOrCreateVReg(*CB.getCalledOperand()); });
 
   // Check if we just inserted a tail call.
   if (Success) {
@@ -2649,6 +2739,14 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
         MF->getMachineMemOperand(MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata()));
   }
 
+  if (CI.isConvergent()) {
+    if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+      auto *Token = Bundle->Inputs[0].get();
+      Register TokenReg = getOrCreateVReg(*Token);
+      MIB.addUse(TokenReg, RegState::Implicit);
+    }
+  }
+
   return true;
 }
 
@@ -3287,16 +3385,16 @@ void IRTranslator::translateDbgInfo(const Instruction &Inst,
       MIRBuilder.buildDbgLabel(DPL->getLabel());
       continue;
     }
-    DPValue &DPV = cast<DPValue>(DR);
-    const DILocalVariable *Variable = DPV.getVariable();
-    const DIExpression *Expression = DPV.getExpression();
-    Value *V = DPV.getVariableLocationOp(0);
-    if (DPV.isDbgDeclare())
-      translateDbgDeclareRecord(V, DPV.hasArgList(), Variable,
-                         Expression, DPV.getDebugLoc(), MIRBuilder);
+    DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
+    const DILocalVariable *Variable = DVR.getVariable();
+    const DIExpression *Expression = DVR.getExpression();
+    Value *V = DVR.getVariableLocationOp(0);
+    if (DVR.isDbgDeclare())
+      translateDbgDeclareRecord(V, DVR.hasArgList(), Variable, Expression,
+                                DVR.getDebugLoc(), MIRBuilder);
     else
-      translateDbgValueRecord(V, DPV.hasArgList(), Variable,
-                         Expression, DPV.getDebugLoc(), MIRBuilder);
+      translateDbgValueRecord(V, DVR.hasArgList(), Variable, Expression,
+                              DVR.getDebugLoc(), MIRBuilder);
   }
 }
 
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index 4089a5e941b05..14e1e1fdf01de 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -538,6 +538,14 @@ bool InlineAsmLowering::lowerInlineAsm(
     }
   }
 
+  if (auto Bundle = Call.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+    auto *Token = Bundle->Inputs[0].get();
+    ArrayRef<Register> SourceRegs = GetOrCreateVRegs(*Token);
+    assert(SourceRegs.size() == 1 &&
+           "Expected the control token to fit into a single virtual register");
+    Inst.addUse(SourceRegs[0], RegState::Implicit);
+  }
+
   if (const MDNode *SrcLoc = Call.getMetadata("srcloc"))
     Inst.addMetadata(SrcLoc);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index bd3ff7265d51f..abe23af00a789 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1317,7 +1317,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     if (DstTy.isVector())
       return UnableToLegalize;
 
-    if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
+    if (8 * LoadMI.getMemSize().getValue() != DstTy.getSizeInBits()) {
       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
       MIRBuilder.buildAnyExt(DstReg, TmpReg);
@@ -1335,7 +1335,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
 
     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
     auto &MMO = LoadMI.getMMO();
-    unsigned MemSize = MMO.getSizeInBits();
+    unsigned MemSize = MMO.getSizeInBits().getValue();
 
     if (MemSize == NarrowSize) {
       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
@@ -1368,7 +1368,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     if (SrcTy.isVector() && LeftoverBits != 0)
       return UnableToLegalize;
 
-    if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
+    if (8 * StoreMI.getMemSize().getValue() != SrcTy.getSizeInBits()) {
       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
       MIRBuilder.buildTrunc(TmpReg, SrcReg);
       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
@@ -4456,7 +4456,7 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
   LLT ValTy = MRI.getType(ValReg);
 
   // FIXME: Do we need a distinct NarrowMemory legalize action?
-  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
+  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize().getValue()) {
     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
     return UnableToLegalize;
   }
@@ -5411,6 +5411,9 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
     MI.eraseFromParent();
     return Legalized;
   }
+  case TargetOpcode::G_SEXT:
+  case TargetOpcode::G_ZEXT:
+  case TargetOpcode::G_ANYEXT:
   case TargetOpcode::G_TRUNC:
   case TargetOpcode::G_FPTRUNC:
   case TargetOpcode::G_FPEXT:
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
index b5c9d3e912cc2..9fc8ecd60b03f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -117,12 +117,8 @@ bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1,
   if (!BasePtr0.BaseReg.isValid() || !BasePtr1.BaseReg.isValid())
     return false;
 
-  LocationSize Size1 = LdSt1->getMemSize() != MemoryLocation::UnknownSize
-                           ? LdSt1->getMemSize()
-                           : LocationSize::beforeOrAfterPointer();
-  LocationSize Size2 = LdSt2->getMemSize() != MemoryLocation::UnknownSize
-                           ? LdSt2->getMemSize()
-                           : LocationSize::beforeOrAfterPointer();
+  LocationSize Size1 = LdSt1->getMemSize();
+  LocationSize Size2 = LdSt2->getMemSize();
 
   int64_t PtrDiff;
   if (BasePtr0.BaseReg == BasePtr1.BaseReg) {
@@ -214,14 +210,9 @@ bool GISelAddressing::instMayAlias(const MachineInstr &MI,
         Offset = 0;
       }
 
-      TypeSize Size = LS->getMMO().getMemoryType().getSizeInBytes();
-      return {LS->isVolatile(),
-              LS->isAtomic(),
-              BaseReg,
-              Offset /*base offset*/,
-              Size.isScalable() ? LocationSize::beforeOrAfterPointer()
-                                : LocationSize::precise(Size),
-              &LS->getMMO()};
+      LocationSize Size = LS->getMMO().getSize();
+      return {LS->isVolatile(),       LS->isAtomic(), BaseReg,
+              Offset /*base offset*/, Size,           &LS->getMMO()};
     }
     // FIXME: support recognizing lifetime instructions.
     // Default.
diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp
index 4941d5b01ae0f..545ee1741834a 100644
--- a/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -309,10 +309,9 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
   for (size_t GI = 0, GE = Globals.size(); GI != GE; ++GI) {
     GlobalVariable *GV = Globals[GI];
 
-    // Reset the encountered sets for this global...
-    std::fill(EncounteredUGS.begin(), EncounteredUGS.end(), 0);
-    // ...and grow it in case we created new sets for the previous global.
-    EncounteredUGS.resize(UsedGlobalSets.size());
+    // Reset the encountered sets for this global and grow it in case we created
+    // new sets for the previous global.
+    EncounteredUGS.assign(UsedGlobalSets.size(), 0);
 
     // We might need to create a set that only consists of the current global.
     // Keep track of its index into UsedGlobalSets.
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
index c536ec9f79d62..cc5aad14e1b56 100644
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -580,7 +580,7 @@ PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
   BasicBlock *Preheader = L->getLoopPreheader();
   BasicBlock *Header = L->getHeader();
   BasicBlock *Latch = ExitBranch->getParent();
-  IRBuilder<> Builder(Header->getFirstNonPHI());
+  IRBuilder<> Builder(Header, Header->getFirstNonPHIIt());
   PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
   Index->addIncoming(NumElts, Preheader);
   Index->addIncoming(EltsRem, Latch);
diff --git a/llvm/lib/CodeGen/IndirectBrExpandPass.cpp b/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
index f7b931a3bdac2..13f595bef8eeb 100644
--- a/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
+++ b/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
@@ -113,7 +113,7 @@ bool runImpl(Function &F, const TargetLowering *TLI, DomTreeUpdater *DTU) {
       // Handle the degenerate case of no successors by replacing the indirectbr
       // with unreachable as there is no successor available.
       if (IBr->getNumSuccessors() == 0) {
-        (void)new UnreachableInst(F.getContext(), IBr);
+        (void)new UnreachableInst(F.getContext(), IBr->getIterator());
         IBr->eraseFromParent();
         continue;
       }
@@ -183,7 +183,7 @@ bool runImpl(Function &F, const TargetLowering *TLI, DomTreeUpdater *DTU) {
         for (BasicBlock *SuccBB : IBr->successors())
           Updates.push_back({DominatorTree::Delete, IBr->getParent(), SuccBB});
       }
-      (void)new UnreachableInst(F.getContext(), IBr);
+      (void)new UnreachableInst(F.getContext(), IBr->getIterator());
       IBr->eraseFromParent();
     }
     if (DTU) {
@@ -207,9 +207,10 @@ bool runImpl(Function &F, const TargetLowering *TLI, DomTreeUpdater *DTU) {
   }
 
   auto GetSwitchValue = [CommonITy](IndirectBrInst *IBr) {
-    return CastInst::CreatePointerCast(
-        IBr->getAddress(), CommonITy,
-        Twine(IBr->getAddress()->getName()) + ".switch_cast", IBr);
+    return CastInst::CreatePointerCast(IBr->getAddress(), CommonITy,
+                                       Twine(IBr->getAddress()->getName()) +
+                                           ".switch_cast",
+                                       IBr->getIterator());
   };
 
   SmallVector<DominatorTree::UpdateType, 8> Updates;
@@ -243,7 +244,7 @@ bool runImpl(Function &F, const TargetLowering *TLI, DomTreeUpdater *DTU) {
       Updates.reserve(IndirectBrs.size() + 2 * IndirectBrSuccs.size());
     for (auto *IBr : IndirectBrs) {
       SwitchPN->addIncoming(GetSwitchValue(IBr), IBr->getParent());
-      BranchInst::Create(SwitchBB, IBr);
+      BranchInst::Create(SwitchBB, IBr->getIterator());
       if (DTU) {
         Updates.push_back({DominatorTree::Insert, IBr->getParent(), SwitchBB});
         for (BasicBlock *SuccBB : IBr->successors())
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 2a0daf404c978..438ac1c3cc6e2 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -388,14 +388,15 @@ bool InterleavedAccessImpl::replaceBinOpShuffles(
       return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements();
     }));
 
+    BasicBlock::iterator insertPos = SVI->getIterator();
     auto *NewSVI1 =
         new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty),
-                              Mask, SVI->getName(), SVI);
+                              Mask, SVI->getName(), insertPos);
     auto *NewSVI2 = new ShuffleVectorInst(
         BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
-        SVI->getName(), SVI);
+        SVI->getName(), insertPos);
     BinaryOperator *NewBI = BinaryOperator::CreateWithCopiedFlags(
-        BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), SVI);
+        BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), insertPos);
     SVI->replaceAllUsesWith(NewBI);
     LLVM_DEBUG(dbgs() << "  Replaced: " << *BI << "\n    And   : " << *SVI
                       << "\n  With    : " << *NewSVI1 << "\n    And   : "
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index fe450cba4a333..09d282d12c5fb 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -472,7 +472,7 @@ bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) {
   Function *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty);
 
   Value *Op = CI->getArgOperand(0);
-  Op = CallInst::Create(Int, Op, CI->getName(), CI);
+  Op = CallInst::Create(Int, Op, CI->getName(), CI->getIterator());
 
   CI->replaceAllUsesWith(Op);
   CI->eraseFromParent();
diff --git a/llvm/lib/CodeGen/JMCInstrumenter.cpp b/llvm/lib/CodeGen/JMCInstrumenter.cpp
index 62a3819188752..e2aaebedf5a4f 100644
--- a/llvm/lib/CodeGen/JMCInstrumenter.cpp
+++ b/llvm/lib/CodeGen/JMCInstrumenter.cpp
@@ -227,7 +227,7 @@ bool runImpl(Module &M) {
     // FIXME: it would be nice to make CI scheduling boundary, although in
     //        practice it does not matter much.
     auto *CI = CallInst::Create(getCheckFunctionType(Ctx), CheckFunction,
-                                {Flag}, "", &*F.begin()->getFirstInsertionPt());
+                                {Flag}, "", F.begin()->getFirstInsertionPt());
     CI->addParamAttr(0, Attribute::NoUndef);
     if (UseX86FastCall) {
       CI->setCallingConv(CallingConv::X86_FastCall);
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index cfc8c28b99e56..481d9e341da37 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -1356,10 +1356,11 @@ InstrRefBasedLDV::findLocationForMemOperand(const MachineInstr &MI) {
   // from the stack at some point. Happily the memory operand will tell us
   // the size written to the stack.
   auto *MemOperand = *MI.memoperands_begin();
-  unsigned SizeInBits = MemOperand->getSizeInBits();
+  LocationSize SizeInBits = MemOperand->getSizeInBits();
+  assert(SizeInBits.hasValue() && "Expected to find a valid size!");
 
   // Find that position in the stack indexes we're tracking.
-  auto IdxIt = MTracker->StackSlotIdxes.find({SizeInBits, 0});
+  auto IdxIt = MTracker->StackSlotIdxes.find({SizeInBits.getValue(), 0});
   if (IdxIt == MTracker->StackSlotIdxes.end())
     // That index is not tracked. This is suprising, and unlikely to ever
     // occur, but the safe action is to indicate the variable is optimised out.
diff --git a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
index bc2ea3f05b6da..5caf20add2a11 100644
--- a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
+++ b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
@@ -39,6 +39,9 @@ LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
     return LLT::scalar(SizeInBits);
   }
 
+  if (Ty.isTokenTy())
+    return LLT::token();
+
   return LLT();
 }
 
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index d41fc97cb8060..691c60d22724f 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1919,10 +1919,13 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
 
   if (Token.range().front() == 's') {
     auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
-    if (!verifyScalarSize(ScalarSize))
-      return error("invalid size for scalar type");
-
-    Ty = LLT::scalar(ScalarSize);
+    if (ScalarSize) {
+      if (!verifyScalarSize(ScalarSize))
+        return error("invalid size for scalar type");
+      Ty = LLT::scalar(ScalarSize);
+    } else {
+      Ty = LLT::token();
+    }
     lex();
     return false;
   } else if (Token.range().front() == 'p') {
@@ -1980,7 +1983,7 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
   if (Token.range().front() == 's') {
     auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
     if (!verifyScalarSize(ScalarSize))
-      return error("invalid size for scalar type");
+      return error("invalid size for scalar element in vector");
     Ty = LLT::scalar(ScalarSize);
   } else if (Token.range().front() == 'p') {
     const DataLayout &DL = MF.getDataLayout();
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 4ed44d1c06f48..8efe67a9a72be 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -982,7 +982,7 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V,
 }
 
 void llvm::printMIR(raw_ostream &OS, const Module &M) {
-  // RemoveDIs: as there's no textual form for DPValues yet, print debug-info
+  // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
   // in dbg.value format.
   bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
   if (IsNewDbgInfoFormat)
@@ -996,7 +996,7 @@ void llvm::printMIR(raw_ostream &OS, const Module &M) {
 }
 
 void llvm::printMIR(raw_ostream &OS, const MachineFunction &MF) {
-  // RemoveDIs: as there's no textual form for DPValues yet, print debug-info
+  // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
   // in dbg.value format.
   bool IsNewDbgInfoFormat = MF.getFunction().IsNewDbgInfoFormat;
   if (IsNewDbgInfoFormat)
diff --git a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
index 812d57984e6ca..ccfc4565d3a9b 100644
--- a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
+++ b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
@@ -123,7 +123,7 @@ std::string VRegRenamer::getInstructionOpcodeHash(MachineInstr &MI) {
   llvm::transform(MI.uses(), std::back_inserter(MIOperands), GetHashableMO);
 
   for (const auto *Op : MI.memoperands()) {
-    MIOperands.push_back((unsigned)Op->getSize());
+    MIOperands.push_back((unsigned)Op->getSize().getValue());
     MIOperands.push_back((unsigned)Op->getFlags());
     MIOperands.push_back((unsigned)Op->getOffset());
     MIOperands.push_back((unsigned)Op->getSuccessOrdering());
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index c65937935ed82..a4c87a7678bd8 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -155,9 +155,6 @@ MachineCombiner::getOperandDef(const MachineOperand &MO) {
   // We need a virtual register definition.
   if (MO.isReg() && MO.getReg().isVirtual())
     DefInstr = MRI->getUniqueVRegDef(MO.getReg());
-  // PHI's have no depth etc.
-  if (DefInstr && DefInstr->isPHI())
-    DefInstr = nullptr;
   return DefInstr;
 }
 
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index 280d3a6a41edc..853de4c88caeb 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -184,7 +184,8 @@ uint64_t MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
   return alignTo(Offset, StackAlign);
 }
 
-void MachineFrameInfo::computeMaxCallFrameSize(const MachineFunction &MF) {
+void MachineFrameInfo::computeMaxCallFrameSize(
+    MachineFunction &MF, std::vector<MachineBasicBlock::iterator> *FrameSDOps) {
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
   unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
@@ -192,18 +193,14 @@ void MachineFrameInfo::computeMaxCallFrameSize(const MachineFunction &MF) {
          "Can only compute MaxCallFrameSize if Setup/Destroy opcode are known");
 
   MaxCallFrameSize = 0;
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
       unsigned Opcode = MI.getOpcode();
       if (Opcode == FrameSetupOpcode || Opcode == FrameDestroyOpcode) {
         unsigned Size = TII.getFrameSize(MI);
         MaxCallFrameSize = std::max(MaxCallFrameSize, Size);
-        AdjustsStack = true;
-      } else if (MI.isInlineAsm()) {
-        // Some inline asm's need a stack frame, as indicated by operand 1.
-        unsigned ExtraInfo = MI.getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
-        if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
-          AdjustsStack = true;
+        if (FrameSDOps != nullptr)
+          FrameSDOps->push_back(&MI);
       }
     }
   }
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 323f1a6a1b2bd..ad53214992667 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -484,13 +484,17 @@ void MachineFunction::deleteMachineBasicBlock(MachineBasicBlock *MBB) {
 }
 
 MachineMemOperand *MachineFunction::getMachineMemOperand(
-    MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
-    Align base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
+    MachinePointerInfo PtrInfo, MachineMemOperand::Flags F, LocationSize Size,
+    Align BaseAlignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
     SyncScope::ID SSID, AtomicOrdering Ordering,
     AtomicOrdering FailureOrdering) {
+  assert((!Size.hasValue() ||
+          Size.getValue().getKnownMinValue() != ~UINT64_C(0)) &&
+         "Unexpected an unknown size to be represented using "
+         "LocationSize::beforeOrAfter()");
   return new (Allocator)
-      MachineMemOperand(PtrInfo, f, s, base_alignment, AAInfo, Ranges,
-                        SSID, Ordering, FailureOrdering);
+      MachineMemOperand(PtrInfo, F, Size, BaseAlignment, AAInfo, Ranges, SSID,
+                        Ordering, FailureOrdering);
 }
 
 MachineMemOperand *MachineFunction::getMachineMemOperand(
@@ -503,8 +507,14 @@ MachineMemOperand *MachineFunction::getMachineMemOperand(
                         Ordering, FailureOrdering);
 }
 
-MachineMemOperand *MachineFunction::getMachineMemOperand(
-    const MachineMemOperand *MMO, const MachinePointerInfo &PtrInfo, uint64_t Size) {
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
+                                      const MachinePointerInfo &PtrInfo,
+                                      LocationSize Size) {
+  assert((!Size.hasValue() ||
+          Size.getValue().getKnownMinValue() != ~UINT64_C(0)) &&
+         "Unexpected an unknown size to be represented using "
+         "LocationSize::beforeOrAfter()");
   return new (Allocator)
       MachineMemOperand(PtrInfo, MMO->getFlags(), Size, MMO->getBaseAlign(),
                         AAMDNodes(), nullptr, MMO->getSyncScopeID(),
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 6654e1d6cecee..fe2f9ccd33a33 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1302,10 +1302,10 @@ static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, AAResults *AA,
   int64_t OffsetB = MMOb->getOffset();
   int64_t MinOffset = std::min(OffsetA, OffsetB);
 
-  uint64_t WidthA = MMOa->getSize();
-  uint64_t WidthB = MMOb->getSize();
-  bool KnownWidthA = WidthA != MemoryLocation::UnknownSize;
-  bool KnownWidthB = WidthB != MemoryLocation::UnknownSize;
+  LocationSize WidthA = MMOa->getSize();
+  LocationSize WidthB = MMOb->getSize();
+  bool KnownWidthA = WidthA.hasValue();
+  bool KnownWidthB = WidthB.hasValue();
 
   const Value *ValA = MMOa->getValue();
   const Value *ValB = MMOb->getValue();
@@ -1325,8 +1325,8 @@ static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, AAResults *AA,
     if (!KnownWidthA || !KnownWidthB)
       return true;
     int64_t MaxOffset = std::max(OffsetA, OffsetB);
-    int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
-    return (MinOffset + LowWidth > MaxOffset);
+    LocationSize LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
+    return (MinOffset + (int)LowWidth.getValue() > MaxOffset);
   }
 
   if (!AA)
@@ -1338,10 +1338,10 @@ static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, AAResults *AA,
   assert((OffsetA >= 0) && "Negative MachineMemOperand offset");
   assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
 
-  int64_t OverlapA =
-      KnownWidthA ? WidthA + OffsetA - MinOffset : MemoryLocation::UnknownSize;
-  int64_t OverlapB =
-      KnownWidthB ? WidthB + OffsetB - MinOffset : MemoryLocation::UnknownSize;
+  int64_t OverlapA = KnownWidthA ? WidthA.getValue() + OffsetA - MinOffset
+                                 : MemoryLocation::UnknownSize;
+  int64_t OverlapB = KnownWidthB ? WidthB.getValue() + OffsetB - MinOffset
+                                 : MemoryLocation::UnknownSize;
 
   return !AA->isNoAlias(
       MemoryLocation(ValA, OverlapA, UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
@@ -2357,15 +2357,16 @@ using MMOList = SmallVector<const MachineMemOperand *, 2>;
 static LocationSize getSpillSlotSize(const MMOList &Accesses,
                                      const MachineFrameInfo &MFI) {
   uint64_t Size = 0;
-  for (const auto *A : Accesses)
+  for (const auto *A : Accesses) {
     if (MFI.isSpillSlotObjectIndex(
             cast<FixedStackPseudoSourceValue>(A->getPseudoValue())
                 ->getFrameIndex())) {
-      uint64_t S = A->getSize();
-      if (S == ~UINT64_C(0))
+      LocationSize S = A->getSize();
+      if (!S.hasValue())
         return LocationSize::beforeOrAfterPointer();
-      Size += S;
+      Size += S.getValue();
     }
+  }
   return Size;
 }
 
@@ -2374,10 +2375,8 @@ MachineInstr::getSpillSize(const TargetInstrInfo *TII) const {
   int FI;
   if (TII->isStoreToStackSlotPostFE(*this, FI)) {
     const MachineFrameInfo &MFI = getMF()->getFrameInfo();
-    if (MFI.isSpillSlotObjectIndex(FI)) {
-      uint64_t Size = (*memoperands_begin())->getSize();
-      return Size == ~UINT64_C(0) ? LocationSize::beforeOrAfterPointer() : Size;
-    }
+    if (MFI.isSpillSlotObjectIndex(FI))
+      return (*memoperands_begin())->getSize();
   }
   return std::nullopt;
 }
@@ -2395,10 +2394,8 @@ MachineInstr::getRestoreSize(const TargetInstrInfo *TII) const {
   int FI;
   if (TII->isLoadFromStackSlotPostFE(*this, FI)) {
     const MachineFrameInfo &MFI = getMF()->getFrameInfo();
-    if (MFI.isSpillSlotObjectIndex(FI)) {
-      uint64_t Size = (*memoperands_begin())->getSize();
-      return Size == ~UINT64_C(0) ? LocationSize::beforeOrAfterPointer() : Size;
-    }
+    if (MFI.isSpillSlotObjectIndex(FI))
+      return (*memoperands_begin())->getSize();
   }
   return std::nullopt;
 }
diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
index 1492c8c366fb4..1019c53e57c6f 100644
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -198,6 +198,23 @@ MDNode *MachineLoop::getLoopID() const {
   return LoopID;
 }
 
+bool MachineLoop::isLoopInvariantImplicitPhysReg(Register Reg) const {
+  MachineFunction *MF = getHeader()->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+  if (MRI->isConstantPhysReg(Reg))
+    return true;
+
+  if (!MF->getSubtarget()
+           .getRegisterInfo()
+           ->shouldAnalyzePhysregInMachineLoopInfo(Reg))
+    return false;
+
+  return !llvm::any_of(
+      MRI->def_instructions(Reg),
+      [this](const MachineInstr &MI) { return this->contains(&MI); });
+}
+
 bool MachineLoop::isLoopInvariant(MachineInstr &I,
                                   const Register ExcludeReg) const {
   MachineFunction *MF = I.getParent()->getParent();
@@ -226,7 +243,7 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I,
         // it could get allocated to something with a def during allocation.
         // However, if the physreg is known to always be caller saved/restored
         // then this use is safe to hoist.
-        if (!MRI->isConstantPhysReg(Reg) &&
+        if (!isLoopInvariantImplicitPhysReg(Reg) &&
             !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())) &&
             !TII->isIgnorableUse(MO))
           return false;
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index c7c0a1c20d57f..937ca539513af 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1101,24 +1101,26 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
   assert(getFailureOrdering() == FailureOrdering && "Value truncated");
 }
 
-MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
-                                     uint64_t s, Align a,
+MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags F,
+                                     LocationSize TS, Align BaseAlignment,
                                      const AAMDNodes &AAInfo,
                                      const MDNode *Ranges, SyncScope::ID SSID,
                                      AtomicOrdering Ordering,
                                      AtomicOrdering FailureOrdering)
-    : MachineMemOperand(ptrinfo, f,
-                        s == ~UINT64_C(0) ? LLT() : LLT::scalar(8 * s), a,
-                        AAInfo, Ranges, SSID, Ordering, FailureOrdering) {}
+    : MachineMemOperand(ptrinfo, F,
+                        !TS.hasValue() || TS.isScalable()
+                            ? LLT()
+                            : LLT::scalar(8 * TS.getValue().getKnownMinValue()),
+                        BaseAlignment, AAInfo, Ranges, SSID, Ordering,
+                        FailureOrdering) {}
 
 void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
   // The Value and Offset may differ due to CSE. But the flags and size
   // should be the same.
   assert(MMO->getFlags() == getFlags() && "Flags mismatch!");
-  assert((MMO->getSize() == ~UINT64_C(0) || getSize() == ~UINT64_C(0) ||
+  assert((!MMO->getSize().hasValue() || !getSize().hasValue() ||
           MMO->getSize() == getSize()) &&
          "Size mismatch!");
-
   if (MMO->getBaseAlign() >= getBaseAlign()) {
     // Update the alignment value.
     BaseAlign = MMO->getBaseAlign();
@@ -1240,7 +1242,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
        << "unknown-address";
   }
   MachineOperand::printOperandOffset(OS, getOffset());
-  if (getSize() > 0 && getAlign() != getSize())
+  if (!getSize().hasValue() ||
+      getAlign() != getSize().getValue().getKnownMinValue())
     OS << ", align " << getAlign().value();
   if (getAlign() != getBaseAlign())
     OS << ", basealign " << getBaseAlign().value();
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index d8cb681688339..eb42a78603d40 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -2732,19 +2732,20 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
   if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
     return true;
 
-  uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
-  uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
+  LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
+  LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
 
   // This is the main test, which checks the offset values and the loop
   // increment value to determine if the accesses may be loop carried.
-  if (AccessSizeS == MemoryLocation::UnknownSize ||
-      AccessSizeD == MemoryLocation::UnknownSize)
+  if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
     return true;
 
-  if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
+  if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
+      DeltaD < AccessSizeD.getValue())
     return true;
 
-  return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
+  return (OffsetS + (int64_t)AccessSizeS.getValue() <
+          OffsetD + (int64_t)AccessSizeD.getValue());
 }
 
 void SwingSchedulerDAG::postProcessDAG() {
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 1cd90474898e7..5abfbd5981fba 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -200,7 +200,7 @@ stable_hash llvm::stableHashValue(const MachineInstr &MI, bool HashVRegs,
   for (const auto *Op : MI.memoperands()) {
     if (!HashMemOperands)
       break;
-    HashComponents.push_back(static_cast<unsigned>(Op->getSize()));
+    HashComponents.push_back(static_cast<unsigned>(Op->getSize().getValue()));
     HashComponents.push_back(static_cast<unsigned>(Op->getFlags()));
     HashComponents.push_back(static_cast<unsigned>(Op->getOffset()));
     HashComponents.push_back(static_cast<unsigned>(Op->getSuccessOrdering()));
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index c2d6dd35e1cb2..c69d36fc7fdd6 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1195,13 +1195,16 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
       const MachineMemOperand &MMO = **MI->memoperands_begin();
       if (MI->getOpcode() == TargetOpcode::G_ZEXTLOAD ||
           MI->getOpcode() == TargetOpcode::G_SEXTLOAD) {
-        if (MMO.getSizeInBits() >= ValTy.getSizeInBits())
+        if (TypeSize::isKnownGE(MMO.getSizeInBits().getValue(),
+                                ValTy.getSizeInBits()))
           report("Generic extload must have a narrower memory type", MI);
       } else if (MI->getOpcode() == TargetOpcode::G_LOAD) {
-        if (MMO.getSize() > ValTy.getSizeInBytes())
+        if (TypeSize::isKnownGT(MMO.getSize().getValue(),
+                                ValTy.getSizeInBytes()))
           report("load memory size cannot exceed result size", MI);
       } else if (MI->getOpcode() == TargetOpcode::G_STORE) {
-        if (ValTy.getSizeInBytes() < MMO.getSize())
+        if (TypeSize::isKnownLT(ValTy.getSizeInBytes(),
+                                MMO.getSize().getValue()))
           report("store memory size cannot exceed value size", MI);
       }
 
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0bef513342ff1..bdae94c4e6f88 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -979,8 +979,8 @@ void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
       NewMMOs.push_back(
           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
     } else {
-      NewMMOs.push_back(
-          MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
+      NewMMOs.push_back(MF.getMachineMemOperand(
+          MMO, 0, LocationSize::beforeOrAfterPointer()));
     }
   }
   NewMI.setMemRefs(MF, NewMMOs);
diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
index e8391afb8e3f8..26857c6a40889 100644
--- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
+++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
@@ -12,8 +12,6 @@
 namespace llvm {
 
 DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) {
-  if (Translator)
-    S = Translator(S);
   auto I = Strings.insert({S, DwarfStringPoolEntry()});
   auto &Entry = I.first->second;
   if (I.second || !Entry.isIndexed()) {
@@ -28,9 +26,6 @@ DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) {
 StringRef NonRelocatableStringpool::internString(StringRef S) {
   DwarfStringPoolEntry Entry{nullptr, 0, DwarfStringPoolEntry::NotIndexed};
 
-  if (Translator)
-    S = Translator(S);
-
   auto InsertResult = Strings.insert({S, Entry});
   return InsertResult.first->getKey();
 }
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 8af17e63e25c7..eaf96ec5cbde8 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -228,9 +228,8 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
   FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(MF);
   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
 
-  // Calculate the MaxCallFrameSize and AdjustsStack variables for the
-  // function's frame information. Also eliminates call frame pseudo
-  // instructions.
+  // Calculate the MaxCallFrameSize value for the function's frame
+  // information. Also eliminates call frame pseudo instructions.
   calculateCallFrameInfo(MF);
 
   // Determine placement of CSR spill/restore code and prolog/epilog code:
@@ -350,17 +349,13 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
-/// Calculate the MaxCallFrameSize and AdjustsStack
-/// variables for the function's frame information and eliminate call frame
-/// pseudo instructions.
+/// Calculate the MaxCallFrameSize variable for the function's frame
+/// information and eliminate call frame pseudo instructions.
 void PEI::calculateCallFrameInfo(MachineFunction &MF) {
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  unsigned MaxCallFrameSize = 0;
-  bool AdjustsStack = MFI.adjustsStack();
-
   // Get the function call frame set-up and tear-down instruction opcode
   unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
   unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
@@ -370,26 +365,15 @@ void PEI::calculateCallFrameInfo(MachineFunction &MF) {
   if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
     return;
 
+  // (Re-)Compute the MaxCallFrameSize.
+  [[maybe_unused]] uint32_t MaxCFSIn =
+      MFI.isMaxCallFrameSizeComputed() ? MFI.getMaxCallFrameSize() : UINT32_MAX;
   std::vector<MachineBasicBlock::iterator> FrameSDOps;
-  for (MachineBasicBlock &BB : MF)
-    for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I)
-      if (TII.isFrameInstr(*I)) {
-        unsigned Size = TII.getFrameSize(*I);
-        if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
-        AdjustsStack = true;
-        FrameSDOps.push_back(I);
-      } else if (I->isInlineAsm()) {
-        // Some inline asm's need a stack frame, as indicated by operand 1.
-        unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
-        if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
-          AdjustsStack = true;
-      }
-
-  assert(!MFI.isMaxCallFrameSizeComputed() ||
-         (MFI.getMaxCallFrameSize() >= MaxCallFrameSize &&
-          !(AdjustsStack && !MFI.adjustsStack())));
-  MFI.setAdjustsStack(AdjustsStack);
-  MFI.setMaxCallFrameSize(MaxCallFrameSize);
+  MFI.computeMaxCallFrameSize(MF, &FrameSDOps);
+  assert(MFI.getMaxCallFrameSize() <= MaxCFSIn &&
+         "Recomputing MaxCFS gave a larger value.");
+  assert((FrameSDOps.empty() || MF.getFrameInfo().adjustsStack()) &&
+         "AdjustsStack not set in presence of a frame pseudo instruction.");
 
   if (TFI->canSimplifyCallFramePseudos(MF)) {
     // If call frames are not being included as part of the stack frame, and
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 40898d284a09a..2e03ae6aec94d 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -624,8 +624,8 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
     // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With
     // RemoveDIs turned on, SplitPt would instead point to the next
     // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs,
-    // tell splitBasicBlock that we want to include any DPValues attached to
-    // SplitPt in the splice.
+    // tell splitBasicBlock that we want to include any DbgVariableRecords
+    // attached to SplitPt in the splice.
     SplitPt.setHeadBit(true);
     BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
     BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
@@ -645,12 +645,13 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
       DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
     }
 
-    // Duplicate implementation for DPValues, the non-instruction debug-info
-    // record. Helper lambda for moving DPValues to the end block.
-    auto TransferDPValues = [&](Instruction &I) {
-      for (auto &DPValue : llvm::make_early_inc_range(I.getDbgRecordRange())) {
-        DPValue.removeFromParent();
-        EndBlock->insertDbgRecordBefore(&DPValue,
+    // Duplicate implementation for DbgRecords, the non-instruction debug-info
+    // format. Helper lambda for moving DbgRecords to the end block.
+    auto TransferDbgRecords = [&](Instruction &I) {
+      for (auto &DbgRecord :
+           llvm::make_early_inc_range(I.getDbgRecordRange())) {
+        DbgRecord.removeFromParent();
+        EndBlock->insertDbgRecordBefore(&DbgRecord,
                                         EndBlock->getFirstInsertionPt());
       }
     };
@@ -660,7 +661,7 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
     // middle" of the select group.
     auto R = make_range(std::next(SI.getI()->getIterator()),
                         std::next(LastSI.getI()->getIterator()));
-    llvm::for_each(R, TransferDPValues);
+    llvm::for_each(R, TransferDbgRecords);
 
     // These are the new basic blocks for the conditional branch.
     // At least one will become an actual new basic block.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 40b078a201ace..f199625bf67ad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -597,8 +597,8 @@ namespace {
     SDValue foldSextSetcc(SDNode *N);
     SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                               const SDLoc &DL);
-    SDValue foldSubToUSubSat(EVT DstVT, SDNode *N);
-    SDValue foldABSToABD(SDNode *N);
+    SDValue foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL);
+    SDValue foldABSToABD(SDNode *N, const SDLoc &DL);
     SDValue unfoldMaskedMerge(SDNode *N);
     SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
     SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
@@ -2529,6 +2529,23 @@ static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
 }
 
+// Attempt to form avgceilu(A, B) from (A | B) - ((A ^ B) >> 1)
+static SDValue combineFixedwidthToAVGCEILU(SDNode *N, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N0.getValueType();
+  SDLoc DL(N);
+  if (TLI.isOperationLegal(ISD::AVGCEILU, VT)) {
+    SDValue A, B;
+    if (sd_match(N, m_Sub(m_Or(m_Value(A), m_Value(B)),
+                          m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)),
+                                m_SpecificInt(1))))) {
+      return DAG.getNode(ISD::AVGCEILU, DL, VT, A, B);
+    }
+  }
+  return SDValue();
+}
+
 /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
 /// a shift and add with a different constant.
 static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
@@ -2703,11 +2720,11 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
   SDValue A, B, C;
 
   // fold ((0-A) + B) -> B-A
-  if (sd_match(N0, m_Sub(m_Zero(), m_Value(A))))
+  if (sd_match(N0, m_Neg(m_Value(A))))
     return DAG.getNode(ISD::SUB, DL, VT, N1, A);
 
   // fold (A + (0-B)) -> A-B
-  if (sd_match(N1, m_Sub(m_Zero(), m_Value(B))))
+  if (sd_match(N1, m_Neg(m_Value(B))))
     return DAG.getNode(ISD::SUB, DL, VT, N0, B);
 
   // fold (A+(B-A)) -> B
@@ -2820,6 +2837,23 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
   return SDValue();
 }
 
+// Attempt to form avgflooru(A, B) from (A & B) + ((A ^ B) >> 1)
+static SDValue combineFixedwidthToAVGFLOORU(SDNode *N, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N0.getValueType();
+  SDLoc DL(N);
+  if (TLI.isOperationLegal(ISD::AVGFLOORU, VT)) {
+    SDValue A, B;
+    if (sd_match(N, m_Add(m_And(m_Value(A), m_Value(B)),
+                          m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)),
+                                m_SpecificInt(1))))) {
+      return DAG.getNode(ISD::AVGFLOORU, DL, VT, A, B);
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -2835,6 +2869,10 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
+  // Try to match AVGFLOORU fixedwidth pattern
+  if (SDValue V = combineFixedwidthToAVGFLOORU(N, DAG))
+    return V;
+
   // fold (a+b) -> (a|b) iff a and b share no bits.
   if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
       DAG.haveNoCommonBitsSet(N0, N1))
@@ -3596,7 +3634,7 @@ static SDValue getTruncatedUSUBSAT(EVT DstVT, EVT SrcVT, SDValue LHS,
 
 // Try to find umax(a,b) - b or a - umin(a,b) patterns that may be converted to
 // usubsat(a,b), optionally as a truncated type.
-SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
+SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL) {
   if (N->getOpcode() != ISD::SUB ||
       !(!LegalOperations || hasOperation(ISD::USUBSAT, DstVT)))
     return SDValue();
@@ -3611,18 +3649,18 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
     SDValue MaxLHS = Op0.getOperand(0);
     SDValue MaxRHS = Op0.getOperand(1);
     if (MaxLHS == Op1)
-      return getTruncatedUSUBSAT(DstVT, SubVT, MaxRHS, Op1, DAG, SDLoc(N));
+      return getTruncatedUSUBSAT(DstVT, SubVT, MaxRHS, Op1, DAG, DL);
     if (MaxRHS == Op1)
-      return getTruncatedUSUBSAT(DstVT, SubVT, MaxLHS, Op1, DAG, SDLoc(N));
+      return getTruncatedUSUBSAT(DstVT, SubVT, MaxLHS, Op1, DAG, DL);
   }
 
   if (Op1.getOpcode() == ISD::UMIN && Op1.hasOneUse()) {
     SDValue MinLHS = Op1.getOperand(0);
     SDValue MinRHS = Op1.getOperand(1);
     if (MinLHS == Op0)
-      return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinRHS, DAG, SDLoc(N));
+      return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinRHS, DAG, DL);
     if (MinRHS == Op0)
-      return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinLHS, DAG, SDLoc(N));
+      return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinLHS, DAG, DL);
   }
 
   // sub(a,trunc(umin(zext(a),b))) -> usubsat(a,trunc(umin(b,SatLimit)))
@@ -3633,10 +3671,10 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
     SDValue MinRHS = Op1.getOperand(0).getOperand(1);
     if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && MinLHS.getOperand(0) == Op0)
       return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinLHS, MinRHS,
-                                 DAG, SDLoc(N));
+                                 DAG, DL);
     if (MinRHS.getOpcode() == ISD::ZERO_EXTEND && MinRHS.getOperand(0) == Op0)
       return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinRHS, MinLHS,
-                                 DAG, SDLoc(N));
+                                 DAG, DL);
   }
 
   return SDValue();
@@ -3812,7 +3850,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
     return DAG.getNode(ISD::AND, DL, VT, N0, DAG.getNOT(DL, B, VT));
 
   // fold (A - (-B * C)) -> (A + (B * C))
-  if (sd_match(N1, m_OneUse(m_Mul(m_Sub(m_Zero(), m_Value(B)), m_Value(C)))))
+  if (sd_match(N1, m_OneUse(m_Mul(m_Neg(m_Value(B)), m_Value(C)))))
     return DAG.getNode(ISD::ADD, DL, VT, N0,
                        DAG.getNode(ISD::MUL, DL, VT, B, C));
 
@@ -3828,10 +3866,14 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
+  // Try to match AVGCEILU fixedwidth pattern
+  if (SDValue V = combineFixedwidthToAVGCEILU(N, DAG))
+    return V;
+
   if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
     return V;
 
-  if (SDValue V = foldSubToUSubSat(VT, N))
+  if (SDValue V = foldSubToUSubSat(VT, N, DL))
     return V;
 
   // (A - B) - 1  ->  add (xor B, -1), A
@@ -6655,35 +6697,25 @@ static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
 
 /// For targets that support usubsat, match a bit-hack form of that operation
 /// that ends in 'and' and convert it.
-static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  EVT VT = N1.getValueType();
-
-  // Canonicalize SRA as operand 1.
-  if (N0.getOpcode() == ISD::SRA)
-    std::swap(N0, N1);
-
-  // xor/add with SMIN (signmask) are logically equivalent.
-  if (N0.getOpcode() != ISD::XOR && N0.getOpcode() != ISD::ADD)
-    return SDValue();
-
-  if (N1.getOpcode() != ISD::SRA || !N0.hasOneUse() || !N1.hasOneUse() ||
-      N0.getOperand(0) != N1.getOperand(0))
-    return SDValue();
-
+static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG, const SDLoc &DL) {
+  EVT VT = N->getValueType(0);
   unsigned BitWidth = VT.getScalarSizeInBits();
-  ConstantSDNode *XorC = isConstOrConstSplat(N0.getOperand(1), true);
-  ConstantSDNode *SraC = isConstOrConstSplat(N1.getOperand(1), true);
-  if (!XorC || !XorC->getAPIntValue().isSignMask() ||
-      !SraC || SraC->getAPIntValue() != BitWidth - 1)
-    return SDValue();
+  APInt SignMask = APInt::getSignMask(BitWidth);
 
   // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
   // (i8 X + 128) & (i8 X s>> 7) --> usubsat X, 128
-  SDLoc DL(N);
-  SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT);
-  return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask);
+  // xor/add with SMIN (signmask) are logically equivalent.
+  SDValue X;
+  if (!sd_match(N, m_And(m_OneUse(m_Xor(m_Value(X), m_SpecificInt(SignMask))),
+                         m_OneUse(m_Sra(m_Deferred(X),
+                                        m_SpecificInt(BitWidth - 1))))) &&
+      !sd_match(N, m_And(m_OneUse(m_Add(m_Value(X), m_SpecificInt(SignMask))),
+                         m_OneUse(m_Sra(m_Deferred(X),
+                                        m_SpecificInt(BitWidth - 1))))))
+    return SDValue();
+
+  return DAG.getNode(ISD::USUBSAT, DL, VT, X,
+                     DAG.getConstant(SignMask, DL, VT));
 }
 
 /// Given a bitwise logic operation N with a matching bitwise logic operand,
@@ -6773,34 +6805,34 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N1.getValueType();
+  SDLoc DL(N);
 
   // x & x --> x
   if (N0 == N1)
     return N0;
 
   // fold (and c1, c2) -> c1&c2
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, DL, VT, {N0, N1}))
     return C;
 
   // canonicalize constant to RHS
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
       !DAG.isConstantIntBuildVectorOrConstantInt(N1))
-    return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
+    return DAG.getNode(ISD::AND, DL, VT, N1, N0);
 
   if (areBitwiseNotOfEachother(N0, N1))
-    return DAG.getConstant(APInt::getZero(VT.getScalarSizeInBits()), SDLoc(N),
-                           VT);
+    return DAG.getConstant(APInt::getZero(VT.getScalarSizeInBits()), DL, VT);
 
   // fold vector ops
   if (VT.isVector()) {
-    if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
       return FoldedVOp;
 
     // fold (and x, 0) -> 0, vector edition
     if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
       // do not return N1, because undef node may exist in N1
-      return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()),
-                             SDLoc(N), N1.getValueType());
+      return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()), DL,
+                             N1.getValueType());
 
     // fold (and x, -1) -> x, vector edition
     if (ISD::isConstantSplatVectorAllOnes(N1.getNode()))
@@ -6820,8 +6852,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         uint64_t ElementSize =
             LoadVT.getVectorElementType().getScalarSizeInBits();
         if (Splat->getAPIntValue().isMask(ElementSize)) {
-          auto NewLoad = DAG.getMaskedLoad(
-              ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(),
+          SDValue NewLoad = DAG.getMaskedLoad(
+              ExtVT, DL, MLoad->getChain(), MLoad->getBasePtr(),
               MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(),
               LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(),
               ISD::ZEXTLOAD, MLoad->isExpandingLoad());
@@ -6843,7 +6875,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   unsigned BitWidth = VT.getScalarSizeInBits();
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth)))
-    return DAG.getConstant(0, SDLoc(N), VT);
+    return DAG.getConstant(0, DL, VT);
 
   if (SDValue R = foldAndOrOfSETCC(N, DAG))
     return R;
@@ -6852,12 +6884,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     return NewSel;
 
   // reassociate and
-  if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue RAND = reassociateOps(ISD::AND, DL, N0, N1, N->getFlags()))
     return RAND;
 
   // Fold and(vecreduce(x), vecreduce(y)) -> vecreduce(and(x, y))
-  if (SDValue SD = reassociateReduction(ISD::VECREDUCE_AND, ISD::AND, SDLoc(N),
-                                        VT, N0, N1))
+  if (SDValue SD =
+          reassociateReduction(ISD::VECREDUCE_AND, ISD::AND, DL, VT, N0, N1))
     return SD;
 
   // fold (and (or x, C), D) -> D if (C & D) == D
@@ -6877,18 +6909,16 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
 
     // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
     if (DAG.MaskedValueIsZero(N0Op0, Mask))
-      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0Op0);
+      return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0Op0);
 
     // fold (and (any_ext V), c) -> (zero_ext (and (trunc V), c)) if profitable.
     if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
         TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
         TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
-        TLI.isNarrowingProfitable(VT, SrcVT)) {
-      SDLoc DL(N);
+        TLI.isNarrowingProfitable(VT, SrcVT))
       return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
                          DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
                                      DAG.getZExtOrTrunc(N1, DL, SrcVT)));
-    }
   }
 
   // fold (and (ext (and V, c1)), c2) -> (and (ext V), (and c1, (ext c2)))
@@ -6900,7 +6930,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
         DAG.isConstantIntBuildVectorOrConstantInt(N0Op0.getOperand(1)) &&
         N0->hasOneUse() && N0Op0->hasOneUse()) {
-      SDLoc DL(N);
       SDValue NewMask =
           DAG.getNode(ISD::AND, DL, VT, N1,
                       DAG.getNode(ExtOpc, DL, VT, N0Op0.getOperand(1)));
@@ -6921,8 +6950,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
        N0.getOperand(0).getOpcode() == ISD::LOAD &&
        N0.getOperand(0).getResNo() == 0) ||
       (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
-    LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
-                                         N0 : N0.getOperand(0) );
+    auto *Load =
+        cast<LoadSDNode>((N0.getOpcode() == ISD::LOAD) ? N0 : N0.getOperand(0));
 
     // Get the constant (if applicable) the zero'th operand is being ANDed with.
     // This can be a pure constant or a vector splat, in which case we treat the
@@ -7032,9 +7061,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       //    (and (extract_subvector (zext|anyext|sext v) _) iN_mask)
       // => (extract_subvector (iN_zeroext v))
       SDValue ZeroExtExtendee =
-          DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), ExtVT, Extendee);
+          DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Extendee);
 
-      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, ZeroExtExtendee,
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ZeroExtExtendee,
                          N0.getOperand(1));
     }
   }
@@ -7051,8 +7080,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
                        GN0->getBasePtr(), GN0->getIndex(),    GN0->getScale()};
 
       SDValue ZExtLoad = DAG.getMaskedGather(
-          DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
-          GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
+          DAG.getVTList(VT, MVT::Other), MemVT, DL, Ops, GN0->getMemOperand(),
+          GN0->getIndexType(), ISD::ZEXTLOAD);
 
       CombineTo(N, ZExtLoad);
       AddToWorklist(ZExtLoad.getNode());
@@ -7104,7 +7133,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         return SubRHS;
       if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
           SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
-        return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
+        return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, SubRHS.getOperand(0));
     }
   }
 
@@ -7118,7 +7147,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   if (ISD::isUNINDEXEDLoad(N0.getNode()) &&
       (ISD::isEXTLoad(N0.getNode()) ||
        (ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) {
-    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    auto *LN0 = cast<LoadSDNode>(N0);
     EVT MemVT = LN0->getMemoryVT();
     // If we zero all the possible extended bits, then we can turn this into
     // a zextload if we are running before legalize or the operation is legal.
@@ -7173,10 +7202,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
 
   // Replace (and (sign_extend ...) #bitmask) with (zero_extend ...).
   if (IsAndZeroExtMask(N0, N1))
-    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
 
   if (hasOperation(ISD::USUBSAT, VT))
-    if (SDValue V = foldAndToUsubsat(N, DAG))
+    if (SDValue V = foldAndToUsubsat(N, DAG, DL))
       return V;
 
   // Postpone until legalization completed to avoid interference with bswap
@@ -10733,7 +10762,7 @@ SDValue DAGCombiner::visitSHLSAT(SDNode *N) {
 // (ABS (SUB (EXTEND a), (EXTEND b))).
 // (TRUNC (ABS (SUB (EXTEND a), (EXTEND b)))).
 // Generates UABD/SABD instruction.
-SDValue DAGCombiner::foldABSToABD(SDNode *N) {
+SDValue DAGCombiner::foldABSToABD(SDNode *N, const SDLoc &DL) {
   EVT SrcVT = N->getValueType(0);
 
   if (N->getOpcode() == ISD::TRUNCATE)
@@ -10745,7 +10774,6 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N) {
   EVT VT = N->getValueType(0);
   SDValue AbsOp1 = N->getOperand(0);
   SDValue Op0, Op1;
-  SDLoc DL(N);
 
   if (AbsOp1.getOpcode() != ISD::SUB)
     return SDValue();
@@ -10804,9 +10832,10 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N) {
 SDValue DAGCombiner::visitABS(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  SDLoc DL(N);
 
   // fold (abs c1) -> c2
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::ABS, SDLoc(N), VT, {N0}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::ABS, DL, VT, {N0}))
     return C;
   // fold (abs (abs x)) -> (abs x)
   if (N0.getOpcode() == ISD::ABS)
@@ -10815,7 +10844,7 @@ SDValue DAGCombiner::visitABS(SDNode *N) {
   if (DAG.SignBitIsZero(N0))
     return N0;
 
-  if (SDValue ABD = foldABSToABD(N))
+  if (SDValue ABD = foldABSToABD(N, DL))
     return ABD;
 
   // fold (abs (sign_extend_inreg x)) -> (zero_extend (abs (truncate x)))
@@ -10825,7 +10854,6 @@ SDValue DAGCombiner::visitABS(SDNode *N) {
     if (TLI.isTruncateFree(VT, ExtVT) && TLI.isZExtFree(ExtVT, VT) &&
         TLI.isTypeDesirableForOp(ISD::ABS, ExtVT) &&
         hasOperation(ISD::ABS, ExtVT)) {
-      SDLoc DL(N);
       return DAG.getNode(
           ISD::ZERO_EXTEND, DL, VT,
           DAG.getNode(ISD::ABS, DL, ExtVT,
@@ -14621,6 +14649,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   EVT VT = N->getValueType(0);
   EVT SrcVT = N0.getValueType();
   bool isLE = DAG.getDataLayout().isLittleEndian();
+  SDLoc DL(N);
 
   // trunc(undef) = undef
   if (N0.isUndef())
@@ -14628,10 +14657,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
 
   // fold (truncate (truncate x)) -> (truncate x)
   if (N0.getOpcode() == ISD::TRUNCATE)
-    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
 
   // fold (truncate c1) -> c1
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::TRUNCATE, SDLoc(N), VT, {N0}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, VT, {N0}))
     return C;
 
   // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
@@ -14640,10 +14669,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       N0.getOpcode() == ISD::ANY_EXTEND) {
     // if the source is smaller than the dest, we still need an extend.
     if (N0.getOperand(0).getValueType().bitsLT(VT))
-      return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
     // if the source is larger than the dest, than we just need the truncate.
     if (N0.getOperand(0).getValueType().bitsGT(VT))
-      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+      return DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
     // if the source and dest are the same type, we can drop both the extend
     // and the truncate.
     return N0.getOperand(0);
@@ -14657,8 +14686,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     SDValue ExtVal = N0.getOperand(1);
     EVT ExtVT = cast<VTSDNode>(ExtVal)->getVT();
     if (ExtVT.bitsLT(VT) && TLI.preferSextInRegOfTruncate(VT, SrcVT, ExtVT)) {
-      SDValue TrX = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X);
-      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, TrX, ExtVal);
+      SDValue TrX = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, TrX, ExtVal);
     }
   }
 
@@ -14693,8 +14722,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
       int Elt = EltNo->getAsZExtVal();
       int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
-
-      SDLoc DL(N);
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
                          DAG.getBitcast(NVT, N0.getOperand(0)),
                          DAG.getVectorIdxConstant(Index, DL));
@@ -14709,7 +14736,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       SDValue Cond = N0.getOperand(0);
       SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
       SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
-      return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
+      return DAG.getNode(ISD::SELECT, DL, VT, Cond, TruncOp0, TruncOp1);
     }
   }
 
@@ -14721,22 +14748,20 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     KnownBits Known = DAG.computeKnownBits(Amt);
     unsigned Size = VT.getScalarSizeInBits();
     if (Known.countMaxActiveBits() <= Log2_32(Size)) {
-      SDLoc SL(N);
       EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
       if (AmtVT != Amt.getValueType()) {
-        Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
+        Amt = DAG.getZExtOrTrunc(Amt, DL, AmtVT);
         AddToWorklist(Amt.getNode());
       }
-      return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
+      return DAG.getNode(ISD::SHL, DL, VT, Trunc, Amt);
     }
   }
 
-  if (SDValue V = foldSubToUSubSat(VT, N0.getNode()))
+  if (SDValue V = foldSubToUSubSat(VT, N0.getNode(), DL))
     return V;
 
-  if (SDValue ABD = foldABSToABD(N))
+  if (SDValue ABD = foldABSToABD(N, DL))
     return ABD;
 
   // Attempt to pre-truncate BUILD_VECTOR sources.
@@ -14745,7 +14770,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
       // Avoid creating illegal types if running after type legalizer.
       (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
-    SDLoc DL(N);
     EVT SVT = VT.getScalarType();
     SmallVector<SDValue, 8> TruncOps;
     for (const SDValue &Op : N0->op_values()) {
@@ -14759,7 +14783,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   if (N0.getOpcode() == ISD::SPLAT_VECTOR &&
       (!LegalTypes || TLI.isTypeLegal(VT.getScalarType())) &&
       (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, VT))) {
-    SDLoc DL(N);
     EVT SVT = VT.getScalarType();
     return DAG.getSplatVector(
         VT, DL, DAG.getNode(ISD::TRUNCATE, DL, SVT, N0->getOperand(0)));
@@ -14791,7 +14814,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
         Opnds.push_back(BuildVect.getOperand(i));
 
-      return DAG.getBuildVector(VT, SDLoc(N), Opnds);
+      return DAG.getBuildVector(VT, DL, Opnds);
     }
   }
 
@@ -14854,7 +14877,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
         AddToWorklist(NV.getNode());
         Opnds.push_back(NV);
       }
-      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
     }
   }
 
@@ -14868,11 +14891,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT &&
         (!LegalOperations ||
          TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
-      SDLoc SL(N);
-
       unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
-                         DAG.getVectorIdxConstant(Idx, SL));
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecSrc,
+                         DAG.getVectorIdxConstant(Idx, DL));
     }
   }
 
@@ -14917,7 +14938,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       // we are extra cautious to not create an unsupported operation.
       // Target-specific changes are likely needed to avoid regressions here.
       if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
-        SDLoc DL(N);
         SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
         SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
         return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
@@ -14934,7 +14954,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     if (((!LegalOperations && N0.getOpcode() == ISD::UADDO_CARRY) ||
          TLI.isOperationLegal(N0.getOpcode(), VT)) &&
         N0.hasOneUse() && !N0->hasAnyUseOfValue(1)) {
-      SDLoc DL(N);
       SDValue X = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
       SDValue Y = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
       SDVTList VTs = DAG.getVTList(VT, N0->getValueType(1));
@@ -14951,7 +14970,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
             VT.getScalarSizeInBits() &&
         hasOperation(N0.getOpcode(), VT)) {
       return getTruncatedUSUBSAT(VT, SrcVT, N0.getOperand(0), N0.getOperand(1),
-                                 DAG, SDLoc(N));
+                                 DAG, DL);
     }
     break;
   }
@@ -22445,17 +22464,16 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     //      -> extract_vector_elt b, 0
     // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3
     //      -> extract_vector_elt b, 1
-    SDLoc SL(N);
     EVT ConcatVT = VecOp.getOperand(0).getValueType();
     unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
-    SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL,
+    SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, DL,
                                      Index.getValueType());
 
     SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts);
-    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL,
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                               ConcatVT.getVectorElementType(),
                               ConcatOp, NewIdx);
-    return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt);
+    return DAG.getNode(ISD::BITCAST, DL, ScalarVT, Elt);
   }
 
   // Make sure we found a non-volatile load and the extractelement is
@@ -24183,7 +24201,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
   // TODO: Use "BaseIndexOffset" to make this more effective.
   SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(), Offset, DL);
 
-  uint64_t StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize());
+  LocationSize StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize());
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO;
   if (Offset.isScalable()) {
@@ -27828,14 +27846,13 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
                  : (LSN->getAddressingMode() == ISD::PRE_DEC)
                      ? -1 * C->getSExtValue()
                      : 0;
-      uint64_t Size =
+      LocationSize Size =
           MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
       return {LSN->isVolatile(),
               LSN->isAtomic(),
               LSN->getBasePtr(),
               Offset /*base offset*/,
-              Size != ~UINT64_C(0) ? LocationSize::precise(Size)
-                                   : LocationSize::beforeOrAfterPointer(),
+              Size,
               LSN->getMemOperand()};
     }
     if (const auto *LN = cast<LifetimeSDNode>(N))
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index cce91dbd9531c..8b834862fb4d8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1205,27 +1205,27 @@ void FastISel::handleDbgInfo(const Instruction *II) {
       continue;
     }
 
-    DPValue &DPV = cast<DPValue>(DR);
+    DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
 
     Value *V = nullptr;
-    if (!DPV.hasArgList())
-      V = DPV.getVariableLocationOp(0);
+    if (!DVR.hasArgList())
+      V = DVR.getVariableLocationOp(0);
 
     bool Res = false;
-    if (DPV.getType() == DPValue::LocationType::Value ||
-        DPV.getType() == DPValue::LocationType::Assign) {
-      Res = lowerDbgValue(V, DPV.getExpression(), DPV.getVariable(),
-                          DPV.getDebugLoc());
+    if (DVR.getType() == DbgVariableRecord::LocationType::Value ||
+        DVR.getType() == DbgVariableRecord::LocationType::Assign) {
+      Res = lowerDbgValue(V, DVR.getExpression(), DVR.getVariable(),
+                          DVR.getDebugLoc());
     } else {
-      assert(DPV.getType() == DPValue::LocationType::Declare);
-      if (FuncInfo.PreprocessedDPVDeclares.contains(&DPV))
+      assert(DVR.getType() == DbgVariableRecord::LocationType::Declare);
+      if (FuncInfo.PreprocessedDVRDeclares.contains(&DVR))
         continue;
-      Res = lowerDbgDeclare(V, DPV.getExpression(), DPV.getVariable(),
-                            DPV.getDebugLoc());
+      Res = lowerDbgDeclare(V, DVR.getExpression(), DVR.getVariable(),
+                            DVR.getDebugLoc());
     }
 
     if (!Res)
-      LLVM_DEBUG(dbgs() << "Dropping debug-info for " << DPV << "\n";);
+      LLVM_DEBUG(dbgs() << "Dropping debug-info for " << DVR << "\n";);
   }
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index e01cd8cbf925a..8fb6b11b8805c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -358,7 +358,7 @@ void FunctionLoweringInfo::clear() {
   StatepointRelocationMaps.clear();
   PreferredExtendType.clear();
   PreprocessedDbgDeclares.clear();
-  PreprocessedDPVDeclares.clear();
+  PreprocessedDVRDeclares.clear();
 }
 
 /// CreateReg - Allocate a single virtual register for the given type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2dccc45c803a0..808e3c622033e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -267,7 +267,9 @@ static MachineMemOperand *getStackAlignedMMO(SDValue StackPtr,
   auto &MFI = MF.getFrameInfo();
   int FI = cast<FrameIndexSDNode>(StackPtr)->getIndex();
   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
-  uint64_t ObjectSize = isObjectScalable ? ~UINT64_C(0) : MFI.getObjectSize(FI);
+  LocationSize ObjectSize = isObjectScalable
+                                ? LocationSize::beforeOrAfterPointer()
+                                : LocationSize::precise(MFI.getObjectSize(FI));
   return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                  ObjectSize, MFI.getObjectAlign(FI));
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 909c669abd120..93ce9c22af552 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -349,6 +349,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
                               N->getMemoryVT(), ResVT,
                               N->getChain(), N->getBasePtr(),
                               N->getMemOperand());
+  if (N->getOpcode() == ISD::ATOMIC_LOAD) {
+    ISD::LoadExtType ETy = cast<AtomicSDNode>(N)->getExtensionType();
+    if (ETy == ISD::NON_EXTLOAD) {
+      switch (TLI.getExtendForAtomicOps()) {
+      case ISD::SIGN_EXTEND:
+        ETy = ISD::SEXTLOAD;
+        break;
+      case ISD::ZERO_EXTEND:
+        ETy = ISD::ZEXTLOAD;
+        break;
+      case ISD::ANY_EXTEND:
+        ETy = ISD::EXTLOAD;
+        break;
+      default:
+        llvm_unreachable("Invalid atomic op extension");
+      }
+    }
+    cast<AtomicSDNode>(Res)->setExtensionType(ETy);
+  }
+
   // Legalize the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -4527,6 +4547,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
                                           SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
+  unsigned Opc = N->getOpcode();
   SDLoc dl(N);
 
   // If we can emit an efficient shift operation, do so now.  Check to see if
@@ -4541,12 +4562,12 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
 
   // If this target supports shift_PARTS, use it.  First, map to the _PARTS opc.
   unsigned PartsOpc;
-  if (N->getOpcode() == ISD::SHL) {
+  if (Opc == ISD::SHL) {
     PartsOpc = ISD::SHL_PARTS;
-  } else if (N->getOpcode() == ISD::SRL) {
+  } else if (Opc == ISD::SRL) {
     PartsOpc = ISD::SRL_PARTS;
   } else {
-    assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+    assert(Opc == ISD::SRA && "Unknown shift!");
     PartsOpc = ISD::SRA_PARTS;
   }
 
@@ -4599,7 +4620,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
   // Otherwise, emit a libcall.
   RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
   bool isSigned;
-  if (N->getOpcode() == ISD::SHL) {
+  if (Opc == ISD::SHL) {
     isSigned = false; /*sign irrelevant*/
     if (VT == MVT::i16)
       LC = RTLIB::SHL_I16;
@@ -4609,7 +4630,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
       LC = RTLIB::SHL_I64;
     else if (VT == MVT::i128)
       LC = RTLIB::SHL_I128;
-  } else if (N->getOpcode() == ISD::SRL) {
+  } else if (Opc == ISD::SRL) {
     isSigned = false;
     if (VT == MVT::i16)
       LC = RTLIB::SRL_I16;
@@ -4620,7 +4641,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
     else if (VT == MVT::i128)
       LC = RTLIB::SRL_I128;
   } else {
-    assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+    assert(Opc == ISD::SRA && "Unknown shift!");
     isSigned = true;
     if (VT == MVT::i16)
       LC = RTLIB::SRA_I16;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5fb9d8d07d151..1f6e0097f31ab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1967,7 +1967,8 @@ void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       LD->getPointerInfo(), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, LD->getAAInfo(), LD->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, LD->getAAInfo(),
+      LD->getRanges());
 
   Lo =
       DAG.getLoadVP(LD->getAddressingMode(), ExtType, LoVT, dl, Ch, Ptr, Offset,
@@ -1990,8 +1991,8 @@ void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo,
           LoMemVT.getStoreSize().getFixedValue());
 
     MMO = DAG.getMachineFunction().getMachineMemOperand(
-        MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
-        LD->getAAInfo(), LD->getRanges());
+        MPI, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
+        Alignment, LD->getAAInfo(), LD->getRanges());
 
     Hi = DAG.getLoadVP(LD->getAddressingMode(), ExtType, HiVT, dl, Ch, Ptr,
                        Offset, MaskHi, EVLHi, HiMemVT, MMO,
@@ -2070,8 +2071,8 @@ void DAGTypeLegalizer::SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD,
 
     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
         MachinePointerInfo(SLD->getPointerInfo().getAddrSpace()),
-        MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
-        SLD->getAAInfo(), SLD->getRanges());
+        MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
+        Alignment, SLD->getAAInfo(), SLD->getRanges());
 
     Hi = DAG.getStridedLoadVP(SLD->getAddressingMode(), SLD->getExtensionType(),
                               HiVT, DL, SLD->getChain(), Ptr, SLD->getOffset(),
@@ -2130,7 +2131,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MLD->getPointerInfo(), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, MLD->getAAInfo(),
+      LocationSize::beforeOrAfterPointer(), Alignment, MLD->getAAInfo(),
       MLD->getRanges());
 
   Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT,
@@ -2154,8 +2155,8 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
           LoMemVT.getStoreSize().getFixedValue());
 
     MMO = DAG.getMachineFunction().getMachineMemOperand(
-        MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
-        MLD->getAAInfo(), MLD->getRanges());
+        MPI, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
+        Alignment, MLD->getAAInfo(), MLD->getRanges());
 
     Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi,
                            HiMemVT, MMO, MLD->getAddressingMode(), ExtType,
@@ -2217,7 +2218,8 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       N->getPointerInfo(), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
+      N->getRanges());
 
   if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) {
     SDValue PassThru = MGT->getPassThru();
@@ -2884,10 +2886,10 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo,
   auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
 
   MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
-      PtrInfo, MachineMemOperand::MOStore, MemoryLocation::UnknownSize,
+      PtrInfo, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
       Alignment);
   MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
-      PtrInfo, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize,
+      PtrInfo, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
       Alignment);
 
   unsigned EltWidth = VT.getScalarSizeInBits() / 8;
@@ -3478,7 +3480,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) {
   SDValue Lo, Hi;
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       N->getPointerInfo(), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
+      N->getRanges());
 
   Lo = DAG.getStoreVP(Ch, DL, DataLo, Ptr, Offset, MaskLo, EVLLo, LoMemVT, MMO,
                       N->getAddressingMode(), N->isTruncatingStore(),
@@ -3501,8 +3504,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) {
         LoMemVT.getStoreSize().getFixedValue());
 
   MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
-      N->getAAInfo(), N->getRanges());
+      MPI, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
+      Alignment, N->getAAInfo(), N->getRanges());
 
   Hi = DAG.getStoreVP(Ch, DL, DataHi, Ptr, Offset, MaskHi, EVLHi, HiMemVT, MMO,
                       N->getAddressingMode(), N->isTruncatingStore(),
@@ -3574,8 +3577,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(N->getPointerInfo().getAddrSpace()),
-      MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
-      N->getAAInfo(), N->getRanges());
+      MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
+      Alignment, N->getAAInfo(), N->getRanges());
 
   SDValue Hi = DAG.getStridedStoreVP(
       N->getChain(), DL, HiData, Ptr, N->getOffset(), N->getStride(), HiMask,
@@ -3626,7 +3629,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
   SDValue Lo, Hi, Res;
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       N->getPointerInfo(), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
+      N->getRanges());
 
   Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO,
                           N->getAddressingMode(), N->isTruncatingStore(),
@@ -3651,8 +3655,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
           LoMemVT.getStoreSize().getFixedValue());
 
     MMO = DAG.getMachineFunction().getMachineMemOperand(
-        MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
-        N->getAAInfo(), N->getRanges());
+        MPI, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
+        Alignment, N->getAAInfo(), N->getRanges());
 
     Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO,
                             N->getAddressingMode(), N->isTruncatingStore(),
@@ -3716,7 +3720,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) {
   SDValue Lo;
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       N->getPointerInfo(), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
+      N->getRanges());
 
   if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
     SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale};
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b8c7d08da3e2d..9d73a42df2a47 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4070,6 +4070,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
     if (Op.getResNo() == 0) {
       if (TLI->getExtendForAtomicOps() == ISD::ZERO_EXTEND)
         Known.Zero.setBitsFrom(MemBits);
+      else if (Op->getOpcode() == ISD::ATOMIC_LOAD &&
+               cast<AtomicSDNode>(Op)->getExtensionType() == ISD::ZEXTLOAD)
+        Known.Zero.setBitsFrom(MemBits);
     }
     break;
   }
@@ -4875,6 +4878,13 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
         return VTBits - Tmp + 1;
       if (TLI->getExtendForAtomicOps() == ISD::ZERO_EXTEND)
         return VTBits - Tmp;
+      if (Op->getOpcode() == ISD::ATOMIC_LOAD) {
+        ISD::LoadExtType ETy = cast<AtomicSDNode>(Op)->getExtensionType();
+        if (ETy == ISD::SEXTLOAD)
+          return VTBits - Tmp + 1;
+        if (ETy == ISD::ZEXTLOAD)
+          return VTBits - Tmp;
+      }
     }
     break;
   }
@@ -6039,30 +6049,14 @@ static std::optional<APInt> FoldValue(unsigned Opcode, const APInt &C1,
     APInt C2Ext = C2.zext(FullWidth);
     return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
   }
-  case ISD::AVGFLOORS: {
-    unsigned FullWidth = C1.getBitWidth() + 1;
-    APInt C1Ext = C1.sext(FullWidth);
-    APInt C2Ext = C2.sext(FullWidth);
-    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
-  }
-  case ISD::AVGFLOORU: {
-    unsigned FullWidth = C1.getBitWidth() + 1;
-    APInt C1Ext = C1.zext(FullWidth);
-    APInt C2Ext = C2.zext(FullWidth);
-    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
-  }
-  case ISD::AVGCEILS: {
-    unsigned FullWidth = C1.getBitWidth() + 1;
-    APInt C1Ext = C1.sext(FullWidth);
-    APInt C2Ext = C2.sext(FullWidth);
-    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
-  }
-  case ISD::AVGCEILU: {
-    unsigned FullWidth = C1.getBitWidth() + 1;
-    APInt C1Ext = C1.zext(FullWidth);
-    APInt C2Ext = C2.zext(FullWidth);
-    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
-  }
+  case ISD::AVGFLOORS:
+    return APIntOps::avgFloorS(C1, C2);
+  case ISD::AVGFLOORU:
+    return APIntOps::avgFloorU(C1, C2);
+  case ISD::AVGCEILS:
+    return APIntOps::avgCeilS(C1, C2);
+  case ISD::AVGCEILU:
+    return APIntOps::avgCeilU(C1, C2);
   case ISD::ABDS:
     return APIntOps::abds(C1, C2);
   case ISD::ABDU:
@@ -8408,11 +8402,12 @@ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
 SDValue SelectionDAG::getMemIntrinsicNode(
     unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
     EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
-    MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) {
-  if (!Size && MemVT.isScalableVector())
-    Size = MemoryLocation::UnknownSize;
-  else if (!Size)
-    Size = MemVT.getStoreSize();
+    MachineMemOperand::Flags Flags, LocationSize Size,
+    const AAMDNodes &AAInfo) {
+  if (Size.hasValue() && MemVT.isScalableVector())
+    Size = LocationSize::beforeOrAfterPointer();
+  else if (Size.hasValue() && !Size.getValue())
+    Size = LocationSize::precise(MemVT.getStoreSize());
 
   MachineFunction &MF = getMachineFunction();
   MachineMemOperand *MMO =
@@ -8574,7 +8569,7 @@ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
   if (PtrInfo.V.isNull())
     PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
 
-  uint64_t Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
+  LocationSize Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
   MachineFunction &MF = getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
                                                    Alignment, AAInfo, Ranges);
@@ -8695,7 +8690,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
     PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
 
   MachineFunction &MF = getMachineFunction();
-  uint64_t Size =
+  LocationSize Size =
       MemoryLocation::getSizeOrUnknown(Val.getValueType().getStoreSize());
   MachineMemOperand *MMO =
       MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo);
@@ -8844,7 +8839,7 @@ SDValue SelectionDAG::getLoadVP(
   if (PtrInfo.V.isNull())
     PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
 
-  uint64_t Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
+  LocationSize Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
   MachineFunction &MF = getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
                                                    Alignment, AAInfo, Ranges);
@@ -9600,6 +9595,10 @@ SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) {
   if (ISD::matchUnaryPredicate(Y, isShiftTooBig, true))
     return getUNDEF(X.getValueType());
 
+  // shift i1/vXi1 X, Y --> X (any non-zero shift amount is undefined).
+  if (X.getValueType().getScalarType() == MVT::i1)
+    return X;
+
   return SDValue();
 }
 
@@ -11734,8 +11733,10 @@ MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
   // the MMO. This is because the MMO might indicate only a possible address
   // range instead of specifying the affected memory addresses precisely.
   // TODO: Make MachineMemOperands aware of scalable vectors.
-  assert(memvt.getStoreSize().getKnownMinValue() <= MMO->getSize() &&
-         "Size mismatch!");
+  assert(
+      (!MMO->getType().isValid() ||
+       memvt.getStoreSize().getKnownMinValue() <= MMO->getSize().getValue()) &&
+      "Size mismatch!");
 }
 
 /// Profile - Gather unique data for the node.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b6a35f7ad4c4c..dd19ee16d1d65 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1245,14 +1245,14 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) {
     }
   }
 
-  // We must skip DPValues if they've already been processed above as we
-  // have just emitted the debug values resulting from assignment tracking
-  // analysis, making any existing DPValues redundant (and probably less
-  // correct). We still need to process DPLabels. This does sink DPLabels
+  // We must skip DbgVariableRecords if they've already been processed above as
+  // we have just emitted the debug values resulting from assignment tracking
+  // analysis, making any existing DbgVariableRecords redundant (and probably
+  // less correct). We still need to process DPLabels. This does sink DPLabels
   // to the bottom of the group of debug records. That sholdn't be important
-  // as it does so deterministcally and ordering between DPLabels and DPValues
-  // is immaterial (other than for MIR/IR printing).
-  bool SkipDPValues = DAG.getFunctionVarLocs();
+  // as it does so deterministcally and ordering between DPLabels and
+  // DbgVariableRecords is immaterial (other than for MIR/IR printing).
+  bool SkipDbgVariableRecords = DAG.getFunctionVarLocs();
   // Is there is any debug-info attached to this instruction, in the form of
   // DbgRecord non-instruction debug-info records.
   for (DbgRecord &DR : I.getDbgRecordRange()) {
@@ -1264,44 +1264,45 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) {
       continue;
     }
 
-    if (SkipDPValues)
+    if (SkipDbgVariableRecords)
       continue;
-    DPValue &DPV = cast<DPValue>(DR);
-    DILocalVariable *Variable = DPV.getVariable();
-    DIExpression *Expression = DPV.getExpression();
+    DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
+    DILocalVariable *Variable = DVR.getVariable();
+    DIExpression *Expression = DVR.getExpression();
     dropDanglingDebugInfo(Variable, Expression);
 
-    if (DPV.getType() == DPValue::LocationType::Declare) {
-      if (FuncInfo.PreprocessedDPVDeclares.contains(&DPV))
+    if (DVR.getType() == DbgVariableRecord::LocationType::Declare) {
+      if (FuncInfo.PreprocessedDVRDeclares.contains(&DVR))
         continue;
-      LLVM_DEBUG(dbgs() << "SelectionDAG visiting dbg_declare: " << DPV
+      LLVM_DEBUG(dbgs() << "SelectionDAG visiting dbg_declare: " << DVR
                         << "\n");
-      handleDebugDeclare(DPV.getVariableLocationOp(0), Variable, Expression,
-                         DPV.getDebugLoc());
+      handleDebugDeclare(DVR.getVariableLocationOp(0), Variable, Expression,
+                         DVR.getDebugLoc());
       continue;
     }
 
-    // A DPValue with no locations is a kill location.
-    SmallVector<Value *, 4> Values(DPV.location_ops());
+    // A DbgVariableRecord with no locations is a kill location.
+    SmallVector<Value *, 4> Values(DVR.location_ops());
     if (Values.empty()) {
-      handleKillDebugValue(Variable, Expression, DPV.getDebugLoc(),
+      handleKillDebugValue(Variable, Expression, DVR.getDebugLoc(),
                            SDNodeOrder);
       continue;
     }
 
-    // A DPValue with an undef or absent location is also a kill location.
+    // A DbgVariableRecord with an undef or absent location is also a kill
+    // location.
     if (llvm::any_of(Values,
                      [](Value *V) { return !V || isa<UndefValue>(V); })) {
-      handleKillDebugValue(Variable, Expression, DPV.getDebugLoc(),
+      handleKillDebugValue(Variable, Expression, DVR.getDebugLoc(),
                            SDNodeOrder);
       continue;
     }
 
-    bool IsVariadic = DPV.hasArgList();
-    if (!handleDebugValue(Values, Variable, Expression, DPV.getDebugLoc(),
+    bool IsVariadic = DVR.hasArgList();
+    if (!handleDebugValue(Values, Variable, Expression, DVR.getDebugLoc(),
                           SDNodeOrder, IsVariadic)) {
       addDanglingDebugInfo(Values, Variable, Expression, IsVariadic,
-                           DPV.getDebugLoc(), SDNodeOrder);
+                           DVR.getDebugLoc(), SDNodeOrder);
     }
   }
 }
@@ -3037,7 +3038,8 @@ static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL,
     auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                  MachineMemOperand::MODereferenceable;
     MachineMemOperand *MemRef = MF.getMachineMemOperand(
-        MPInfo, Flags, PtrTy.getSizeInBits() / 8, DAG.getEVTAlign(PtrTy));
+        MPInfo, Flags, LocationSize::precise(PtrTy.getSizeInBits() / 8),
+        DAG.getEVTAlign(PtrTy));
     DAG.setNodeMemRefs(Node, {MemRef});
   }
   if (PtrTy != PtrMemTy)
@@ -4753,7 +4755,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, Alignment, I.getAAMetadata());
+      LocationSize::beforeOrAfterPointer(), Alignment, I.getAAMetadata());
   SDValue StoreNode =
       DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO,
                          ISD::UNINDEXED, false /* Truncating */, IsCompressing);
@@ -4857,9 +4859,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOStore,
-      // TODO: Make MachineMemOperands aware of scalable
-      // vectors.
-      MemoryLocation::UnknownSize, Alignment, I.getAAMetadata());
+      LocationSize::beforeOrAfterPointer(), Alignment, I.getAAMetadata());
   if (!UniformBase) {
     Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
     Index = getValue(Ptr);
@@ -4925,7 +4925,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, AAInfo, Ranges);
+      LocationSize::beforeOrAfterPointer(), Alignment, AAInfo, Ranges);
 
   SDValue Load =
       DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO,
@@ -4961,9 +4961,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOLoad,
-      // TODO: Make MachineMemOperands aware of scalable
-      // vectors.
-      MemoryLocation::UnknownSize, Alignment, I.getAAMetadata(), Ranges);
+      LocationSize::beforeOrAfterPointer(), Alignment, I.getAAMetadata(), Ranges);
 
   if (!UniformBase) {
     Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
@@ -5003,9 +5001,9 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
-      DAG.getEVTAlign(MemVT), AAMDNodes(), nullptr, SSID, SuccessOrdering,
-      FailureOrdering);
+      MachinePointerInfo(I.getPointerOperand()), Flags,
+      LocationSize::precise(MemVT.getStoreSize()), DAG.getEVTAlign(MemVT),
+      AAMDNodes(), nullptr, SSID, SuccessOrdering, FailureOrdering);
 
   SDValue L = DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                                    dl, MemVT, VTs, InChain,
@@ -5057,8 +5055,9 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
-      DAG.getEVTAlign(MemVT), AAMDNodes(), nullptr, SSID, Ordering);
+      MachinePointerInfo(I.getPointerOperand()), Flags,
+      LocationSize::precise(MemVT.getStoreSize()), DAG.getEVTAlign(MemVT),
+      AAMDNodes(), nullptr, SSID, Ordering);
 
   SDValue L =
     DAG.getAtomic(NT, dl, MemVT, InChain,
@@ -5103,8 +5102,9 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
   auto Flags = TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
-      I.getAlign(), AAMDNodes(), nullptr, SSID, Order);
+      MachinePointerInfo(I.getPointerOperand()), Flags,
+      LocationSize::precise(MemVT.getStoreSize()), I.getAlign(), AAMDNodes(),
+      nullptr, SSID, Order);
 
   InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
 
@@ -5140,8 +5140,9 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
-      I.getAlign(), AAMDNodes(), nullptr, SSID, Ordering);
+      MachinePointerInfo(I.getPointerOperand()), Flags,
+      LocationSize::precise(MemVT.getStoreSize()), I.getAlign(), AAMDNodes(),
+      nullptr, SSID, Ordering);
 
   SDValue Val = getValue(I.getValueOperand());
   if (Val.getValueType() != MemVT)
@@ -6904,7 +6905,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
       auto MPI =
           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
       MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-          MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize,
+          MPI, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
           TempAlign);
       Chain = DAG.getGetFPEnv(Chain, sdl, Temp, EnvVT, MMO);
       Res = DAG.getLoad(EnvVT, sdl, Chain, Temp, MPI);
@@ -6933,7 +6934,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
       Chain = DAG.getStore(Chain, sdl, Env, Temp, MPI, TempAlign,
                            MachineMemOperand::MOStore);
       MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-          MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize,
+          MPI, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
           TempAlign);
       Chain = DAG.getSetFPEnv(Chain, sdl, Temp, EnvVT, MMO);
     }
@@ -8087,7 +8088,7 @@ void SelectionDAGBuilder::visitVPLoad(
   SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
   LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
                      MMO, false /*IsExpanding */);
   if (AddToChain)
@@ -8110,8 +8111,8 @@ void SelectionDAGBuilder::visitVPGather(
   unsigned AS =
     PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-     MachinePointerInfo(AS), MachineMemOperand::MOLoad,
-     MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+      MachinePointerInfo(AS), MachineMemOperand::MOLoad,
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
   SDValue Base, Index, Scale;
   ISD::MemIndexType IndexType;
   bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
@@ -8151,7 +8152,7 @@ void SelectionDAGBuilder::visitVPStore(
   SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
   ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset,
                       OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED,
                       /* IsTruncating */ false, /*IsCompressing*/ false);
@@ -8174,7 +8175,7 @@ void SelectionDAGBuilder::visitVPScatter(
       PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
   SDValue Base, Index, Scale;
   ISD::MemIndexType IndexType;
   bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
@@ -8217,7 +8218,7 @@ void SelectionDAGBuilder::visitVPStridedLoad(
   unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
 
   SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1],
                                     OpValues[2], OpValues[3], MMO,
@@ -8240,7 +8241,7 @@ void SelectionDAGBuilder::visitVPStridedStore(
   unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
 
   SDValue ST = DAG.getStridedStoreVP(
       getMemoryRoot(), DL, OpValues[0], OpValues[1],
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index fa71adc8da3f3..20375a0f92b23 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -841,6 +841,18 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
   } else if (const MemSDNode *M = dyn_cast<MemSDNode>(this)) {
     OS << "<";
     printMemOperand(OS, *M->getMemOperand(), G);
+    if (auto *A = dyn_cast<AtomicSDNode>(M))
+      if (A->getOpcode() == ISD::ATOMIC_LOAD) {
+        bool doExt = true;
+        switch (A->getExtensionType()) {
+        default: doExt = false; break;
+        case ISD::EXTLOAD:  OS << ", anyext"; break;
+        case ISD::SEXTLOAD: OS << ", sext"; break;
+        case ISD::ZEXTLOAD: OS << ", zext"; break;
+        }
+        if (doExt)
+          OS << " from " << A->getMemoryVT();
+      }
     OS << ">";
   } else if (const BlockAddressSDNode *BA =
                dyn_cast<BlockAddressSDNode>(this)) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index c78c3ed294e46..d629c36bc792e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1461,12 +1461,12 @@ static void processDbgDeclares(FunctionLoweringInfo &FuncInfo) {
     if (DI && processDbgDeclare(FuncInfo, DI->getAddress(), DI->getExpression(),
                                 DI->getVariable(), DI->getDebugLoc()))
       FuncInfo.PreprocessedDbgDeclares.insert(DI);
-    for (const DPValue &DPV : DPValue::filter(I.getDbgRecordRange())) {
-      if (DPV.Type == DPValue::LocationType::Declare &&
-          processDbgDeclare(FuncInfo, DPV.getVariableLocationOp(0),
-                            DPV.getExpression(), DPV.getVariable(),
-                            DPV.getDebugLoc()))
-        FuncInfo.PreprocessedDPVDeclares.insert(&DPV);
+    for (const DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      if (DVR.Type == DbgVariableRecord::LocationType::Declare &&
+          processDbgDeclare(FuncInfo, DVR.getVariableLocationOp(0),
+                            DVR.getExpression(), DVR.getVariable(),
+                            DVR.getDebugLoc()))
+        FuncInfo.PreprocessedDVRDeclares.insert(&DVR);
     }
   }
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b3dc9de713731..16069c6c0dc31 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -611,6 +611,25 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
   return false;
 }
 
+static SDValue simplifyUseOfIntToFP(SDValue Op, const APInt &DemandedBits,
+                                    SelectionDAG &DAG) {
+  unsigned Opc = Op.getOpcode();
+  assert((Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP) &&
+         "Invalid Int -> FP Opcode");
+  if (!DemandedBits.isSignMask())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  if (Opc == ISD::UINT_TO_FP)
+    return DAG.getConstant(0, SDLoc(Op), VT);
+
+  EVT InnerVT = Op.getOperand(0).getValueType();
+  if (VT.getScalarSizeInBits() == InnerVT.getScalarSizeInBits())
+    return DAG.getBitcast(VT, Op.getOperand(0));
+
+  return SDValue();
+}
+
 bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
                                           DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -816,6 +835,11 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
     }
     break;
   }
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+    if (SDValue R = simplifyUseOfIntToFP(Op, DemandedBits, DAG))
+      return R;
+    break;
   case ISD::SIGN_EXTEND_INREG: {
     // If none of the extended bits are demanded, eliminate the sextinreg.
     SDValue Op0 = Op.getOperand(0);
@@ -2313,6 +2337,12 @@ bool TargetLowering::SimplifyDemandedBits(
     Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
     break;
   }
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+    if (SDValue R = simplifyUseOfIntToFP(Op, DemandedBits, TLO.DAG))
+      return TLO.CombineTo(Op, R);
+    Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+    break;
   case ISD::SIGN_EXTEND_INREG: {
     SDValue Op0 = Op.getOperand(0);
     EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
@@ -10694,7 +10724,7 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
 
       SDValue Lo, Hi;
       std::tie(Lo, Hi) = DAG.SplitVector(Op, dl);
-      Op = DAG.getNode(BaseOpcode, dl, HalfVT, Lo, Hi);
+      Op = DAG.getNode(BaseOpcode, dl, HalfVT, Lo, Hi, Node->getFlags());
       VT = HalfVT;
     }
   }
diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
index 4bad57d279e9e..20c827ce08d84 100644
--- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -202,7 +202,7 @@ SjLjEHPrepareImpl::setupFunctionContext(Function &F,
   auto &DL = F.getParent()->getDataLayout();
   const Align Alignment = DL.getPrefTypeAlign(FunctionContextTy);
   FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(), nullptr,
-                           Alignment, "fn_context", &EntryBB->front());
+                           Alignment, "fn_context", EntryBB->begin());
 
   // Fill in the function context structure.
   for (LandingPadInst *LPI : LPads) {
@@ -271,7 +271,7 @@ void SjLjEHPrepareImpl::lowerIncomingArguments(Function &F) {
     Value *TrueValue = ConstantInt::getTrue(F.getContext());
     Value *UndefValue = UndefValue::get(Ty);
     Instruction *SI = SelectInst::Create(
-        TrueValue, &AI, UndefValue, AI.getName() + ".tmp", &*AfterAllocaInsPt);
+        TrueValue, &AI, UndefValue, AI.getName() + ".tmp", AfterAllocaInsPt);
     AI.replaceAllUsesWith(SI);
 
     // Reset the operand, because it  was clobbered by the RAUW above.
@@ -386,7 +386,7 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
       if (Function *Callee = II->getCalledFunction())
         if (Callee->getIntrinsicID() == Intrinsic::donothing) {
           // Remove the NOP invoke.
-          BranchInst::Create(II->getNormalDest(), II);
+          BranchInst::Create(II->getNormalDest(), II->getIterator());
           II->eraseFromParent();
           continue;
         }
@@ -445,7 +445,7 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
 
     // Record the call site value for the back end so it stays associated with
     // the invoke.
-    CallInst::Create(CallSiteFn, CallSiteNum, "", Invokes[I]);
+    CallInst::Create(CallSiteFn, CallSiteNum, "", Invokes[I]->getIterator());
   }
 
   // Mark call instructions that aren't nounwind as no-action (call_site ==
@@ -462,8 +462,8 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
   }
 
   // Register the function context and make sure it's known to not throw
-  CallInst *Register =
-      CallInst::Create(RegisterFn, FuncCtx, "", EntryBB->getTerminator());
+  CallInst *Register = CallInst::Create(
+      RegisterFn, FuncCtx, "", EntryBB->getTerminator()->getIterator());
   Register->setDoesNotThrow();
 
   // Following any allocas not in the entry block, update the saved SP in the
@@ -480,7 +480,8 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
       }
       Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp");
       StackAddr->insertAfter(&I);
-      new StoreInst(StackAddr, StackPtr, true, StackAddr->getNextNode());
+      new StoreInst(StackAddr, StackPtr, true,
+                    std::next(StackAddr->getIterator()));
     }
   }
 
@@ -490,7 +491,7 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
     Instruction *InsertPoint = Return;
     if (CallInst *CI = Return->getParent()->getTerminatingMustTailCall())
       InsertPoint = CI;
-    CallInst::Create(UnregisterFn, FuncCtx, "", InsertPoint);
+    CallInst::Create(UnregisterFn, FuncCtx, "", InsertPoint->getIterator());
   }
 
   return true;
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 5b02c1bc39c0a..9fbd516acea8e 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1554,7 +1554,8 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
     SmallVector<uint64_t, 8> Ops;
     DIExpression::appendOffset(Ops, Offset);
     Ops.push_back(dwarf::DW_OP_deref_size);
-    Ops.push_back(MMO->getSize());
+    Ops.push_back(MMO->getSize().hasValue() ? MMO->getSize().getValue()
+                                            : ~UINT64_C(0));
     Expr = DIExpression::prependOpcodes(Expr, Ops);
     return ParamLoadedValue(*BaseOp, Expr);
   }
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 6943ce261d9d9..15b59421a0f44 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -2680,21 +2680,34 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForFunctionDescriptor(
 
 MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
     const MCSymbol *Sym, const TargetMachine &TM) const {
-  // Use TE storage-mapping class when large code model is enabled so that
-  // the chance of needing -bbigtoc is decreased. Also, the toc-entry for
-  // EH info is never referenced directly using instructions so it can be
-  // allocated with TE storage-mapping class.
-  // The "_$TLSML" symbol for TLS local-dynamic mode requires XMC_TC, otherwise
-  // the AIX assembler will complain.
+  const XCOFF::StorageMappingClass SMC = [](const MCSymbol *Sym,
+                                            const TargetMachine &TM) {
+    const MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(Sym);
+
+    // The "_$TLSML" symbol for TLS local-dynamic mode requires XMC_TC,
+    // otherwise the AIX assembler will complain.
+    if (XSym->getSymbolTableName() == "_$TLSML")
+      return XCOFF::XMC_TC;
+
+    // Use large code model toc entries for ehinfo symbols as they are
+    // never referenced directly. The runtime loads their TOC entry
+    // addresses from the trace-back table.
+    if (XSym->isEHInfo())
+      return XCOFF::XMC_TE;
+
+    // If the symbol does not have a code model specified use the module value.
+    if (!XSym->hasPerSymbolCodeModel())
+      return TM.getCodeModel() == CodeModel::Large ? XCOFF::XMC_TE
+                                                   : XCOFF::XMC_TC;
+
+    return XSym->getPerSymbolCodeModel() == MCSymbolXCOFF::CM_Large
+               ? XCOFF::XMC_TE
+               : XCOFF::XMC_TC;
+  }(Sym, TM);
+
   return getContext().getXCOFFSection(
       cast<MCSymbolXCOFF>(Sym)->getSymbolTableName(), SectionKind::getData(),
-      XCOFF::CsectProperties(
-          ((TM.getCodeModel() == CodeModel::Large &&
-            cast<MCSymbolXCOFF>(Sym)->getSymbolTableName() != "_$TLSML") ||
-           cast<MCSymbolXCOFF>(Sym)->isEHInfo())
-              ? XCOFF::XMC_TE
-              : XCOFF::XMC_TC,
-          XCOFF::XTY_SD));
+      XCOFF::CsectProperties(SMC, XCOFF::XTY_SD));
 }
 
 MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA(
diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp
index 95976c218c2f1..93a18f5bc9e8d 100644
--- a/llvm/lib/CodeGen/WinEHPrepare.cpp
+++ b/llvm/lib/CodeGen/WinEHPrepare.cpp
@@ -1234,10 +1234,10 @@ AllocaInst *WinEHPrepareImpl::insertPHILoads(PHINode *PN, Function &F) {
     // that will dominate all uses.
     SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr,
                                Twine(PN->getName(), ".wineh.spillslot"),
-                               &F.getEntryBlock().front());
+                               F.getEntryBlock().begin());
     Value *V = new LoadInst(PN->getType(), SpillSlot,
                             Twine(PN->getName(), ".wineh.reload"),
-                            &*PHIBlock->getFirstInsertionPt());
+                            PHIBlock->getFirstInsertionPt());
     PN->replaceAllUsesWith(V);
     return SpillSlot;
   }
@@ -1309,7 +1309,7 @@ void WinEHPrepareImpl::insertPHIStore(
   }
 
   // Otherwise, insert the store at the end of the basic block.
-  new StoreInst(PredVal, SpillSlot, PredBlock->getTerminator());
+  new StoreInst(PredVal, SpillSlot, PredBlock->getTerminator()->getIterator());
 }
 
 void WinEHPrepareImpl::replaceUseWithLoad(
@@ -1319,7 +1319,7 @@ void WinEHPrepareImpl::replaceUseWithLoad(
   if (!SpillSlot)
     SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr,
                                Twine(V->getName(), ".wineh.spillslot"),
-                               &F.getEntryBlock().front());
+                               F.getEntryBlock().begin());
 
   auto *UsingInst = cast<Instruction>(U.getUser());
   if (auto *UsingPHI = dyn_cast<PHINode>(UsingInst)) {
@@ -1376,16 +1376,16 @@ void WinEHPrepareImpl::replaceUseWithLoad(
     Value *&Load = Loads[IncomingBlock];
     // Insert the load into the predecessor block
     if (!Load)
-      Load = new LoadInst(V->getType(), SpillSlot,
-                          Twine(V->getName(), ".wineh.reload"),
-                          /*isVolatile=*/false, IncomingBlock->getTerminator());
+      Load = new LoadInst(
+          V->getType(), SpillSlot, Twine(V->getName(), ".wineh.reload"),
+          /*isVolatile=*/false, IncomingBlock->getTerminator()->getIterator());
 
     U.set(Load);
   } else {
     // Reload right before the old use.
     auto *Load = new LoadInst(V->getType(), SpillSlot,
                               Twine(V->getName(), ".wineh.reload"),
-                              /*isVolatile=*/false, UsingInst);
+                              /*isVolatile=*/false, UsingInst->getIterator());
     U.set(Load);
   }
 }
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
index 9b581a6c9ab77..60f664ece7eef 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
@@ -2701,8 +2701,8 @@ Error DWARFLinker::link() {
   // This Dwarf string pool which is used for emission. It must be used
   // serially as the order of calling getStringOffset matters for
   // reproducibility.
-  OffsetsStringPool DebugStrPool(StringsTranslator, true);
-  OffsetsStringPool DebugLineStrPool(StringsTranslator, false);
+  OffsetsStringPool DebugStrPool(true);
+  OffsetsStringPool DebugLineStrPool(false);
   DebugDieValuePool StringOffsetPool;
 
   // ODR Contexts for the optimize.
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
index 6d522370e440d..8b31b5ead29ad 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
@@ -32,10 +32,9 @@ using namespace dwarf_linker::classic;
 
 Expected<std::unique_ptr<DwarfStreamer>> DwarfStreamer::createStreamer(
     const Triple &TheTriple, DWARFLinkerBase::OutputFileType FileType,
-    raw_pwrite_stream &OutFile, DWARFLinkerBase::TranslatorFuncTy Translator,
-    DWARFLinkerBase::MessageHandlerTy Warning) {
+    raw_pwrite_stream &OutFile, DWARFLinkerBase::MessageHandlerTy Warning) {
   std::unique_ptr<DwarfStreamer> Streamer =
-      std::make_unique<DwarfStreamer>(FileType, OutFile, Translator, Warning);
+      std::make_unique<DwarfStreamer>(FileType, OutFile, Warning);
   if (Error Err = Streamer->init(TheTriple, "__DWARF"))
     return std::move(Err);
 
@@ -977,11 +976,10 @@ void DwarfStreamer::emitLineTableString(const DWARFDebugLine::Prologue &P,
 
   switch (String.getForm()) {
   case dwarf::DW_FORM_string: {
-    StringRef TranslatedString =
-        (Translator) ? Translator(*StringVal) : *StringVal;
-    Asm->OutStreamer->emitBytes(TranslatedString.data());
+    StringRef Str = *StringVal;
+    Asm->OutStreamer->emitBytes(Str.data());
     Asm->emitInt8(0);
-    LineSectionSize += TranslatedString.size() + 1;
+    LineSectionSize += Str.size() + 1;
   } break;
   case dwarf::DW_FORM_strp:
   case dwarf::DW_FORM_line_strp: {
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinker.cpp
index ad8d28a643174..31c7e867767a0 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinker.cpp
@@ -15,8 +15,6 @@ using namespace dwarf_linker::parallel;
 
 std::unique_ptr<DWARFLinker>
 DWARFLinker::createLinker(MessageHandlerTy ErrorHandler,
-                          MessageHandlerTy WarningHandler,
-                          TranslatorFuncTy StringsTranslator) {
-  return std::make_unique<DWARFLinkerImpl>(ErrorHandler, WarningHandler,
-                                           StringsTranslator);
+                          MessageHandlerTy WarningHandler) {
+  return std::make_unique<DWARFLinkerImpl>(ErrorHandler, WarningHandler);
 }
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h
index 38c261a8106fc..7ca81eb34f005 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h
@@ -21,7 +21,6 @@ class DWARFDie;
 namespace dwarf_linker {
 namespace parallel {
 
-using TranslatorFuncTy = std::function<StringRef(StringRef)>;
 using MessageHandlerTy = std::function<void(
     const Twine &Warning, StringRef Context, const DWARFDie *DIE)>;
 
@@ -95,19 +94,6 @@ class LinkingGlobalData {
   /// Returns global string pool.
   StringPool &getStringPool() { return Strings; }
 
-  /// Set translation function.
-  void setTranslator(TranslatorFuncTy Translator) {
-    this->Translator = Translator;
-  }
-
-  /// Translate specified string.
-  StringRef translateString(StringRef String) {
-    if (Translator)
-      return Translator(String);
-
-    return String;
-  }
-
   /// Returns linking options.
   const DWARFLinkerOptions &getOptions() const { return Options; }
 
@@ -161,7 +147,6 @@ class LinkingGlobalData {
 protected:
   llvm::parallel::PerThreadBumpPtrAllocator Allocator;
   StringPool Strings;
-  TranslatorFuncTy Translator;
   DWARFLinkerOptions Options;
   MessageHandlerTy WarningHandler;
   MessageHandlerTy ErrorHandler;
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp
index 49b08997eb9c1..e68bf0c227a0a 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp
@@ -20,11 +20,9 @@ using namespace dwarf_linker;
 using namespace dwarf_linker::parallel;
 
 DWARFLinkerImpl::DWARFLinkerImpl(MessageHandlerTy ErrorHandler,
-                                 MessageHandlerTy WarningHandler,
-                                 TranslatorFuncTy StringsTranslator)
+                                 MessageHandlerTy WarningHandler)
     : UniqueUnitID(0), DebugStrStrings(GlobalData),
       DebugLineStrStrings(GlobalData), CommonSections(GlobalData) {
-  GlobalData.setTranslator(StringsTranslator);
   GlobalData.setErrorHandler(ErrorHandler);
   GlobalData.setWarningHandler(WarningHandler);
 }
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h
index 7c17c5b79c7c1..597bb1b4da59a 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h
@@ -26,8 +26,7 @@ namespace parallel {
 class DWARFLinkerImpl : public DWARFLinker {
 public:
   DWARFLinkerImpl(MessageHandlerTy ErrorHandler,
-                  MessageHandlerTy WarningHandler,
-                  TranslatorFuncTy StringsTranslator);
+                  MessageHandlerTy WarningHandler);
 
   /// Add object file to be linked. Pre-load compile unit die. Call
   /// \p OnCUDieLoaded for each compile unit die. If specified \p File
diff --git a/llvm/lib/DWARFLinker/Parallel/OutputSections.h b/llvm/lib/DWARFLinker/Parallel/OutputSections.h
index a21e4b2b75a50..0e1f2dae54bcc 100644
--- a/llvm/lib/DWARFLinker/Parallel/OutputSections.h
+++ b/llvm/lib/DWARFLinker/Parallel/OutputSections.h
@@ -253,7 +253,7 @@ struct SectionDescriptor : SectionDescriptorBase {
 
   /// Emit specified inplace string value into the current section contents.
   void emitInplaceString(StringRef String) {
-    OS << GlobalData.translateString(String);
+    OS << String;
     emitIntVal(0, 1);
   }
 
diff --git a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h
index 858f224777dba..f67536ef7a1a8 100644
--- a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h
+++ b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h
@@ -33,7 +33,7 @@ class StringEntryToDwarfStringPoolEntryMap {
       DwarfStringPoolEntryWithExtString *DataPtr =
           GlobalData.getAllocator()
               .Allocate<DwarfStringPoolEntryWithExtString>();
-      DataPtr->String = GlobalData.translateString(String->getKey());
+      DataPtr->String = String->getKey();
       DataPtr->Index = DwarfStringPoolEntry::NotIndexed;
       DataPtr->Offset = 0;
       DataPtr->Symbol = nullptr;
diff --git a/llvm/lib/DebugInfo/LogicalView/CMakeLists.txt b/llvm/lib/DebugInfo/LogicalView/CMakeLists.txt
index 38a174661b4f3..f5adab723927a 100644
--- a/llvm/lib/DebugInfo/LogicalView/CMakeLists.txt
+++ b/llvm/lib/DebugInfo/LogicalView/CMakeLists.txt
@@ -24,7 +24,7 @@ add_lv_impl_folder(Readers
   Readers/LVBinaryReader.cpp
   Readers/LVCodeViewReader.cpp
   Readers/LVCodeViewVisitor.cpp
-  Readers/LVELFReader.cpp
+  Readers/LVDWARFReader.cpp
   )
 
 list(APPEND LIBLV_ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/DebugInfo/LogicalView/LVReaderHandler.cpp b/llvm/lib/DebugInfo/LogicalView/LVReaderHandler.cpp
index 5f82f816dc19d..c841015405953 100644
--- a/llvm/lib/DebugInfo/LogicalView/LVReaderHandler.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/LVReaderHandler.cpp
@@ -14,7 +14,7 @@
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/LogicalView/Core/LVCompare.h"
 #include "llvm/DebugInfo/LogicalView/Readers/LVCodeViewReader.h"
-#include "llvm/DebugInfo/LogicalView/Readers/LVELFReader.h"
+#include "llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/PDB.h"
 #include "llvm/Object/COFF.h"
@@ -48,8 +48,9 @@ Error LVReaderHandler::createReader(StringRef Filename, LVReaders &Readers,
         return std::make_unique<LVCodeViewReader>(Filename, FileFormatName,
                                                   *COFF, W, ExePath);
       }
-      if (Obj.isELF() || Obj.isMachO())
-        return std::make_unique<LVELFReader>(Filename, FileFormatName, Obj, W);
+      if (Obj.isELF() || Obj.isMachO() || Obj.isWasm())
+        return std::make_unique<LVDWARFReader>(Filename, FileFormatName, Obj,
+                                               W);
     }
     if (isa<PDBFile *>(Input)) {
       PDBFile &Pdb = *cast<PDBFile *>(Input);
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
index a0cd8b7839cf7..2d46414a6986a 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
@@ -146,6 +146,30 @@ bool LVBinaryReader::getSymbolTableIsComdat(StringRef Name) {
 
 void LVBinaryReader::mapVirtualAddress(const object::ObjectFile &Obj) {
   for (const object::SectionRef &Section : Obj.sections()) {
+    LLVM_DEBUG({
+      Expected<StringRef> SectionNameOrErr = Section.getName();
+      StringRef Name;
+      if (!SectionNameOrErr)
+        consumeError(SectionNameOrErr.takeError());
+      else
+        Name = *SectionNameOrErr;
+      dbgs() << "Index: " << format_decimal(Section.getIndex(), 3) << ", "
+             << "Address: " << hexValue(Section.getAddress()) << ", "
+             << "Size: " << hexValue(Section.getSize()) << ", "
+             << "Name: " << Name << "\n";
+      dbgs() << "isCompressed:   " << Section.isCompressed() << ", "
+             << "isText:         " << Section.isText() << ", "
+             << "isData:         " << Section.isData() << ", "
+             << "isBSS:          " << Section.isBSS() << ", "
+             << "isVirtual:      " << Section.isVirtual() << "\n";
+      dbgs() << "isBitcode:      " << Section.isBitcode() << ", "
+             << "isStripped:     " << Section.isStripped() << ", "
+             << "isBerkeleyText: " << Section.isBerkeleyText() << ", "
+             << "isBerkeleyData: " << Section.isBerkeleyData() << ", "
+             << "isDebugSection: " << Section.isDebugSection() << "\n";
+      dbgs() << "\n";
+    });
+
     if (!Section.isText() || Section.isVirtual() || !Section.getSize())
       continue;
 
@@ -161,8 +185,14 @@ void LVBinaryReader::mapVirtualAddress(const object::ObjectFile &Obj) {
       continue;
     }
     if ((*SectionNameOrErr).equals(".text") ||
-        (*SectionNameOrErr).equals(".code"))
+        (*SectionNameOrErr).equals("CODE") ||
+        (*SectionNameOrErr).equals(".code")) {
       DotTextSectionIndex = Section.getIndex();
+      // If the object is WebAssembly, update the address offset that
+      // will be added to DWARF DW_AT_* attributes.
+      if (Obj.isWasm())
+        WasmCodeSectionOffset = Section.getAddress();
+    }
   }
 
   // Process the symbol table.
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVELFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
similarity index 94%
rename from llvm/lib/DebugInfo/LogicalView/Readers/LVELFReader.cpp
rename to llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
index 4469092099dac..91e5a037054da 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVELFReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
@@ -1,4 +1,4 @@
-//===-- LVELFReader.cpp ---------------------------------------------------===//
+//===-- LVDWARFReader.cpp -------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This implements the LVELFReader class.
-// It supports ELF and Mach-O formats.
+// This implements the LVDWARFReader class.
+// It supports ELF, Mach-O and Wasm binary formats.
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/LogicalView/Readers/LVELFReader.h"
+#include "llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
 #include "llvm/DebugInfo/DWARF/DWARFExpression.h"
@@ -27,9 +27,9 @@ using namespace llvm;
 using namespace llvm::object;
 using namespace llvm::logicalview;
 
-#define DEBUG_TYPE "ElfReader"
+#define DEBUG_TYPE "DWARFReader"
 
-LVElement *LVELFReader::createElement(dwarf::Tag Tag) {
+LVElement *LVDWARFReader::createElement(dwarf::Tag Tag) {
   CurrentScope = nullptr;
   CurrentSymbol = nullptr;
   CurrentType = nullptr;
@@ -243,8 +243,9 @@ LVElement *LVELFReader::createElement(dwarf::Tag Tag) {
   return nullptr;
 }
 
-void LVELFReader::processOneAttribute(const DWARFDie &Die, LVOffset *OffsetPtr,
-                                      const AttributeSpec &AttrSpec) {
+void LVDWARFReader::processOneAttribute(const DWARFDie &Die,
+                                        LVOffset *OffsetPtr,
+                                        const AttributeSpec &AttrSpec) {
   uint64_t OffsetOnEntry = *OffsetPtr;
   DWARFUnit *U = Die.getDwarfUnit();
   const DWARFFormValue &FormValue =
@@ -415,6 +416,8 @@ void LVELFReader::processOneAttribute(const DWARFDie &Die, LVOffset *OffsetPtr,
       if (FoundLowPC) {
         if (CurrentLowPC == MaxAddress)
           CurrentElement->setIsDiscarded();
+        // Consider the case of WebAssembly.
+        CurrentLowPC += WasmCodeSectionOffset;
         if (CurrentElement->isCompileUnit())
           setCUBaseAddress(CurrentLowPC);
       }
@@ -429,10 +432,17 @@ void LVELFReader::processOneAttribute(const DWARFDie &Die, LVOffset *OffsetPtr,
         CurrentHighPC = *Address;
       if (std::optional<uint64_t> Offset = FormValue.getAsUnsignedConstant())
         // High PC is an offset from LowPC.
-        CurrentHighPC = CurrentLowPC + *Offset;
+        // Don't add the WebAssembly offset if we have seen a DW_AT_low_pc, as
+        // the CurrentLowPC has already that offset added. Basically, use the
+        // original DW_AT_loc_pc value.
+        CurrentHighPC =
+            (FoundLowPC ? CurrentLowPC - WasmCodeSectionOffset : CurrentLowPC) +
+            *Offset;
       // Store the real upper limit for the address range.
       if (UpdateHighAddress && CurrentHighPC > 0)
         --CurrentHighPC;
+      // Consider the case of WebAssembly.
+      CurrentHighPC += WasmCodeSectionOffset;
       if (CurrentElement->isCompileUnit())
         setCUHighAddress(CurrentHighPC);
     }
@@ -466,6 +476,9 @@ void LVELFReader::processOneAttribute(const DWARFDie &Die, LVOffset *OffsetPtr,
         // Store the real upper limit for the address range.
         if (UpdateHighAddress && Range.HighPC > 0)
           --Range.HighPC;
+        // Consider the case of WebAssembly.
+        Range.LowPC += WasmCodeSectionOffset;
+        Range.HighPC += WasmCodeSectionOffset;
         // Add the pair of addresses.
         CurrentScope->addObject(Range.LowPC, Range.HighPC);
         // If the scope is the CU, do not update the ranges set.
@@ -503,8 +516,8 @@ void LVELFReader::processOneAttribute(const DWARFDie &Die, LVOffset *OffsetPtr,
   }
 }
 
-LVScope *LVELFReader::processOneDie(const DWARFDie &InputDIE, LVScope *Parent,
-                                    DWARFDie &SkeletonDie) {
+LVScope *LVDWARFReader::processOneDie(const DWARFDie &InputDIE, LVScope *Parent,
+                                      DWARFDie &SkeletonDie) {
   // If the input DIE corresponds to the compile unit, it can be:
   // a) Simple DWARF: a standard DIE. Ignore the skeleton DIE (is empty).
   // b) Split DWARF: the DIE for the split DWARF. The skeleton is the DIE
@@ -676,8 +689,8 @@ LVScope *LVELFReader::processOneDie(const DWARFDie &InputDIE, LVScope *Parent,
   return CurrentScope;
 }
 
-void LVELFReader::traverseDieAndChildren(DWARFDie &DIE, LVScope *Parent,
-                                         DWARFDie &SkeletonDie) {
+void LVDWARFReader::traverseDieAndChildren(DWARFDie &DIE, LVScope *Parent,
+                                           DWARFDie &SkeletonDie) {
   // Process the current DIE.
   LVScope *Scope = processOneDie(DIE, Parent, SkeletonDie);
   if (Scope) {
@@ -697,13 +710,13 @@ void LVELFReader::traverseDieAndChildren(DWARFDie &DIE, LVScope *Parent,
   }
 }
 
-void LVELFReader::processLocationGaps() {
+void LVDWARFReader::processLocationGaps() {
   if (options().getAttributeAnyLocation())
     for (LVSymbol *Symbol : SymbolsWithLocations)
       Symbol->fillLocationGaps();
 }
 
-void LVELFReader::createLineAndFileRecords(
+void LVDWARFReader::createLineAndFileRecords(
     const DWARFDebugLine::LineTable *Lines) {
   if (!Lines)
     return;
@@ -735,7 +748,8 @@ void LVELFReader::createLineAndFileRecords(
       // and they will be released when its scope parent is deleted.
       LVLineDebug *Line = createLineDebug();
       CULines.push_back(Line);
-      Line->setAddress(Row.Address.Address);
+      // Consider the case of WebAssembly.
+      Line->setAddress(Row.Address.Address + WasmCodeSectionOffset);
       Line->setFilename(
           CompileUnit->getFilename(IncrementIndex ? Row.File + 1 : Row.File));
       Line->setLineNumber(Row.Line);
@@ -759,8 +773,8 @@ void LVELFReader::createLineAndFileRecords(
     }
 }
 
-std::string LVELFReader::getRegisterName(LVSmall Opcode,
-                                         ArrayRef<uint64_t> Operands) {
+std::string LVDWARFReader::getRegisterName(LVSmall Opcode,
+                                           ArrayRef<uint64_t> Operands) {
   // The 'prettyPrintRegisterOp' function uses the DWARFUnit to support
   // DW_OP_regval_type. At this point we are operating on a logical view
   // item, with no access to the underlying DWARF data used by LLVM.
@@ -787,7 +801,7 @@ std::string LVELFReader::getRegisterName(LVSmall Opcode,
   return Stream.str();
 }
 
-Error LVELFReader::createScopes() {
+Error LVDWARFReader::createScopes() {
   LLVM_DEBUG({
     W.startLine() << "\n";
     W.printString("File", Obj.getFileName().str());
@@ -967,11 +981,11 @@ Error LVELFReader::createScopes() {
 }
 
 // Get the location information for the associated attribute.
-void LVELFReader::processLocationList(dwarf::Attribute Attr,
-                                      const DWARFFormValue &FormValue,
-                                      const DWARFDie &Die,
-                                      uint64_t OffsetOnEntry,
-                                      bool CallSiteLocation) {
+void LVDWARFReader::processLocationList(dwarf::Attribute Attr,
+                                        const DWARFFormValue &FormValue,
+                                        const DWARFDie &Die,
+                                        uint64_t OffsetOnEntry,
+                                        bool CallSiteLocation) {
 
   auto ProcessLocationExpression = [&](const DWARFExpression &Expression) {
     for (const DWARFExpression::Operation &Op : Expression)
@@ -1048,10 +1062,10 @@ void LVELFReader::processLocationList(dwarf::Attribute Attr,
   }
 }
 
-void LVELFReader::processLocationMember(dwarf::Attribute Attr,
-                                        const DWARFFormValue &FormValue,
-                                        const DWARFDie &Die,
-                                        uint64_t OffsetOnEntry) {
+void LVDWARFReader::processLocationMember(dwarf::Attribute Attr,
+                                          const DWARFFormValue &FormValue,
+                                          const DWARFDie &Die,
+                                          uint64_t OffsetOnEntry) {
   // Check if the value is an integer constant.
   if (FormValue.isFormClass(DWARFFormValue::FC_Constant))
     // Add a record to hold a constant as location.
@@ -1063,8 +1077,8 @@ void LVELFReader::processLocationMember(dwarf::Attribute Attr,
 }
 
 // Update the current element with the reference.
-void LVELFReader::updateReference(dwarf::Attribute Attr,
-                                  const DWARFFormValue &FormValue) {
+void LVDWARFReader::updateReference(dwarf::Attribute Attr,
+                                    const DWARFFormValue &FormValue) {
   // FIXME: We are assuming that at most one Reference (DW_AT_specification,
   // DW_AT_abstract_origin, ...) and at most one Type (DW_AT_import, DW_AT_type)
   // appear in any single DIE, but this may not be true.
@@ -1116,8 +1130,8 @@ void LVELFReader::updateReference(dwarf::Attribute Attr,
 }
 
 // Get an element given the DIE offset.
-LVElement *LVELFReader::getElementForOffset(LVOffset Offset, LVElement *Element,
-                                            bool IsType) {
+LVElement *LVDWARFReader::getElementForOffset(LVOffset Offset,
+                                              LVElement *Element, bool IsType) {
   auto Iter = ElementTable.try_emplace(Offset).first;
   // Update the element and all the references pointing to this element.
   LVElementEntry &Entry = Iter->second;
@@ -1130,7 +1144,7 @@ LVElement *LVELFReader::getElementForOffset(LVOffset Offset, LVElement *Element,
   return Entry.Element;
 }
 
-Error LVELFReader::loadTargetInfo(const ObjectFile &Obj) {
+Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) {
   // Detect the architecture from the object file. We usually don't need OS
   // info to lookup a target and create register info.
   Triple TT;
@@ -1149,7 +1163,7 @@ Error LVELFReader::loadTargetInfo(const ObjectFile &Obj) {
   return loadGenericTargetInfo(TT.str(), FeaturesValue.getString());
 }
 
-void LVELFReader::mapRangeAddress(const ObjectFile &Obj) {
+void LVDWARFReader::mapRangeAddress(const ObjectFile &Obj) {
   for (auto Iter = Obj.symbol_begin(); Iter != Obj.symbol_end(); ++Iter) {
     const SymbolRef &Symbol = *Iter;
 
@@ -1225,9 +1239,9 @@ void LVELFReader::mapRangeAddress(const ObjectFile &Obj) {
   }
 }
 
-void LVELFReader::sortScopes() { Root->sort(); }
+void LVDWARFReader::sortScopes() { Root->sort(); }
 
-void LVELFReader::print(raw_ostream &OS) const {
+void LVDWARFReader::print(raw_ostream &OS) const {
   OS << "LVType\n";
   LLVM_DEBUG(dbgs() << "CreateReaders\n");
 }
diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
index 5f29226c14b70..32c5e3251df62 100644
--- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -628,13 +628,20 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
   ObjectPair Objects = ObjectsOrErr.get();
 
   std::unique_ptr<DIContext> Context;
-  // If this is a COFF object containing PDB info, use a PDBContext to
-  // symbolize. Otherwise, use DWARF.
+  // If this is a COFF object containing PDB info and not containing DWARF
+  // section, use a PDBContext to symbolize. Otherwise, use DWARF.
   if (auto CoffObject = dyn_cast<COFFObjectFile>(Objects.first)) {
     const codeview::DebugInfo *DebugInfo;
     StringRef PDBFileName;
     auto EC = CoffObject->getDebugPDBInfo(DebugInfo, PDBFileName);
-    if (!EC && DebugInfo != nullptr && !PDBFileName.empty()) {
+    // Use DWARF if there're DWARF sections.
+    bool HasDwarf =
+        llvm::any_of(Objects.first->sections(), [](SectionRef Section) -> bool {
+          if (Expected<StringRef> SectionName = Section.getName())
+            return SectionName.get() == ".debug_info";
+          return false;
+        });
+    if (!EC && !HasDwarf && DebugInfo != nullptr && !PDBFileName.empty()) {
       using namespace pdb;
       std::unique_ptr<IPDBSession> Session;
 
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 994acf5843642..1fa8a1274911c 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -255,7 +255,7 @@ struct ObjCImageInfoFlags {
 namespace llvm {
 namespace orc {
 
-MachOPlatform::HeaderOptions::BuildVersionOpts
+std::optional<MachOPlatform::HeaderOptions::BuildVersionOpts>
 MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(const Triple &TT,
                                                            uint32_t MinOS,
                                                            uint32_t SDK) {
@@ -263,26 +263,25 @@ MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(const Triple &TT,
   uint32_t Platform;
   switch (TT.getOS()) {
   case Triple::IOS:
-    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_IOS
-                                           : MachO::PLATFORM_IOSSIMULATOR;
+    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_IOSSIMULATOR
+                                           : MachO::PLATFORM_IOS;
     break;
   case Triple::MacOSX:
     Platform = MachO::PLATFORM_MACOS;
     break;
   case Triple::TvOS:
-    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_TVOS
-                                           : MachO::PLATFORM_TVOSSIMULATOR;
+    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_TVOSSIMULATOR
+                                           : MachO::PLATFORM_TVOS;
     break;
   case Triple::WatchOS:
-    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_WATCHOS
-                                           : MachO::PLATFORM_WATCHOSSIMULATOR;
+    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_WATCHOSSIMULATOR
+                                           : MachO::PLATFORM_WATCHOS;
     break;
   default:
-    Platform = MachO::PLATFORM_UNKNOWN;
-    break;
+    return std::nullopt;
   }
 
-  return {Platform, MinOS, SDK};
+  return MachOPlatform::HeaderOptions::BuildVersionOpts{Platform, MinOS, SDK};
 }
 
 Expected<std::unique_ptr<MachOPlatform>> MachOPlatform::Create(
@@ -1725,11 +1724,9 @@ jitlink::Block &createHeaderBlock(MachOPlatform &MOP,
   else
     B.template addLoadCommand<MachO::LC_ID_DYLIB>(JD.getName(), 0, 0, 0);
 
-  if (Opts.BuildVersion)
+  for (auto &BV : Opts.BuildVersions)
     B.template addLoadCommand<MachO::LC_BUILD_VERSION>(
-        Opts.BuildVersion->Platform, Opts.BuildVersion->MinOS,
-        Opts.BuildVersion->SDK, static_cast<uint32_t>(0));
-
+        BV.Platform, BV.MinOS, BV.SDK, static_cast<uint32_t>(0));
   for (auto &D : Opts.LoadDylibs)
     B.template addLoadCommand<MachO::LC_LOAD_DYLIB>(
         D.Name, D.Timestamp, D.CurrentVersion, D.CompatibilityVersion);
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp
index d346214d3ae29..57ac991ee37f3 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.cpp
@@ -87,7 +87,7 @@ static void registerJITLoaderVTuneUnregisterImpl(
   for (auto &Method : UM) {
     JITEventWrapper::Wrapper->iJIT_NotifyEvent(
         iJVM_EVENT_TYPE_METHOD_UNLOAD_START,
-        const_cast<unsigned long *>(&Method.first));
+        const_cast<uint64_t *>(&Method.first));
   }
 }
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
index e870c5aa2ba6b..37936d6000c89 100644
--- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
@@ -44,6 +44,7 @@ OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple) {
   case Triple::ppcle:
   case Triple::ppc64:
   case Triple::ppc64le:
+  case Triple::systemz:
   case Triple::x86:
   case Triple::x86_64:
     ActiveTraits.set(unsigned(TraitProperty::device_kind_cpu));
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c74c898e1e882..16507a69ea850 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4883,8 +4883,11 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
     return InsertPointTy();
 
   // Disable TargetData CodeGen on Device pass.
-  if (Config.IsTargetDevice.value_or(false))
+  if (Config.IsTargetDevice.value_or(false)) {
+    if (BodyGenCB)
+      Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
     return Builder.saveIP();
+  }
 
   Builder.restoreIP(CodeGenIP);
   bool IsStandAlone = !BodyGenCB;
@@ -5070,10 +5073,15 @@ FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
 
 static void replaceConstatExprUsesInFuncWithInstr(ConstantExpr *ConstExpr,
                                                   Function *Func) {
-  for (User *User : make_early_inc_range(ConstExpr->users()))
-    if (auto *Instr = dyn_cast<Instruction>(User))
-      if (Instr->getFunction() == Func)
-        Instr->replaceUsesOfWith(ConstExpr, ConstExpr->getAsInstruction(Instr));
+  for (User *User : make_early_inc_range(ConstExpr->users())) {
+    if (auto *Instr = dyn_cast<Instruction>(User)) {
+      if (Instr->getFunction() == Func) {
+        Instruction *ConstInst = ConstExpr->getAsInstruction();
+        ConstInst->insertBefore(*Instr->getParent(), Instr->getIterator());
+        Instr->replaceUsesOfWith(ConstExpr, ConstInst);
+      }
+    }
+  }
 }
 
 static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input,
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 1beb4c069a699..d7fee609a2ba8 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -862,7 +862,7 @@ class SlotTracker : public AbstractSlotTrackerStorage {
   void processInstructionMetadata(const Instruction &I);
 
   /// Add all of the metadata from a DbgRecord.
-  void processDbgRecordMetadata(const DbgRecord &DPV);
+  void processDbgRecordMetadata(const DbgRecord &DVR);
 };
 
 } // end namespace llvm
@@ -1139,13 +1139,19 @@ void SlotTracker::processFunctionMetadata(const Function &F) {
 }
 
 void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) {
-  if (const DPValue *DPV = dyn_cast<const DPValue>(&DR)) {
+  if (const DbgVariableRecord *DVR = dyn_cast<const DbgVariableRecord>(&DR)) {
     // Process metadata used by DbgRecords; we only specifically care about the
     // DILocalVariable, DILocation, and DIAssignID fields, as the Value and
     // Expression fields should only be printed inline and so do not use a slot.
-    CreateMetadataSlot(DPV->getRawVariable());
-    if (DPV->isDbgAssign())
-      CreateMetadataSlot(cast<MDNode>(DPV->getRawAssignID()));
+    // Note: The above doesn't apply for empty-metadata operands.
+    if (auto *Empty = dyn_cast<MDNode>(DVR->getRawLocation()))
+      CreateMetadataSlot(Empty);
+    CreateMetadataSlot(DVR->getRawVariable());
+    if (DVR->isDbgAssign()) {
+      CreateMetadataSlot(cast<MDNode>(DVR->getRawAssignID()));
+      if (auto *Empty = dyn_cast<MDNode>(DVR->getRawAddress()))
+        CreateMetadataSlot(Empty);
+    }
   } else if (const DPLabel *DPL = dyn_cast<const DPLabel>(&DR)) {
     CreateMetadataSlot(DPL->getRawLabel());
   } else {
@@ -2135,6 +2141,16 @@ static void writeDIDerivedType(raw_ostream &Out, const DIDerivedType *N,
     Printer.printInt("dwarfAddressSpace", *DWARFAddressSpace,
                      /* ShouldSkipZero */ false);
   Printer.printMetadata("annotations", N->getRawAnnotations());
+  if (auto PtrAuthData = N->getPtrAuthData()) {
+    Printer.printInt("ptrAuthKey", PtrAuthData->key());
+    Printer.printBool("ptrAuthIsAddressDiscriminated",
+                      PtrAuthData->isAddressDiscriminated());
+    Printer.printInt("ptrAuthExtraDiscriminator",
+                     PtrAuthData->extraDiscriminator());
+    Printer.printBool("ptrAuthIsaPointer", PtrAuthData->isaPointer());
+    Printer.printBool("ptrAuthAuthenticatesNullValues",
+                      PtrAuthData->authenticatesNullValues());
+  }
   Out << ")";
 }
 
@@ -2703,10 +2719,10 @@ class AssemblyWriter {
   void printInstructionLine(const Instruction &I);
   void printInstruction(const Instruction &I);
   void printDPMarker(const DPMarker &DPI);
-  void printDPValue(const DPValue &DPI);
+  void printDbgVariableRecord(const DbgVariableRecord &DVR);
   void printDPLabel(const DPLabel &DPL);
-  void printDbgRecord(const DbgRecord &DPI);
-  void printDbgRecordLine(const DbgRecord &DPI);
+  void printDbgRecord(const DbgRecord &DR);
+  void printDbgRecordLine(const DbgRecord &DR);
 
   void printUseListOrder(const Value *V, const std::vector<unsigned> &Shuffle);
   void printUseLists(const Function *F);
@@ -4592,7 +4608,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
 void AssemblyWriter::printDPMarker(const DPMarker &Marker) {
   // There's no formal representation of a DPMarker -- print purely as a
   // debugging aid.
-  for (const DbgRecord &DPR : Marker.StoredDPValues) {
+  for (const DbgRecord &DPR : Marker.StoredDbgRecords) {
     printDbgRecord(DPR);
     Out << "\n";
   }
@@ -4604,46 +4620,47 @@ void AssemblyWriter::printDPMarker(const DPMarker &Marker) {
 }
 
 void AssemblyWriter::printDbgRecord(const DbgRecord &DR) {
-  if (auto *DPV = dyn_cast<DPValue>(&DR))
-    printDPValue(*DPV);
+  if (auto *DVR = dyn_cast<DbgVariableRecord>(&DR))
+    printDbgVariableRecord(*DVR);
   else if (auto *DPL = dyn_cast<DPLabel>(&DR))
     printDPLabel(*DPL);
   else
     llvm_unreachable("Unexpected DbgRecord kind");
 }
 
-void AssemblyWriter::printDPValue(const DPValue &DPV) {
+void AssemblyWriter::printDbgVariableRecord(const DbgVariableRecord &DVR) {
   auto WriterCtx = getContext();
   Out << "#dbg_";
-  switch (DPV.getType()) {
-  case DPValue::LocationType::Value:
+  switch (DVR.getType()) {
+  case DbgVariableRecord::LocationType::Value:
     Out << "value";
     break;
-  case DPValue::LocationType::Declare:
+  case DbgVariableRecord::LocationType::Declare:
     Out << "declare";
     break;
-  case DPValue::LocationType::Assign:
+  case DbgVariableRecord::LocationType::Assign:
     Out << "assign";
     break;
   default:
-    llvm_unreachable("Tried to print a DPValue with an invalid LocationType!");
+    llvm_unreachable(
+        "Tried to print a DbgVariableRecord with an invalid LocationType!");
   }
   Out << "(";
-  WriteAsOperandInternal(Out, DPV.getRawLocation(), WriterCtx, true);
+  WriteAsOperandInternal(Out, DVR.getRawLocation(), WriterCtx, true);
   Out << ", ";
-  WriteAsOperandInternal(Out, DPV.getRawVariable(), WriterCtx, true);
+  WriteAsOperandInternal(Out, DVR.getRawVariable(), WriterCtx, true);
   Out << ", ";
-  WriteAsOperandInternal(Out, DPV.getRawExpression(), WriterCtx, true);
+  WriteAsOperandInternal(Out, DVR.getRawExpression(), WriterCtx, true);
   Out << ", ";
-  if (DPV.isDbgAssign()) {
-    WriteAsOperandInternal(Out, DPV.getRawAssignID(), WriterCtx, true);
+  if (DVR.isDbgAssign()) {
+    WriteAsOperandInternal(Out, DVR.getRawAssignID(), WriterCtx, true);
     Out << ", ";
-    WriteAsOperandInternal(Out, DPV.getRawAddress(), WriterCtx, true);
+    WriteAsOperandInternal(Out, DVR.getRawAddress(), WriterCtx, true);
     Out << ", ";
-    WriteAsOperandInternal(Out, DPV.getRawAddressExpression(), WriterCtx, true);
+    WriteAsOperandInternal(Out, DVR.getRawAddressExpression(), WriterCtx, true);
     Out << ", ";
   }
-  WriteAsOperandInternal(Out, DPV.getDebugLoc().getAsMDNode(), WriterCtx, true);
+  WriteAsOperandInternal(Out, DVR.getDebugLoc().getAsMDNode(), WriterCtx, true);
   Out << ")";
 }
 
@@ -4897,7 +4914,7 @@ void DPMarker::print(raw_ostream &ROS, bool IsForDebug) const {
   print(ROS, MST, IsForDebug);
 }
 
-void DPValue::print(raw_ostream &ROS, bool IsForDebug) const {
+void DbgVariableRecord::print(raw_ostream &ROS, bool IsForDebug) const {
 
   ModuleSlotTracker MST(getModuleFromDPI(this), true);
   print(ROS, MST, IsForDebug);
@@ -4924,8 +4941,8 @@ void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const {
   print(ROS, MST, IsForDebug);
 }
 
-void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
-                    bool IsForDebug) const {
+void DbgVariableRecord::print(raw_ostream &ROS, ModuleSlotTracker &MST,
+                              bool IsForDebug) const {
   formatted_raw_ostream OS(ROS);
   SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr));
   SlotTracker &SlotTable =
@@ -4938,7 +4955,7 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
                           ? Marker->getParent()->getParent()
                           : nullptr);
   AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug);
-  W.printDPValue(*this);
+  W.printDbgVariableRecord(*this);
 }
 
 void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index a10a55bb9dd82..a316aa55bb808 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -647,11 +647,12 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
     // v16i8 respectively.
     if (Name.consume_front("bfdot.")) {
       // (arm|aarch64).neon.bfdot.*'.
-      Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name)
-                             .Cases("v2f32.v8i8", "v4f32.v16i8",
-                                    IsArm ? Intrinsic::arm_neon_bfdot
-                                          : Intrinsic::aarch64_neon_bfdot)
-                             .Default(Intrinsic::not_intrinsic);
+      Intrinsic::ID ID =
+          StringSwitch<Intrinsic::ID>(Name)
+              .Cases("v2f32.v8i8", "v4f32.v16i8",
+                     IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfdot
+                           : (Intrinsic::ID)Intrinsic::aarch64_neon_bfdot)
+              .Default(Intrinsic::not_intrinsic);
       if (ID != Intrinsic::not_intrinsic) {
         size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits();
         assert((OperandWidth == 64 || OperandWidth == 128) &&
@@ -674,12 +675,15 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
         // (arm|aarch64).neon.bfm*.v4f32.v16i8'.
         Intrinsic::ID ID =
             StringSwitch<Intrinsic::ID>(Name)
-                .Case("mla", IsArm ? Intrinsic::arm_neon_bfmmla
-                                   : Intrinsic::aarch64_neon_bfmmla)
-                .Case("lalb", IsArm ? Intrinsic::arm_neon_bfmlalb
-                                    : Intrinsic::aarch64_neon_bfmlalb)
-                .Case("lalt", IsArm ? Intrinsic::arm_neon_bfmlalt
-                                    : Intrinsic::aarch64_neon_bfmlalt)
+                .Case("mla",
+                      IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfmmla
+                            : (Intrinsic::ID)Intrinsic::aarch64_neon_bfmmla)
+                .Case("lalb",
+                      IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfmlalb
+                            : (Intrinsic::ID)Intrinsic::aarch64_neon_bfmlalb)
+                .Case("lalt",
+                      IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfmlalt
+                            : (Intrinsic::ID)Intrinsic::aarch64_neon_bfmlalt)
                 .Default(Intrinsic::not_intrinsic);
         if (ID != Intrinsic::not_intrinsic) {
           NewFn = Intrinsic::getDeclaration(F->getParent(), ID);
@@ -1052,6 +1056,18 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
   }
   case 'd':
     if (Name.consume_front("dbg.")) {
+      // Mark debug intrinsics for upgrade to new debug format.
+      if (F->getParent()->IsNewDbgInfoFormat) {
+        if (Name == "addr" || Name == "value" || Name == "assign" ||
+            Name == "declare" || Name == "label") {
+          // There's no function to replace these with.
+          NewFn = nullptr;
+          // But we do want these to get upgraded.
+          return true;
+        }
+      }
+      // Update llvm.dbg.addr intrinsics even in "new debug mode"; they'll get
+      // converted to DbgVariableRecords later.
       if (Name == "addr" || (Name == "value" && F->arg_size() == 4)) {
         rename(F);
         NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::dbg_value);
@@ -2322,6 +2338,59 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
   llvm_unreachable("Unknown function for AMDGPU intrinsic upgrade.");
 }
 
+/// Helper to unwrap intrinsic call MetadataAsValue operands.
+template <typename MDType>
+static MDType *unwrapMAVOp(CallBase *CI, unsigned Op) {
+  if (MetadataAsValue *MAV = dyn_cast<MetadataAsValue>(CI->getArgOperand(Op)))
+    return dyn_cast<MDType>(MAV->getMetadata());
+  return nullptr;
+}
+
+/// Convert debug intrinsic calls to non-instruction debug records.
+/// \p Name - Final part of the intrinsic name, e.g. 'value' in llvm.dbg.value.
+/// \p CI - The debug intrinsic call.
+static void upgradeDbgIntrinsicToDbgRecord(StringRef Name, CallBase *CI) {
+  DbgRecord *DR = nullptr;
+  if (Name == "label") {
+    DR = new DPLabel(unwrapMAVOp<DILabel>(CI, 0), CI->getDebugLoc());
+  } else if (Name == "assign") {
+    DR = new DbgVariableRecord(
+        unwrapMAVOp<Metadata>(CI, 0), unwrapMAVOp<DILocalVariable>(CI, 1),
+        unwrapMAVOp<DIExpression>(CI, 2), unwrapMAVOp<DIAssignID>(CI, 3),
+        unwrapMAVOp<Metadata>(CI, 4), unwrapMAVOp<DIExpression>(CI, 5),
+        CI->getDebugLoc());
+  } else if (Name == "declare") {
+    DR = new DbgVariableRecord(
+        unwrapMAVOp<Metadata>(CI, 0), unwrapMAVOp<DILocalVariable>(CI, 1),
+        unwrapMAVOp<DIExpression>(CI, 2), CI->getDebugLoc(),
+        DbgVariableRecord::LocationType::Declare);
+  } else if (Name == "addr") {
+    // Upgrade dbg.addr to dbg.value with DW_OP_deref.
+    DIExpression *Expr = unwrapMAVOp<DIExpression>(CI, 2);
+    Expr = DIExpression::append(Expr, dwarf::DW_OP_deref);
+    DR = new DbgVariableRecord(unwrapMAVOp<Metadata>(CI, 0),
+                               unwrapMAVOp<DILocalVariable>(CI, 1), Expr,
+                               CI->getDebugLoc());
+  } else if (Name == "value") {
+    // An old version of dbg.value had an extra offset argument.
+    unsigned VarOp = 1;
+    unsigned ExprOp = 2;
+    if (CI->arg_size() == 4) {
+      auto *Offset = dyn_cast_or_null<Constant>(CI->getArgOperand(1));
+      // Nonzero offset dbg.values get dropped without a replacement.
+      if (!Offset || !Offset->isZeroValue())
+        return;
+      VarOp = 2;
+      ExprOp = 3;
+    }
+    DR = new DbgVariableRecord(
+        unwrapMAVOp<Metadata>(CI, 0), unwrapMAVOp<DILocalVariable>(CI, VarOp),
+        unwrapMAVOp<DIExpression>(CI, ExprOp), CI->getDebugLoc());
+  }
+  assert(DR && "Unhandled intrinsic kind in upgrade to DbgRecord");
+  CI->getParent()->insertDbgRecordBefore(DR, CI->getIterator());
+}
+
 /// Upgrade a call to an old intrinsic. All argument and return casting must be
 /// provided to seamlessly integrate with existing context.
 void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
@@ -2347,6 +2416,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     bool IsNVVM = Name.consume_front("nvvm.");
     bool IsARM = Name.consume_front("arm.");
     bool IsAMDGCN = Name.consume_front("amdgcn.");
+    bool IsDbg = Name.consume_front("dbg.");
 
     if (IsX86 && Name.starts_with("sse4a.movnt.")) {
       SmallVector<Metadata *, 1> Elts;
@@ -2451,7 +2521,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
       return;
     }
 
-    Value *Rep;
+    Value *Rep = nullptr;
     // Upgrade packed integer vector compare intrinsics to compare instructions.
     if (IsX86 && (Name.starts_with("sse2.pcmp") ||
                   Name.starts_with("avx2.pcmp"))) {
@@ -4186,6 +4256,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
       Rep = upgradeARMIntrinsicCall(Name, CI, F, Builder);
     } else if (IsAMDGCN) {
       Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder);
+    } else if (IsDbg && CI->getModule()->IsNewDbgInfoFormat) {
+      upgradeDbgIntrinsicToDbgRecord(Name, CI);
     } else {
       llvm_unreachable("Unknown function for CallBase upgrade.");
     }
@@ -5172,72 +5244,6 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
     Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType()));
 }
 
-// Check if the module attribute is present and not zero.
-static bool isModuleAttributeSet(Module &M, const StringRef &ModAttr) {
-  const auto *Attr =
-      mdconst::extract_or_null<ConstantInt>(M.getModuleFlag(ModAttr));
-  return Attr && Attr->getZExtValue();
-}
-
-// Copy an attribute from module to the function if exists.
-// First value of the pair is used when the module attribute is not zero
-// the second otherwise.
-static void
-CopyModuleAttributeToFunction(Function &F, StringRef FnAttrName,
-                              StringRef ModAttrName,
-                              std::pair<StringRef, StringRef> Values) {
-  if (F.hasFnAttribute(FnAttrName))
-    return;
-  F.addFnAttr(FnAttrName, isModuleAttributeSet(*F.getParent(), ModAttrName)
-                              ? Values.first
-                              : Values.second);
-}
-
-// Copy a boolean attribute from module to the function if exists.
-// Module attribute treated false if zero otherwise true.
-static void CopyModuleAttributeToFunction(Function &F, StringRef AttrName) {
-  CopyModuleAttributeToFunction(
-      F, AttrName, AttrName,
-      std::make_pair<StringRef, StringRef>("true", "false"));
-}
-
-// Copy an attribute from module to the function if exists.
-// First value of the pair is used when the module attribute is not zero
-// the second otherwise.
-static void
-CopyModuleAttributeToFunction(Function &F, StringRef AttrName,
-                              std::pair<StringRef, StringRef> Values) {
-  CopyModuleAttributeToFunction(F, AttrName, AttrName, Values);
-}
-
-void llvm::CopyModuleAttrToFunctions(Module &M) {
-  Triple T(M.getTargetTriple());
-  if (!T.isThumb() && !T.isARM() && !T.isAArch64())
-    return;
-
-  for (Function &F : M.getFunctionList()) {
-    if (F.isDeclaration())
-      continue;
-
-    if (!F.hasFnAttribute("sign-return-address")) {
-      StringRef SignType = "none";
-      if (isModuleAttributeSet(M, "sign-return-address"))
-        SignType = "non-leaf";
-
-      if (isModuleAttributeSet(M, "sign-return-address-all"))
-        SignType = "all";
-
-      F.addFnAttr("sign-return-address", SignType);
-    }
-    CopyModuleAttributeToFunction(F, "branch-target-enforcement");
-    CopyModuleAttributeToFunction(F, "branch-protection-pauth-lr");
-    CopyModuleAttributeToFunction(F, "guarded-control-stack");
-    CopyModuleAttributeToFunction(
-        F, "sign-return-address-key",
-        std::make_pair<StringRef, StringRef>("b_key", "a_key"));
-  }
-}
-
 static bool isOldLoopArgument(Metadata *MD) {
   auto *T = dyn_cast_or_null<MDTuple>(MD);
   if (!T)
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 7ead7ce3bf08a..2dff6e4d6d9c3 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -36,6 +36,11 @@ cl::opt<bool>
                                  "through iterators, eliminating intrinsics"),
                         cl::init(true));
 
+bool WriteNewDbgInfoFormatToBitcode /*set default value in cl::init() below*/;
+cl::opt<bool, true> WriteNewDbgInfoFormatToBitcode2(
+    "write-experimental-debuginfo-iterators-to-bitcode", cl::Hidden,
+    cl::location(WriteNewDbgInfoFormatToBitcode), cl::init(false));
+
 DPMarker *BasicBlock::createMarker(Instruction *I) {
   assert(IsNewDbgInfoFormat &&
          "Tried to create a marker in a non new debug-info block!");
@@ -63,37 +68,37 @@ DPMarker *BasicBlock::createMarker(InstListType::iterator It) {
 void BasicBlock::convertToNewDbgValues() {
   IsNewDbgInfoFormat = true;
 
-  // Iterate over all instructions in the instruction list, collecting dbg.value
-  // instructions and converting them to DPValues. Once we find a "real"
-  // instruction, attach all those DPValues to a DPMarker in that instruction.
-  SmallVector<DbgRecord *, 4> DPVals;
+  // Iterate over all instructions in the instruction list, collecting debug
+  // info intrinsics and converting them to DbgRecords. Once we find a "real"
+  // instruction, attach all those DbgRecords to a DPMarker in that instruction.
+  SmallVector<DbgRecord *, 4> DbgVarRecs;
   for (Instruction &I : make_early_inc_range(InstList)) {
     assert(!I.DbgMarker && "DbgMarker already set on old-format instrs?");
     if (DbgVariableIntrinsic *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) {
-      // Convert this dbg.value to a DPValue.
-      DPValue *Value = new DPValue(DVI);
-      DPVals.push_back(Value);
+      // Convert this dbg.value to a DbgVariableRecord.
+      DbgVariableRecord *Value = new DbgVariableRecord(DVI);
+      DbgVarRecs.push_back(Value);
       DVI->eraseFromParent();
       continue;
     }
 
     if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(&I)) {
-      DPVals.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc()));
+      DbgVarRecs.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc()));
       DLI->eraseFromParent();
       continue;
     }
 
-    if (DPVals.empty())
+    if (DbgVarRecs.empty())
       continue;
 
-    // Create a marker to store DPValues in.
+    // Create a marker to store DbgRecords in.
     createMarker(&I);
     DPMarker *Marker = I.DbgMarker;
 
-    for (DbgRecord *DPV : DPVals)
-      Marker->insertDbgRecord(DPV, false);
+    for (DbgRecord *DVR : DbgVarRecs)
+      Marker->insertDbgRecord(DVR, false);
 
-    DPVals.clear();
+    DbgVarRecs.clear();
   }
 }
 
@@ -102,7 +107,7 @@ void BasicBlock::convertFromNewDbgValues() {
   IsNewDbgInfoFormat = false;
 
   // Iterate over the block, finding instructions annotated with DPMarkers.
-  // Convert any attached DPValues to dbg.values and insert ahead of the
+  // Convert any attached DbgRecords to debug intrinsics and insert ahead of the
   // instruction.
   for (auto &Inst : *this) {
     if (!Inst.DbgMarker)
@@ -116,7 +121,7 @@ void BasicBlock::convertFromNewDbgValues() {
     Marker.eraseFromParent();
   }
 
-  // Assume no trailing DPValues: we could technically create them at the end
+  // Assume no trailing DbgRecords: we could technically create them at the end
   // of the block, after a terminator, but this would be non-cannonical and
   // indicates that something else is broken somewhere.
   assert(!getTrailingDbgRecords());
@@ -691,15 +696,15 @@ void BasicBlock::renumberInstructions() {
   NumInstrRenumberings++;
 }
 
-void BasicBlock::flushTerminatorDbgValues() {
-  // If we erase the terminator in a block, any DPValues will sink and "fall
+void BasicBlock::flushTerminatorDbgRecords() {
+  // If we erase the terminator in a block, any DbgRecords will sink and "fall
   // off the end", existing after any terminator that gets inserted. With
   // dbg.value intrinsics we would just insert the terminator at end() and
-  // the dbg.values would come before the terminator. With DPValues, we must
+  // the dbg.values would come before the terminator. With DbgRecords, we must
   // do this manually.
   // To get out of this unfortunate form, whenever we insert a terminator,
-  // check whether there's anything trailing at the end and move those DPValues
-  // in front of the terminator.
+  // check whether there's anything trailing at the end and move those
+  // DbgRecords in front of the terminator.
 
   // Do nothing if we're not in new debug-info format.
   if (!IsNewDbgInfoFormat)
@@ -710,15 +715,15 @@ void BasicBlock::flushTerminatorDbgValues() {
   if (!Term)
     return;
 
-  // Are there any dangling DPValues?
-  DPMarker *TrailingDPValues = getTrailingDbgRecords();
-  if (!TrailingDPValues)
+  // Are there any dangling DbgRecords?
+  DPMarker *TrailingDbgRecords = getTrailingDbgRecords();
+  if (!TrailingDbgRecords)
     return;
 
-  // Transfer DPValues from the trailing position onto the terminator.
+  // Transfer DbgRecords from the trailing position onto the terminator.
   createMarker(Term);
-  Term->DbgMarker->absorbDebugValues(*TrailingDPValues, false);
-  TrailingDPValues->eraseFromParent();
+  Term->DbgMarker->absorbDebugValues(*TrailingDbgRecords, false);
+  TrailingDbgRecords->eraseFromParent();
   deleteTrailingDbgRecords();
 }
 
@@ -735,7 +740,7 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
   // If an optimisation pass attempts to splice the contents of the block from
   // BB1->begin() to BB1->getTerminator(), then the dbg.value will be
   // transferred to the destination.
-  // However, in the "new" DPValue format for debug-info, that range is empty:
+  // However, in the "new" DbgRecord format for debug-info, that range is empty:
   // begin() returns an iterator to the terminator, as there will only be a
   // single instruction in the block. We must piece together from the bits set
   // in the iterators whether there was the intention to transfer any debug
@@ -750,16 +755,16 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
   bool ReadFromHead = First.getHeadBit();
 
   // If the source block is completely empty, including no terminator, then
-  // transfer any trailing DPValues that are still hanging around. This can
+  // transfer any trailing DbgRecords that are still hanging around. This can
   // occur when a block is optimised away and the terminator has been moved
   // somewhere else.
   if (Src->empty()) {
-    DPMarker *SrcTrailingDPValues = Src->getTrailingDbgRecords();
-    if (!SrcTrailingDPValues)
+    DPMarker *SrcTrailingDbgRecords = Src->getTrailingDbgRecords();
+    if (!SrcTrailingDbgRecords)
       return;
 
     Dest->adoptDbgRecords(Src, Src->end(), InsertAtHead);
-    // adoptDbgRecords should have released the trailing DPValues.
+    // adoptDbgRecords should have released the trailing DbgRecords.
     assert(!Src->getTrailingDbgRecords());
     return;
   }
@@ -785,8 +790,8 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
   /* Do a quick normalisation before calling the real splice implementation. We
      might be operating on a degenerate basic block that has no instructions
      in it, a legitimate transient state. In that case, Dest will be end() and
-     any DPValues temporarily stored in the TrailingDPValues map in LLVMContext.
-     We might illustrate it thus:
+     any DbgRecords temporarily stored in the TrailingDbgRecords map in
+     LLVMContext. We might illustrate it thus:
 
                          Dest
                            |
@@ -795,35 +800,35 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
                                 |               |
                                First           Last
 
-     However: does the caller expect the "~" DPValues to end up before or after
-     the spliced segment? This is communciated in the "Head" bit of Dest, which
-     signals whether the caller called begin() or end() on this block.
+     However: does the caller expect the "~" DbgRecords to end up before or
+     after the spliced segment? This is communciated in the "Head" bit of Dest,
+     which signals whether the caller called begin() or end() on this block.
 
-     If the head bit is set, then all is well, we leave DPValues trailing just
+     If the head bit is set, then all is well, we leave DbgRecords trailing just
      like how dbg.value instructions would trail after instructions spliced to
      the beginning of this block.
 
-     If the head bit isn't set, then try to jam the "~" DPValues onto the front
-     of the First instruction, then splice like normal, which joins the "~"
-     DPValues with the "+" DPValues. However if the "+" DPValues are supposed to
-     be left behind in Src, then:
-      * detach the "+" DPValues,
-      * move the "~" DPValues onto First,
+     If the head bit isn't set, then try to jam the "~" DbgRecords onto the
+     front of the First instruction, then splice like normal, which joins the
+     "~" DbgRecords with the "+" DbgRecords. However if the "+" DbgRecords are
+     supposed to be left behind in Src, then:
+      * detach the "+" DbgRecords,
+      * move the "~" DbgRecords onto First,
       * splice like normal,
-      * replace the "+" DPValues onto the Last position.
+      * replace the "+" DbgRecords onto the Last position.
      Complicated, but gets the job done. */
 
-  // If we're inserting at end(), and not in front of dangling DPValues, then
-  // move the DPValues onto "First". They'll then be moved naturally in the
+  // If we're inserting at end(), and not in front of dangling DbgRecords, then
+  // move the DbgRecords onto "First". They'll then be moved naturally in the
   // splice process.
-  DPMarker *MoreDanglingDPValues = nullptr;
-  DPMarker *OurTrailingDPValues = getTrailingDbgRecords();
-  if (Dest == end() && !Dest.getHeadBit() && OurTrailingDPValues) {
-    // Are the "+" DPValues not supposed to move? If so, detach them
+  DPMarker *MoreDanglingDbgRecords = nullptr;
+  DPMarker *OurTrailingDbgRecords = getTrailingDbgRecords();
+  if (Dest == end() && !Dest.getHeadBit() && OurTrailingDbgRecords) {
+    // Are the "+" DbgRecords not supposed to move? If so, detach them
     // temporarily.
     if (!First.getHeadBit() && First->hasDbgRecords()) {
-      MoreDanglingDPValues = Src->getMarker(First);
-      MoreDanglingDPValues->removeFromParent();
+      MoreDanglingDbgRecords = Src->getMarker(First);
+      MoreDanglingDbgRecords->removeFromParent();
     }
 
     if (First->hasDbgRecords()) {
@@ -839,8 +844,8 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
       // No current marker, create one and absorb in. (FIXME: we can avoid an
       // allocation in the future).
       DPMarker *CurMarker = Src->createMarker(&*First);
-      CurMarker->absorbDebugValues(*OurTrailingDPValues, false);
-      OurTrailingDPValues->eraseFromParent();
+      CurMarker->absorbDebugValues(*OurTrailingDbgRecords, false);
+      OurTrailingDbgRecords->eraseFromParent();
     }
     deleteTrailingDbgRecords();
     First.setHeadBit(true);
@@ -849,16 +854,16 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
   // Call the main debug-info-splicing implementation.
   spliceDebugInfoImpl(Dest, Src, First, Last);
 
-  // Do we have some "+" DPValues hanging around that weren't supposed to move,
-  // and we detached to make things easier?
-  if (!MoreDanglingDPValues)
+  // Do we have some "+" DbgRecords hanging around that weren't supposed to
+  // move, and we detached to make things easier?
+  if (!MoreDanglingDbgRecords)
     return;
 
   // FIXME: we could avoid an allocation here sometimes. (adoptDbgRecords
   // requires an iterator).
   DPMarker *LastMarker = Src->createMarker(Last);
-  LastMarker->absorbDebugValues(*MoreDanglingDPValues, true);
-  MoreDanglingDPValues->eraseFromParent();
+  LastMarker->absorbDebugValues(*MoreDanglingDbgRecords, true);
+  MoreDanglingDbgRecords->eraseFromParent();
 }
 
 void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
@@ -870,15 +875,16 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
   bool InsertAtHead = Dest.getHeadBit();
   bool ReadFromHead = First.getHeadBit();
   // Use this flag to signal the abnormal case, where we don't want to copy the
-  // DPValues ahead of the "Last" position.
+  // DbgRecords ahead of the "Last" position.
   bool ReadFromTail = !Last.getTailBit();
   bool LastIsEnd = (Last == Src->end());
 
   /*
     Here's an illustration of what we're about to do. We have two blocks, this
     and Src, and two segments of list. Each instruction is marked by a capital
-    while potential DPValue debug-info is marked out by "-" characters and a few
-    other special characters (+:=) where I want to highlight what's going on.
+    while potential DbgRecord debug-info is marked out by "-" characters and a
+    few other special characters (+:=) where I want to highlight what's going
+    on.
 
                                                  Dest
                                                    |
@@ -889,18 +895,18 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
 
     The splice method is going to take all the instructions from First up to
     (but not including) Last and insert them in _front_ of Dest, forming one
-    long list. All the DPValues attached to instructions _between_ First and
+    long list. All the DbgRecords attached to instructions _between_ First and
     Last need no maintenence. However, we have to do special things with the
-    DPValues marked with the +:= characters. We only have three positions:
-    should the "+" DPValues be transferred, and if so to where? Do we move the
-    ":" DPValues? Would they go in front of the "=" DPValues, or should the "="
-    DPValues go before "+" DPValues?
+    DbgRecords marked with the +:= characters. We only have three positions:
+    should the "+" DbgRecords be transferred, and if so to where? Do we move the
+    ":" DbgRecords? Would they go in front of the "=" DbgRecords, or should the
+    "=" DbgRecords go before "+" DbgRecords?
 
     We're told which way it should be by the bits carried in the iterators. The
     "Head" bit indicates whether the specified position is supposed to be at the
-    front of the attached DPValues (true) or not (false). The Tail bit is true
-    on the other end of a range: is the range intended to include DPValues up to
-    the end (false) or not (true).
+    front of the attached DbgRecords (true) or not (false). The Tail bit is true
+    on the other end of a range: is the range intended to include DbgRecords up
+    to the end (false) or not (true).
 
     FIXME: the tail bit doesn't need to be distinct from the head bit, we could
     combine them.
@@ -934,15 +940,16 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
 
    */
 
-  // Detach the marker at Dest -- this lets us move the "====" DPValues around.
+  // Detach the marker at Dest -- this lets us move the "====" DbgRecords
+  // around.
   DPMarker *DestMarker = nullptr;
   if (Dest != end()) {
     if ((DestMarker = getMarker(Dest)))
       DestMarker->removeFromParent();
   }
 
-  // If we're moving the tail range of DPValues (":::"), absorb them into the
-  // front of the DPValues at Dest.
+  // If we're moving the tail range of DbgRecords (":::"), absorb them into the
+  // front of the DbgRecords at Dest.
   if (ReadFromTail && Src->getMarker(Last)) {
     DPMarker *FromLast = Src->getMarker(Last);
     if (LastIsEnd) {
@@ -956,7 +963,7 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
     }
   }
 
-  // If we're _not_ reading from the head of First, i.e. the "++++" DPValues,
+  // If we're _not_ reading from the head of First, i.e. the "++++" DbgRecords,
   // move their markers onto Last. They remain in the Src block. No action
   // needed.
   if (!ReadFromHead && First->hasDbgRecords()) {
@@ -970,16 +977,16 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
     }
   }
 
-  // Finally, do something with the "====" DPValues we detached.
+  // Finally, do something with the "====" DbgRecords we detached.
   if (DestMarker) {
     if (InsertAtHead) {
-      // Insert them at the end of the DPValues at Dest. The "::::" DPValues
+      // Insert them at the end of the DbgRecords at Dest. The "::::" DbgRecords
       // might be in front of them.
       DPMarker *NewDestMarker = createMarker(Dest);
       NewDestMarker->absorbDebugValues(*DestMarker, false);
     } else {
       // Insert them right at the start of the range we moved, ahead of First
-      // and the "++++" DPValues.
+      // and the "++++" DbgRecords.
       DPMarker *FirstMarker = createMarker(First);
       FirstMarker->absorbDebugValues(*DestMarker, true);
     }
@@ -990,10 +997,10 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
     // any trailing debug-info at the end of the block would "normally" have
     // been pushed in front of "First". Move it there now.
     DPMarker *FirstMarker = getMarker(First);
-    DPMarker *TrailingDPValues = getTrailingDbgRecords();
-    if (TrailingDPValues) {
-      FirstMarker->absorbDebugValues(*TrailingDPValues, true);
-      TrailingDPValues->eraseFromParent();
+    DPMarker *TrailingDbgRecords = getTrailingDbgRecords();
+    if (TrailingDbgRecords) {
+      FirstMarker->absorbDebugValues(*TrailingDbgRecords, true);
+      TrailingDbgRecords->eraseFromParent();
       deleteTrailingDbgRecords();
     }
   }
@@ -1024,24 +1031,24 @@ void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
   // And move the instructions.
   getInstList().splice(Dest, Src->getInstList(), First, Last);
 
-  flushTerminatorDbgValues();
+  flushTerminatorDbgRecords();
 }
 
-void BasicBlock::insertDbgRecordAfter(DbgRecord *DPV, Instruction *I) {
+void BasicBlock::insertDbgRecordAfter(DbgRecord *DR, Instruction *I) {
   assert(IsNewDbgInfoFormat);
   assert(I->getParent() == this);
 
   iterator NextIt = std::next(I->getIterator());
   DPMarker *NextMarker = createMarker(NextIt);
-  NextMarker->insertDbgRecord(DPV, true);
+  NextMarker->insertDbgRecord(DR, true);
 }
 
-void BasicBlock::insertDbgRecordBefore(DbgRecord *DPV,
+void BasicBlock::insertDbgRecordBefore(DbgRecord *DR,
                                        InstListType::iterator Where) {
   assert(Where == end() || Where->getParent() == this);
   bool InsertAtHead = Where.getHeadBit();
   DPMarker *M = createMarker(Where);
-  M->insertDbgRecord(DPV, InsertAtHead);
+  M->insertDbgRecord(DR, InsertAtHead);
 }
 
 DPMarker *BasicBlock::getNextMarker(Instruction *I) {
@@ -1057,38 +1064,40 @@ DPMarker *BasicBlock::getMarker(InstListType::iterator It) {
 }
 
 void BasicBlock::reinsertInstInDbgRecords(
-    Instruction *I, std::optional<DPValue::self_iterator> Pos) {
+    Instruction *I, std::optional<DbgRecord::self_iterator> Pos) {
   // "I" was originally removed from a position where it was
-  // immediately in front of Pos. Any DPValues on that position then "fell down"
-  // onto Pos. "I" has been re-inserted at the front of that wedge of DPValues,
-  // shuffle them around to represent the original positioning. To illustrate:
+  // immediately in front of Pos. Any DbgRecords on that position then "fell
+  // down" onto Pos. "I" has been re-inserted at the front of that wedge of
+  // DbgRecords, shuffle them around to represent the original positioning. To
+  // illustrate:
   //
   //   Instructions:  I1---I---I0
-  //       DPValues:    DDD DDD
+  //       DbgRecords:    DDD DDD
   //
   // Instruction "I" removed,
   //
   //   Instructions:  I1------I0
-  //       DPValues:    DDDDDD
+  //       DbgRecords:    DDDDDD
   //                       ^Pos
   //
   // Instruction "I" re-inserted (now):
   //
   //   Instructions:  I1---I------I0
-  //       DPValues:        DDDDDD
+  //       DbgRecords:        DDDDDD
   //                           ^Pos
   //
   // After this method completes:
   //
   //   Instructions:  I1---I---I0
-  //       DPValues:    DDD DDD
+  //       DbgRecords:    DDD DDD
 
-  // This happens if there were no DPValues on I0. Are there now DPValues there?
+  // This happens if there were no DbgRecords on I0. Are there now DbgRecords
+  // there?
   if (!Pos) {
     DPMarker *NextMarker = getNextMarker(I);
     if (!NextMarker)
       return;
-    if (NextMarker->StoredDPValues.empty())
+    if (NextMarker->StoredDbgRecords.empty())
       return;
     // There are DPMarkers there now -- they fell down from "I".
     DPMarker *ThisMarker = createMarker(I);
@@ -1096,15 +1105,15 @@ void BasicBlock::reinsertInstInDbgRecords(
     return;
   }
 
-  // Is there even a range of DPValues to move?
+  // Is there even a range of DbgRecords to move?
   DPMarker *DPM = (*Pos)->getMarker();
-  auto Range = make_range(DPM->StoredDPValues.begin(), (*Pos));
+  auto Range = make_range(DPM->StoredDbgRecords.begin(), (*Pos));
   if (Range.begin() == Range.end())
     return;
 
   // Otherwise: splice.
   DPMarker *ThisMarker = createMarker(I);
-  assert(ThisMarker->StoredDPValues.empty());
+  assert(ThisMarker->StoredDbgRecords.empty());
   ThisMarker->absorbDebugValues(Range, *DPM, true);
 }
 
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index ad575966537bf..6dd3dc144fe67 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1154,10 +1154,9 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2) {
                                 GV->getType()->getAddressSpace()))
         return ICmpInst::ICMP_UGT;
     }
-  } else {
+  } else if (auto *CE1 = dyn_cast<ConstantExpr>(V1)) {
     // Ok, the LHS is known to be a constantexpr.  The RHS can be any of a
     // constantexpr, a global, block address, or a simple constant.
-    ConstantExpr *CE1 = cast<ConstantExpr>(V1);
     Constant *CE1Op0 = CE1->getOperand(0);
 
     switch (CE1->getOpcode()) {
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index eff6b8b70bc0f..8ffa0ed66a19f 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -3303,7 +3303,7 @@ Value *ConstantExpr::handleOperandChangeImpl(Value *From, Value *ToV) {
       NewOps, this, From, To, NumUpdated, OperandNo);
 }
 
-Instruction *ConstantExpr::getAsInstruction(Instruction *InsertBefore) const {
+Instruction *ConstantExpr::getAsInstruction() const {
   SmallVector<Value *, 4> ValueOperands(operands());
   ArrayRef<Value*> Ops(ValueOperands);
 
@@ -3322,32 +3322,31 @@ Instruction *ConstantExpr::getAsInstruction(Instruction *InsertBefore) const {
   case Instruction::BitCast:
   case Instruction::AddrSpaceCast:
     return CastInst::Create((Instruction::CastOps)getOpcode(), Ops[0],
-                            getType(), "", InsertBefore);
+                            getType(), "");
   case Instruction::InsertElement:
-    return InsertElementInst::Create(Ops[0], Ops[1], Ops[2], "", InsertBefore);
+    return InsertElementInst::Create(Ops[0], Ops[1], Ops[2], "");
   case Instruction::ExtractElement:
-    return ExtractElementInst::Create(Ops[0], Ops[1], "", InsertBefore);
+    return ExtractElementInst::Create(Ops[0], Ops[1], "");
   case Instruction::ShuffleVector:
-    return new ShuffleVectorInst(Ops[0], Ops[1], getShuffleMask(), "",
-                                 InsertBefore);
+    return new ShuffleVectorInst(Ops[0], Ops[1], getShuffleMask(), "");
 
   case Instruction::GetElementPtr: {
     const auto *GO = cast<GEPOperator>(this);
     if (GO->isInBounds())
-      return GetElementPtrInst::CreateInBounds(
-          GO->getSourceElementType(), Ops[0], Ops.slice(1), "", InsertBefore);
+      return GetElementPtrInst::CreateInBounds(GO->getSourceElementType(),
+                                               Ops[0], Ops.slice(1), "");
     return GetElementPtrInst::Create(GO->getSourceElementType(), Ops[0],
-                                     Ops.slice(1), "", InsertBefore);
+                                     Ops.slice(1), "");
   }
   case Instruction::ICmp:
   case Instruction::FCmp:
     return CmpInst::Create((Instruction::OtherOps)getOpcode(),
                            (CmpInst::Predicate)getPredicate(), Ops[0], Ops[1],
-                           "", InsertBefore);
+                           "");
   default:
     assert(getNumOperands() == 2 && "Must be binary operator?");
     BinaryOperator *BO = BinaryOperator::Create(
-        (Instruction::BinaryOps)getOpcode(), Ops[0], Ops[1], "", InsertBefore);
+        (Instruction::BinaryOps)getOpcode(), Ops[0], Ops[1], "");
     if (isa<OverflowingBinaryOperator>(BO)) {
       BO->setHasNoUnsignedWrap(SubclassOptionalData &
                                OverflowingBinaryOperator::NoUnsignedWrap);
diff --git a/llvm/lib/IR/ConvergenceVerifier.cpp b/llvm/lib/IR/ConvergenceVerifier.cpp
index e73aeaade5f71..6708f85819df7 100644
--- a/llvm/lib/IR/ConvergenceVerifier.cpp
+++ b/llvm/lib/IR/ConvergenceVerifier.cpp
@@ -75,14 +75,14 @@ GenericConvergenceVerifier<SSAContext>::findAndCheckConvergenceTokenUsed(
 
 template <>
 bool GenericConvergenceVerifier<SSAContext>::isInsideConvergentFunction(
-    const InstructionT &I) {
+    const Instruction &I) {
   auto *F = I.getFunction();
   return F->isConvergent();
 }
 
 template <>
 bool GenericConvergenceVerifier<SSAContext>::isConvergent(
-    const InstructionT &I) {
+    const Instruction &I) {
   if (auto *CB = dyn_cast<CallBase>(&I)) {
     return CB->isConvergent();
   }
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 9b4bb3ced8bbe..9e721f963c41e 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -404,6 +404,14 @@ void LLVMAddModuleFlag(LLVMModuleRef M, LLVMModuleFlagBehavior Behavior,
                            {Key, KeyLen}, unwrap(Val));
 }
 
+LLVMBool LLVMIsNewDbgInfoFormat(LLVMModuleRef M) {
+  return unwrap(M)->IsNewDbgInfoFormat;
+}
+
+void LLVMSetIsNewDbgInfoFormat(LLVMModuleRef M, LLVMBool UseNewFormat) {
+  unwrap(M)->setIsNewDbgInfoFormat(UseNewFormat);
+}
+
 /*--.. Printing modules ....................................................--*/
 
 void LLVMDumpModule(LLVMModuleRef M) {
@@ -2412,6 +2420,38 @@ void LLVMSetGC(LLVMValueRef Fn, const char *GC) {
     F->clearGC();
 }
 
+LLVMValueRef LLVMGetPrefixData(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return wrap(F->getPrefixData());
+}
+
+LLVMBool LLVMHasPrefixData(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return F->hasPrefixData();
+}
+
+void LLVMSetPrefixData(LLVMValueRef Fn, LLVMValueRef prefixData) {
+  Function *F = unwrap<Function>(Fn);
+  Constant *prefix = unwrap<Constant>(prefixData);
+  F->setPrefixData(prefix);
+}
+
+LLVMValueRef LLVMGetPrologueData(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return wrap(F->getPrologueData());
+}
+
+LLVMBool LLVMHasPrologueData(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return F->hasPrologueData();
+}
+
+void LLVMSetPrologueData(LLVMValueRef Fn, LLVMValueRef prologueData) {
+  Function *F = unwrap<Function>(Fn);
+  Constant *prologue = unwrap<Constant>(prologueData);
+  F->setPrologueData(prologue);
+}
+
 void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
                              LLVMAttributeRef A) {
   unwrap<Function>(F)->addAttributeAtIndex(Idx, unwrap(A));
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index c673abd8bc30d..f10b5acb980d6 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -296,7 +296,20 @@ DIStringType *DIBuilder::createStringType(StringRef Name,
 
 DIDerivedType *DIBuilder::createQualifiedType(unsigned Tag, DIType *FromTy) {
   return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, FromTy, 0,
-                            0, 0, std::nullopt, DINode::FlagZero);
+                            0, 0, std::nullopt, std::nullopt, DINode::FlagZero);
+}
+
+DIDerivedType *DIBuilder::createPtrAuthQualifiedType(
+    DIType *FromTy, unsigned Key, bool IsAddressDiscriminated,
+    unsigned ExtraDiscriminator, bool IsaPointer,
+    bool AuthenticatesNullValues) {
+  return DIDerivedType::get(VMContext, dwarf::DW_TAG_LLVM_ptrauth_type, "",
+                            nullptr, 0, nullptr, FromTy, 0, 0, 0, std::nullopt,
+                            std::optional<DIDerivedType::PtrAuthData>(
+                                std::in_place, Key, IsAddressDiscriminated,
+                                ExtraDiscriminator, IsaPointer,
+                                AuthenticatesNullValues),
+                            DINode::FlagZero);
 }
 
 DIDerivedType *
@@ -307,8 +320,8 @@ DIBuilder::createPointerType(DIType *PointeeTy, uint64_t SizeInBits,
   // FIXME: Why is there a name here?
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_pointer_type, Name,
                             nullptr, 0, nullptr, PointeeTy, SizeInBits,
-                            AlignInBits, 0, DWARFAddressSpace, DINode::FlagZero,
-                            nullptr, Annotations);
+                            AlignInBits, 0, DWARFAddressSpace, std::nullopt,
+                            DINode::FlagZero, nullptr, Annotations);
 }
 
 DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
@@ -318,7 +331,8 @@ DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
                                                   DINode::DIFlags Flags) {
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_ptr_to_member_type, "",
                             nullptr, 0, nullptr, PointeeTy, SizeInBits,
-                            AlignInBits, 0, std::nullopt, Flags, Base);
+                            AlignInBits, 0, std::nullopt, std::nullopt, Flags,
+                            Base);
 }
 
 DIDerivedType *
@@ -327,7 +341,7 @@ DIBuilder::createReferenceType(unsigned Tag, DIType *RTy, uint64_t SizeInBits,
                                std::optional<unsigned> DWARFAddressSpace) {
   assert(RTy && "Unable to create reference type");
   return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, RTy,
-                            SizeInBits, AlignInBits, 0, DWARFAddressSpace,
+                            SizeInBits, AlignInBits, 0, DWARFAddressSpace, {},
                             DINode::FlagZero);
 }
 
@@ -338,15 +352,16 @@ DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
                                         DINodeArray Annotations) {
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_typedef, Name, File,
                             LineNo, getNonCompileUnitScope(Context), Ty, 0,
-                            AlignInBits, 0, std::nullopt, Flags, nullptr,
-                            Annotations);
+                            AlignInBits, 0, std::nullopt, std::nullopt, Flags,
+                            nullptr, Annotations);
 }
 
 DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) {
   assert(Ty && "Invalid type!");
   assert(FriendTy && "Invalid friend type!");
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_friend, "", nullptr, 0, Ty,
-                            FriendTy, 0, 0, 0, std::nullopt, DINode::FlagZero);
+                            FriendTy, 0, 0, 0, std::nullopt, std::nullopt,
+                            DINode::FlagZero);
 }
 
 DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
@@ -358,7 +373,7 @@ DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
       ConstantInt::get(IntegerType::get(VMContext, 32), VBPtrOffset));
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_inheritance, "", nullptr,
                             0, Ty, BaseTy, 0, 0, BaseOffset, std::nullopt,
-                            Flags, ExtraData);
+                            std::nullopt, Flags, ExtraData);
 }
 
 DIDerivedType *DIBuilder::createMemberType(
@@ -368,7 +383,7 @@ DIDerivedType *DIBuilder::createMemberType(
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
                             LineNumber, getNonCompileUnitScope(Scope), Ty,
                             SizeInBits, AlignInBits, OffsetInBits, std::nullopt,
-                            Flags, nullptr, Annotations);
+                            std::nullopt, Flags, nullptr, Annotations);
 }
 
 static ConstantAsMetadata *getConstantOrNull(Constant *C) {
@@ -381,10 +396,10 @@ DIDerivedType *DIBuilder::createVariantMemberType(
     DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
     uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
     Constant *Discriminant, DINode::DIFlags Flags, DIType *Ty) {
-  return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
-                            LineNumber, getNonCompileUnitScope(Scope), Ty,
-                            SizeInBits, AlignInBits, OffsetInBits, std::nullopt,
-                            Flags, getConstantOrNull(Discriminant));
+  return DIDerivedType::get(
+      VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
+      getNonCompileUnitScope(Scope), Ty, SizeInBits, AlignInBits, OffsetInBits,
+      std::nullopt, std::nullopt, Flags, getConstantOrNull(Discriminant));
 }
 
 DIDerivedType *DIBuilder::createBitFieldMemberType(
@@ -395,7 +410,7 @@ DIDerivedType *DIBuilder::createBitFieldMemberType(
   return DIDerivedType::get(
       VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
       getNonCompileUnitScope(Scope), Ty, SizeInBits, /*AlignInBits=*/0,
-      OffsetInBits, std::nullopt, Flags,
+      OffsetInBits, std::nullopt, std::nullopt, Flags,
       ConstantAsMetadata::get(ConstantInt::get(IntegerType::get(VMContext, 64),
                                                StorageOffsetInBits)),
       Annotations);
@@ -409,7 +424,8 @@ DIBuilder::createStaticMemberType(DIScope *Scope, StringRef Name, DIFile *File,
   Flags |= DINode::FlagStaticMember;
   return DIDerivedType::get(VMContext, Tag, Name, File, LineNumber,
                             getNonCompileUnitScope(Scope), Ty, 0, AlignInBits,
-                            0, std::nullopt, Flags, getConstantOrNull(Val));
+                            0, std::nullopt, std::nullopt, Flags,
+                            getConstantOrNull(Val));
 }
 
 DIDerivedType *
@@ -420,7 +436,7 @@ DIBuilder::createObjCIVar(StringRef Name, DIFile *File, unsigned LineNumber,
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
                             LineNumber, getNonCompileUnitScope(File), Ty,
                             SizeInBits, AlignInBits, OffsetInBits, std::nullopt,
-                            Flags, PropertyNode);
+                            std::nullopt, Flags, PropertyNode);
 }
 
 DIObjCProperty *
@@ -555,10 +571,10 @@ DIDerivedType *DIBuilder::createSetType(DIScope *Scope, StringRef Name,
                                         DIFile *File, unsigned LineNo,
                                         uint64_t SizeInBits,
                                         uint32_t AlignInBits, DIType *Ty) {
-  auto *R =
-      DIDerivedType::get(VMContext, dwarf::DW_TAG_set_type, Name, File, LineNo,
-                         getNonCompileUnitScope(Scope), Ty, SizeInBits,
-                         AlignInBits, 0, std::nullopt, DINode::FlagZero);
+  auto *R = DIDerivedType::get(VMContext, dwarf::DW_TAG_set_type, Name, File,
+                               LineNo, getNonCompileUnitScope(Scope), Ty,
+                               SizeInBits, AlignInBits, 0, std::nullopt,
+                               std::nullopt, DINode::FlagZero);
   trackIfUnresolved(R);
   return R;
 }
@@ -951,14 +967,14 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
   assert(Link && "Linked instruction must have DIAssign metadata attached");
 
   if (M.IsNewDbgInfoFormat) {
-    DPValue *DPV = DPValue::createDPVAssign(Val, SrcVar, ValExpr, Link, Addr,
-                                            AddrExpr, DL);
+    DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
+        Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
     BasicBlock *InsertBB = LinkedInstr->getParent();
     // Insert after LinkedInstr.
     BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
     Instruction *InsertBefore = NextIt == InsertBB->end() ? nullptr : &*NextIt;
-    insertDPValue(DPV, InsertBB, InsertBefore, true);
-    return DPV;
+    insertDbgVariableRecord(DVR, InsertBB, InsertBefore, true);
+    return DVR;
   }
 
   LLVMContext &Ctx = LinkedInstr->getContext();
@@ -1040,9 +1056,10 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(
     llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr,
     const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) {
   if (M.IsNewDbgInfoFormat) {
-    DPValue *DPV = DPValue::createDPValue(Val, VarInfo, Expr, DL);
-    insertDPValue(DPV, InsertBB, InsertBefore);
-    return DPV;
+    DbgVariableRecord *DVR =
+        DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
+    insertDbgVariableRecord(DVR, InsertBB, InsertBefore);
+    return DVR;
   }
 
   if (!ValueFn)
@@ -1062,9 +1079,10 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
          "Expected matching subprograms");
 
   if (M.IsNewDbgInfoFormat) {
-    DPValue *DPV = DPValue::createDPVDeclare(Storage, VarInfo, Expr, DL);
-    insertDPValue(DPV, InsertBB, InsertBefore);
-    return DPV;
+    DbgVariableRecord *DVR =
+        DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
+    insertDbgVariableRecord(DVR, InsertBB, InsertBefore);
+    return DVR;
   }
 
   if (!DeclareFn)
@@ -1081,13 +1099,15 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
   return B.CreateCall(DeclareFn, Args);
 }
 
-void DIBuilder::insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
-                              Instruction *InsertBefore, bool InsertAtHead) {
+void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR,
+                                        BasicBlock *InsertBB,
+                                        Instruction *InsertBefore,
+                                        bool InsertAtHead) {
   assert(InsertBefore || InsertBB);
-  trackIfUnresolved(DPV->getVariable());
-  trackIfUnresolved(DPV->getExpression());
-  if (DPV->isDbgAssign())
-    trackIfUnresolved(DPV->getAddressExpression());
+  trackIfUnresolved(DVR->getVariable());
+  trackIfUnresolved(DVR->getExpression());
+  if (DVR->isDbgAssign())
+    trackIfUnresolved(DVR->getAddressExpression());
 
   BasicBlock::iterator InsertPt;
   if (InsertBB && InsertBefore)
@@ -1095,7 +1115,7 @@ void DIBuilder::insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
   else if (InsertBB)
     InsertPt = InsertBB->end();
   InsertPt.setHeadBit(InsertAtHead);
-  InsertBB->insertDbgRecordBefore(DPV, InsertPt);
+  InsertBB->insertDbgRecordBefore(DVR, InsertPt);
 }
 
 Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn,
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index e63b1e67dad7c..09bce9df1f332 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -63,7 +63,7 @@ TinyPtrVector<DbgDeclareInst *> llvm::findDbgDeclares(Value *V) {
 
   return Declares;
 }
-TinyPtrVector<DPValue *> llvm::findDPVDeclares(Value *V) {
+TinyPtrVector<DbgVariableRecord *> llvm::findDVRDeclares(Value *V) {
   // This function is hot. Check whether the value has any metadata to avoid a
   // DenseMap lookup.
   if (!V->isUsedByMetadata())
@@ -72,18 +72,19 @@ TinyPtrVector<DPValue *> llvm::findDPVDeclares(Value *V) {
   if (!L)
     return {};
 
-  TinyPtrVector<DPValue *> Declares;
-  for (DPValue *DPV : L->getAllDPValueUsers())
-    if (DPV->getType() == DPValue::LocationType::Declare)
-      Declares.push_back(DPV);
+  TinyPtrVector<DbgVariableRecord *> Declares;
+  for (DbgVariableRecord *DVR : L->getAllDbgVariableRecordUsers())
+    if (DVR->getType() == DbgVariableRecord::LocationType::Declare)
+      Declares.push_back(DVR);
 
   return Declares;
 }
 
-template <typename IntrinsicT,
-          DPValue::LocationType Type = DPValue::LocationType::Any>
-static void findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
-                              SmallVectorImpl<DPValue *> *DPValues) {
+template <typename IntrinsicT, DbgVariableRecord::LocationType Type =
+                                   DbgVariableRecord::LocationType::Any>
+static void
+findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
+                  SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
   // This function is hot. Check whether the value has any metadata to avoid a
   // DenseMap lookup.
   if (!V->isUsedByMetadata())
@@ -96,25 +97,27 @@ static void findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
   // V will also appear twice in a dbg.assign if its used in the both the value
   // and address components.
   SmallPtrSet<IntrinsicT *, 4> EncounteredIntrinsics;
-  SmallPtrSet<DPValue *, 4> EncounteredDPValues;
+  SmallPtrSet<DbgVariableRecord *, 4> EncounteredDbgVariableRecords;
 
   /// Append IntrinsicT users of MetadataAsValue(MD).
-  auto AppendUsers = [&Ctx, &EncounteredIntrinsics, &EncounteredDPValues,
-                      &Result, DPValues](Metadata *MD) {
+  auto AppendUsers = [&Ctx, &EncounteredIntrinsics,
+                      &EncounteredDbgVariableRecords, &Result,
+                      DbgVariableRecords](Metadata *MD) {
     if (auto *MDV = MetadataAsValue::getIfExists(Ctx, MD)) {
       for (User *U : MDV->users())
         if (IntrinsicT *DVI = dyn_cast<IntrinsicT>(U))
           if (EncounteredIntrinsics.insert(DVI).second)
             Result.push_back(DVI);
     }
-    if (!DPValues)
+    if (!DbgVariableRecords)
       return;
-    // Get DPValues that use this as a single value.
+    // Get DbgVariableRecords that use this as a single value.
     if (LocalAsMetadata *L = dyn_cast<LocalAsMetadata>(MD)) {
-      for (DPValue *DPV : L->getAllDPValueUsers()) {
-        if (Type == DPValue::LocationType::Any || DPV->getType() == Type)
-          if (EncounteredDPValues.insert(DPV).second)
-            DPValues->push_back(DPV);
+      for (DbgVariableRecord *DVR : L->getAllDbgVariableRecordUsers()) {
+        if (Type == DbgVariableRecord::LocationType::Any ||
+            DVR->getType() == Type)
+          if (EncounteredDbgVariableRecords.insert(DVR).second)
+            DbgVariableRecords->push_back(DVR);
       }
     }
   };
@@ -123,27 +126,30 @@ static void findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
     AppendUsers(L);
     for (Metadata *AL : L->getAllArgListUsers()) {
       AppendUsers(AL);
-      if (!DPValues)
+      if (!DbgVariableRecords)
         continue;
       DIArgList *DI = cast<DIArgList>(AL);
-      for (DPValue *DPV : DI->getAllDPValueUsers())
-        if (Type == DPValue::LocationType::Any || DPV->getType() == Type)
-          if (EncounteredDPValues.insert(DPV).second)
-            DPValues->push_back(DPV);
+      for (DbgVariableRecord *DVR : DI->getAllDbgVariableRecordUsers())
+        if (Type == DbgVariableRecord::LocationType::Any ||
+            DVR->getType() == Type)
+          if (EncounteredDbgVariableRecords.insert(DVR).second)
+            DbgVariableRecords->push_back(DVR);
     }
   }
 }
 
-void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues,
-                         Value *V, SmallVectorImpl<DPValue *> *DPValues) {
-  findDbgIntrinsics<DbgValueInst, DPValue::LocationType::Value>(DbgValues, V,
-                                                                DPValues);
+void llvm::findDbgValues(
+    SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V,
+    SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
+  findDbgIntrinsics<DbgValueInst, DbgVariableRecord::LocationType::Value>(
+      DbgValues, V, DbgVariableRecords);
 }
 
-void llvm::findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers,
-                        Value *V, SmallVectorImpl<DPValue *> *DPValues) {
-  findDbgIntrinsics<DbgVariableIntrinsic, DPValue::LocationType::Any>(
-      DbgUsers, V, DPValues);
+void llvm::findDbgUsers(
+    SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers, Value *V,
+    SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
+  findDbgIntrinsics<DbgVariableIntrinsic, DbgVariableRecord::LocationType::Any>(
+      DbgUsers, V, DbgVariableRecords);
 }
 
 DISubprogram *llvm::getDISubprogram(const MDNode *Scope) {
@@ -164,16 +170,16 @@ DebugLoc llvm::getDebugValueLoc(DbgVariableIntrinsic *DII) {
   return DILocation::get(DII->getContext(), 0, 0, Scope, InlinedAt);
 }
 
-DebugLoc llvm::getDebugValueLoc(DPValue *DPV) {
+DebugLoc llvm::getDebugValueLoc(DbgVariableRecord *DVR) {
   // Original dbg.declare must have a location.
-  const DebugLoc &DeclareLoc = DPV->getDebugLoc();
+  const DebugLoc &DeclareLoc = DVR->getDebugLoc();
   MDNode *Scope = DeclareLoc.getScope();
   DILocation *InlinedAt = DeclareLoc.getInlinedAt();
   // Because no machine insts can come from debug intrinsics, only the scope
   // and inlinedAt is significant. Zero line numbers are used in case this
   // DebugLoc leaks into any adjacent instructions. Produce an unknown location
   // with the correct scope / inlinedAt fields.
-  return DILocation::get(DPV->getContext(), 0, 0, Scope, InlinedAt);
+  return DILocation::get(DVR->getContext(), 0, 0, Scope, InlinedAt);
 }
 
 //===----------------------------------------------------------------------===//
@@ -253,8 +259,8 @@ void DebugInfoFinder::processLocation(const Module &M, const DILocation *Loc) {
 }
 
 void DebugInfoFinder::processDbgRecord(const Module &M, const DbgRecord &DR) {
-  if (const DPValue *DPV = dyn_cast<const DPValue>(&DR))
-    processVariable(M, DPV->getVariable());
+  if (const DbgVariableRecord *DVR = dyn_cast<const DbgVariableRecord>(&DR))
+    processVariable(M, DVR->getVariable());
   processLocation(M, DR.getDebugLoc().get());
 }
 
@@ -895,7 +901,7 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
         if (I.hasMetadataOtherThanDebugLoc())
           I.setMetadata("heapallocsite", nullptr);
 
-        // Strip any DPValues attached.
+        // Strip any DbgRecords attached.
         I.dropDbgRecords();
       }
     }
@@ -1336,9 +1342,9 @@ LLVMMetadataRef LLVMDIBuilderCreatePointerType(
     LLVMDIBuilderRef Builder, LLVMMetadataRef PointeeTy,
     uint64_t SizeInBits, uint32_t AlignInBits, unsigned AddressSpace,
     const char *Name, size_t NameLen) {
-  return wrap(unwrap(Builder)->createPointerType(unwrapDI<DIType>(PointeeTy),
-                                         SizeInBits, AlignInBits,
-                                         AddressSpace, {Name, NameLen}));
+  return wrap(unwrap(Builder)->createPointerType(
+      unwrapDI<DIType>(PointeeTy), SizeInBits, AlignInBits, AddressSpace,
+      {Name, NameLen}));
 }
 
 LLVMMetadataRef LLVMDIBuilderCreateStructType(
@@ -1663,6 +1669,12 @@ LLVMValueRef
 LLVMDIBuilderInsertDeclareBefore(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
                                  LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
                                  LLVMMetadataRef DL, LLVMValueRef Instr) {
+  return LLVMDIBuilderInsertDeclareIntrinsicBefore(Builder, Storage, VarInfo,
+                                                   Expr, DL, Instr);
+}
+LLVMValueRef LLVMDIBuilderInsertDeclareIntrinsicBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMValueRef Instr) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare(
       unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
       unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
@@ -1671,11 +1683,27 @@ LLVMDIBuilderInsertDeclareBefore(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
          "Inserted a DbgRecord into function using old debug info mode");
   return wrap(cast<Instruction *>(DbgInst));
 }
+LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMValueRef Instr) {
+  return wrap(
+      unwrap(Builder)
+          ->insertDeclare(unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
+                          unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
+                          unwrap<Instruction>(Instr))
+          .get<DbgRecord *>());
+}
 
 LLVMValueRef
 LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
                                 LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
                                 LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
+  return LLVMDIBuilderInsertDeclareIntrinsicAtEnd(Builder, Storage, VarInfo,
+                                                  Expr, DL, Block);
+}
+LLVMValueRef LLVMDIBuilderInsertDeclareIntrinsicAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare(
       unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
       unwrap<DIExpression>(Expr), unwrap<DILocation>(DL), unwrap(Block));
@@ -1683,10 +1711,26 @@ LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
          "Inserted a DbgRecord into function using old debug info mode");
   return wrap(cast<Instruction *>(DbgInst));
 }
+LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
+  return wrap(unwrap(Builder)
+                  ->insertDeclare(unwrap(Storage),
+                                  unwrap<DILocalVariable>(VarInfo),
+                                  unwrap<DIExpression>(Expr),
+                                  unwrap<DILocation>(DL), unwrap(Block))
+                  .get<DbgRecord *>());
+}
 
 LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(
     LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
     LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) {
+  return LLVMDIBuilderInsertDbgValueIntrinsicBefore(Builder, Val, VarInfo, Expr,
+                                                    DebugLoc, Instr);
+}
+LLVMValueRef LLVMDIBuilderInsertDbgValueIntrinsicBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
       unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
       unwrap<DILocation>(DebugLoc), unwrap<Instruction>(Instr));
@@ -1694,10 +1738,26 @@ LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(
          "Inserted a DbgRecord into function using old debug info mode");
   return wrap(cast<Instruction *>(DbgInst));
 }
+LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) {
+  return wrap(unwrap(Builder)
+                  ->insertDbgValueIntrinsic(
+                      unwrap(Val), unwrap<DILocalVariable>(VarInfo),
+                      unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
+                      unwrap<Instruction>(Instr))
+                  .get<DbgRecord *>());
+}
 
 LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(
     LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
     LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) {
+  return LLVMDIBuilderInsertDbgValueIntrinsicAtEnd(Builder, Val, VarInfo, Expr,
+                                                   DebugLoc, Block);
+}
+LLVMValueRef LLVMDIBuilderInsertDbgValueIntrinsicAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
       unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
       unwrap<DILocation>(DebugLoc), unwrap(Block));
@@ -1705,6 +1765,16 @@ LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(
          "Inserted a DbgRecord into function using old debug info mode");
   return wrap(cast<Instruction *>(DbgInst));
 }
+LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) {
+  return wrap(unwrap(Builder)
+                  ->insertDbgValueIntrinsic(
+                      unwrap(Val), unwrap<DILocalVariable>(VarInfo),
+                      unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
+                      unwrap(Block))
+                  .get<DbgRecord *>());
+}
 
 LLVMMetadataRef LLVMDIBuilderCreateAutoVariable(
     LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
@@ -1800,14 +1870,14 @@ AssignmentMarkerRange at::getAssignmentMarkers(DIAssignID *ID) {
 
 void at::deleteAssignmentMarkers(const Instruction *Inst) {
   auto Range = getAssignmentMarkers(Inst);
-  SmallVector<DPValue *> DPVAssigns = getDPVAssignmentMarkers(Inst);
-  if (Range.empty() && DPVAssigns.empty())
+  SmallVector<DbgVariableRecord *> DVRAssigns = getDVRAssignmentMarkers(Inst);
+  if (Range.empty() && DVRAssigns.empty())
     return;
   SmallVector<DbgAssignIntrinsic *> ToDelete(Range.begin(), Range.end());
   for (auto *DAI : ToDelete)
     DAI->eraseFromParent();
-  for (auto *DPV : DPVAssigns)
-    DPV->eraseFromParent();
+  for (auto *DVR : DVRAssigns)
+    DVR->eraseFromParent();
 }
 
 void at::RAUW(DIAssignID *Old, DIAssignID *New) {
@@ -1825,12 +1895,12 @@ void at::RAUW(DIAssignID *Old, DIAssignID *New) {
 
 void at::deleteAll(Function *F) {
   SmallVector<DbgAssignIntrinsic *, 12> ToDelete;
-  SmallVector<DPValue *, 12> DPToDelete;
+  SmallVector<DbgVariableRecord *, 12> DPToDelete;
   for (BasicBlock &BB : *F) {
     for (Instruction &I : BB) {
-      for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange()))
-        if (DPV.isDbgAssign())
-          DPToDelete.push_back(&DPV);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+        if (DVR.isDbgAssign())
+          DPToDelete.push_back(&DVR);
       if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
         ToDelete.push_back(DAI);
       else
@@ -1839,20 +1909,20 @@ void at::deleteAll(Function *F) {
   }
   for (auto *DAI : ToDelete)
     DAI->eraseFromParent();
-  for (auto *DPV : DPToDelete)
-    DPV->eraseFromParent();
+  for (auto *DVR : DPToDelete)
+    DVR->eraseFromParent();
 }
 
 /// Get the FragmentInfo for the variable if it exists, otherwise return a
 /// FragmentInfo that covers the entire variable if the variable size is
 /// known, otherwise return a zero-sized fragment.
 static DIExpression::FragmentInfo
-getFragmentOrEntireVariable(const DPValue *DPV) {
+getFragmentOrEntireVariable(const DbgVariableRecord *DVR) {
   DIExpression::FragmentInfo VariableSlice(0, 0);
   // Get the fragment or variable size, or zero.
-  if (auto Sz = DPV->getFragmentSizeInBits())
+  if (auto Sz = DVR->getFragmentSizeInBits())
     VariableSlice.SizeInBits = *Sz;
-  if (auto Frag = DPV->getExpression()->getFragmentInfo())
+  if (auto Frag = DVR->getExpression()->getFragmentInfo())
     VariableSlice.OffsetInBits = Frag->OffsetInBits;
   return VariableSlice;
 }
@@ -2016,10 +2086,10 @@ bool at::calculateFragmentIntersect(
 }
 bool at::calculateFragmentIntersect(
     const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits,
-    uint64_t SliceSizeInBits, const DPValue *DPVAssign,
+    uint64_t SliceSizeInBits, const DbgVariableRecord *DVRAssign,
     std::optional<DIExpression::FragmentInfo> &Result) {
   return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits,
-                                        SliceSizeInBits, DPVAssign, Result);
+                                        SliceSizeInBits, DVRAssign, Result);
 }
 
 /// Collect constant properies (base, size, offset) of \p StoreDest.
@@ -2113,7 +2183,7 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
   DIExpression *AddrExpr =
       DIExpression::get(StoreLikeInst.getContext(), std::nullopt);
   if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) {
-    auto *Assign = DPValue::createLinkedDPVAssign(
+    auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
         &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
     (void)Assign;
     LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
@@ -2231,7 +2301,7 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) {
   // storage" is limited to Allocas). We'll use this to find dbg.declares to
   // delete after running `trackAssignments`.
   DenseMap<const AllocaInst *, SmallPtrSet<DbgDeclareInst *, 2>> DbgDeclares;
-  DenseMap<const AllocaInst *, SmallPtrSet<DPValue *, 2>> DPVDeclares;
+  DenseMap<const AllocaInst *, SmallPtrSet<DbgVariableRecord *, 2>> DVRDeclares;
   // Create another similar map of {storage : variables} that we'll pass to
   // trackAssignments.
   StorageToVarsMap Vars;
@@ -2257,9 +2327,9 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) {
   };
   for (auto &BB : F) {
     for (auto &I : BB) {
-      for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange())) {
-        if (DPV.isDbgDeclare())
-          ProcessDeclare(&DPV, DPVDeclares);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        if (DVR.isDbgDeclare())
+          ProcessDeclare(&DVR, DVRDeclares);
       }
       if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(&I))
         ProcessDeclare(DDI, DbgDeclares);
@@ -2300,8 +2370,8 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) {
   };
   for (auto &P : DbgDeclares)
     DeleteSubsumedDeclare(at::getAssignmentMarkers(P.first), P.second);
-  for (auto &P : DPVDeclares)
-    DeleteSubsumedDeclare(at::getDPVAssignmentMarkers(P.first), P.second);
+  for (auto &P : DVRDeclares)
+    DeleteSubsumedDeclare(at::getDVRAssignmentMarkers(P.first), P.second);
   return Changed;
 }
 
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index 28f96653d815b..570515505607f 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -34,6 +34,10 @@ cl::opt<bool> EnableFSDiscriminator(
     cl::desc("Enable adding flow sensitive discriminators"));
 } // namespace llvm
 
+uint32_t DIType::getAlignInBits() const {
+  return (getTag() == dwarf::DW_TAG_LLVM_ptrauth_type ? 0 : SubclassData32);
+}
+
 const DIExpression::FragmentInfo DebugVariable::DefaultFragment = {
     std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::min()};
 
@@ -42,10 +46,10 @@ DebugVariable::DebugVariable(const DbgVariableIntrinsic *DII)
       Fragment(DII->getExpression()->getFragmentInfo()),
       InlinedAt(DII->getDebugLoc().getInlinedAt()) {}
 
-DebugVariable::DebugVariable(const DPValue *DPV)
-    : Variable(DPV->getVariable()),
-      Fragment(DPV->getExpression()->getFragmentInfo()),
-      InlinedAt(DPV->getDebugLoc().getInlinedAt()) {}
+DebugVariable::DebugVariable(const DbgVariableRecord *DVR)
+    : Variable(DVR->getVariable()),
+      Fragment(DVR->getExpression()->getFragmentInfo()),
+      InlinedAt(DVR->getDebugLoc().getInlinedAt()) {}
 
 DebugVariableAggregate::DebugVariableAggregate(const DbgVariableIntrinsic *DVI)
     : DebugVariable(DVI->getVariable(), std::nullopt,
@@ -731,26 +735,32 @@ Constant *DIDerivedType::getDiscriminantValue() const {
   return nullptr;
 }
 
-DIDerivedType *
-DIDerivedType::getImpl(LLVMContext &Context, unsigned Tag, MDString *Name,
-                       Metadata *File, unsigned Line, Metadata *Scope,
-                       Metadata *BaseType, uint64_t SizeInBits,
-                       uint32_t AlignInBits, uint64_t OffsetInBits,
-                       std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
-                       Metadata *ExtraData, Metadata *Annotations,
-                       StorageType Storage, bool ShouldCreate) {
+DIDerivedType *DIDerivedType::getImpl(
+    LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
+    unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
+    uint32_t AlignInBits, uint64_t OffsetInBits,
+    std::optional<unsigned> DWARFAddressSpace,
+    std::optional<PtrAuthData> PtrAuthData, DIFlags Flags, Metadata *ExtraData,
+    Metadata *Annotations, StorageType Storage, bool ShouldCreate) {
   assert(isCanonical(Name) && "Expected canonical MDString");
   DEFINE_GETIMPL_LOOKUP(DIDerivedType,
                         (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
-                         AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
-                         ExtraData, Annotations));
+                         AlignInBits, OffsetInBits, DWARFAddressSpace,
+                         PtrAuthData, Flags, ExtraData, Annotations));
   Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData, Annotations};
   DEFINE_GETIMPL_STORE(DIDerivedType,
                        (Tag, Line, SizeInBits, AlignInBits, OffsetInBits,
-                        DWARFAddressSpace, Flags),
+                        DWARFAddressSpace, PtrAuthData, Flags),
                        Ops);
 }
 
+std::optional<DIDerivedType::PtrAuthData>
+DIDerivedType::getPtrAuthData() const {
+  return getTag() == dwarf::DW_TAG_LLVM_ptrauth_type
+             ? std::optional<PtrAuthData>(PtrAuthData(SubclassData32))
+             : std::nullopt;
+}
+
 DICompositeType *DICompositeType::getImpl(
     LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
     unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
@@ -1880,11 +1890,12 @@ DIExpression *DIExpression::append(const DIExpression *Expr,
 DIExpression *DIExpression::appendToStack(const DIExpression *Expr,
                                           ArrayRef<uint64_t> Ops) {
   assert(Expr && !Ops.empty() && "Can't append ops to this expression");
-  assert(none_of(Ops,
-                 [](uint64_t Op) {
-                   return Op == dwarf::DW_OP_stack_value ||
-                          Op == dwarf::DW_OP_LLVM_fragment;
-                 }) &&
+  assert(std::none_of(expr_op_iterator(Ops.begin()),
+                      expr_op_iterator(Ops.end()),
+                      [](auto Op) {
+                        return Op.getOp() == dwarf::DW_OP_stack_value ||
+                               Op.getOp() == dwarf::DW_OP_LLVM_fragment;
+                      }) &&
          "Can't append this op");
 
   // Append a DW_OP_deref after Expr's current op list if it's non-empty and
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index 019b00c2e2087..1fb435f46a5fc 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -1,4 +1,4 @@
-//======-- DebugProgramInstruction.cpp - Implement DPValues/DPMarkers --======//
+//=====-- DebugProgramInstruction.cpp - Implement DbgRecords/DPMarkers --=====//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -28,7 +28,7 @@ template class DbgRecordParamRef<DIExpression>;
 template class DbgRecordParamRef<DILabel>;
 template class DbgRecordParamRef<DILocalVariable>;
 
-DPValue::DPValue(const DbgVariableIntrinsic *DVI)
+DbgVariableRecord::DbgVariableRecord(const DbgVariableIntrinsic *DVI)
     : DbgRecord(ValueKind, DVI->getDebugLoc()),
       DebugValueUser({DVI->getRawLocation(), nullptr, nullptr}),
       Variable(DVI->getVariable()), Expression(DVI->getExpression()),
@@ -51,25 +51,27 @@ DPValue::DPValue(const DbgVariableIntrinsic *DVI)
   }
   default:
     llvm_unreachable(
-        "Trying to create a DPValue with an invalid intrinsic type!");
+        "Trying to create a DbgVariableRecord with an invalid intrinsic type!");
   }
 }
 
-DPValue::DPValue(const DPValue &DPV)
-    : DbgRecord(ValueKind, DPV.getDebugLoc()), DebugValueUser(DPV.DebugValues),
-      Type(DPV.getType()), Variable(DPV.getVariable()),
-      Expression(DPV.getExpression()),
-      AddressExpression(DPV.AddressExpression) {}
+DbgVariableRecord::DbgVariableRecord(const DbgVariableRecord &DVR)
+    : DbgRecord(ValueKind, DVR.getDebugLoc()), DebugValueUser(DVR.DebugValues),
+      Type(DVR.getType()), Variable(DVR.getVariable()),
+      Expression(DVR.getExpression()),
+      AddressExpression(DVR.AddressExpression) {}
 
-DPValue::DPValue(Metadata *Location, DILocalVariable *DV, DIExpression *Expr,
-                 const DILocation *DI, LocationType Type)
+DbgVariableRecord::DbgVariableRecord(Metadata *Location, DILocalVariable *DV,
+                                     DIExpression *Expr, const DILocation *DI,
+                                     LocationType Type)
     : DbgRecord(ValueKind, DI), DebugValueUser({Location, nullptr, nullptr}),
       Type(Type), Variable(DV), Expression(Expr) {}
 
-DPValue::DPValue(Metadata *Value, DILocalVariable *Variable,
-                 DIExpression *Expression, DIAssignID *AssignID,
-                 Metadata *Address, DIExpression *AddressExpression,
-                 const DILocation *DI)
+DbgVariableRecord::DbgVariableRecord(Metadata *Value, DILocalVariable *Variable,
+                                     DIExpression *Expression,
+                                     DIAssignID *AssignID, Metadata *Address,
+                                     DIExpression *AddressExpression,
+                                     const DILocation *DI)
     : DbgRecord(ValueKind, DI), DebugValueUser({Value, Address, AssignID}),
       Type(LocationType::Assign), Variable(Variable), Expression(Expression),
       AddressExpression(AddressExpression) {}
@@ -77,7 +79,7 @@ DPValue::DPValue(Metadata *Value, DILocalVariable *Variable,
 void DbgRecord::deleteRecord() {
   switch (RecordKind) {
   case ValueKind:
-    delete cast<DPValue>(this);
+    delete cast<DbgVariableRecord>(this);
     return;
   case LabelKind:
     delete cast<DPLabel>(this);
@@ -89,7 +91,7 @@ void DbgRecord::deleteRecord() {
 void DbgRecord::print(raw_ostream &O, bool IsForDebug) const {
   switch (RecordKind) {
   case ValueKind:
-    cast<DPValue>(this)->print(O, IsForDebug);
+    cast<DbgVariableRecord>(this)->print(O, IsForDebug);
     return;
   case LabelKind:
     cast<DPLabel>(this)->print(O, IsForDebug);
@@ -102,7 +104,7 @@ void DbgRecord::print(raw_ostream &O, ModuleSlotTracker &MST,
                       bool IsForDebug) const {
   switch (RecordKind) {
   case ValueKind:
-    cast<DPValue>(this)->print(O, MST, IsForDebug);
+    cast<DbgVariableRecord>(this)->print(O, MST, IsForDebug);
     return;
   case LabelKind:
     cast<DPLabel>(this)->print(O, MST, IsForDebug);
@@ -116,7 +118,8 @@ bool DbgRecord::isIdenticalToWhenDefined(const DbgRecord &R) const {
     return false;
   switch (RecordKind) {
   case ValueKind:
-    return cast<DPValue>(this)->isIdenticalToWhenDefined(*cast<DPValue>(&R));
+    return cast<DbgVariableRecord>(this)->isIdenticalToWhenDefined(
+        *cast<DbgVariableRecord>(&R));
   case LabelKind:
     return cast<DPLabel>(this)->getLabel() == cast<DPLabel>(R).getLabel();
   };
@@ -131,7 +134,7 @@ DbgInfoIntrinsic *
 DbgRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
   switch (RecordKind) {
   case ValueKind:
-    return cast<DPValue>(this)->createDebugIntrinsic(M, InsertBefore);
+    return cast<DbgVariableRecord>(this)->createDebugIntrinsic(M, InsertBefore);
   case LabelKind:
     return cast<DPLabel>(this)->createDebugIntrinsic(M, InsertBefore);
   };
@@ -153,79 +156,83 @@ DPLabel *DPLabel::createUnresolvedDPLabel(MDNode *Label, MDNode *DL) {
   return new DPLabel(Label, DL);
 }
 
-DPValue::DPValue(DPValue::LocationType Type, Metadata *Val, MDNode *Variable,
-                 MDNode *Expression, MDNode *AssignID, Metadata *Address,
-                 MDNode *AddressExpression, MDNode *DI)
+DbgVariableRecord::DbgVariableRecord(DbgVariableRecord::LocationType Type,
+                                     Metadata *Val, MDNode *Variable,
+                                     MDNode *Expression, MDNode *AssignID,
+                                     Metadata *Address,
+                                     MDNode *AddressExpression, MDNode *DI)
     : DbgRecord(ValueKind, DebugLoc(DI)),
       DebugValueUser({Val, Address, AssignID}), Type(Type), Variable(Variable),
       Expression(Expression), AddressExpression(AddressExpression) {}
 
-DPValue *DPValue::createUnresolvedDPValue(DPValue::LocationType Type,
-                                          Metadata *Val, MDNode *Variable,
-                                          MDNode *Expression, MDNode *AssignID,
-                                          Metadata *Address,
-                                          MDNode *AddressExpression,
-                                          MDNode *DI) {
-  return new DPValue(Type, Val, Variable, Expression, AssignID, Address,
-                     AddressExpression, DI);
+DbgVariableRecord *DbgVariableRecord::createUnresolvedDbgVariableRecord(
+    DbgVariableRecord::LocationType Type, Metadata *Val, MDNode *Variable,
+    MDNode *Expression, MDNode *AssignID, Metadata *Address,
+    MDNode *AddressExpression, MDNode *DI) {
+  return new DbgVariableRecord(Type, Val, Variable, Expression, AssignID,
+                               Address, AddressExpression, DI);
 }
 
-DPValue *DPValue::createDPValue(Value *Location, DILocalVariable *DV,
-                                DIExpression *Expr, const DILocation *DI) {
-  return new DPValue(ValueAsMetadata::get(Location), DV, Expr, DI,
-                     LocationType::Value);
+DbgVariableRecord *
+DbgVariableRecord::createDbgVariableRecord(Value *Location, DILocalVariable *DV,
+                                           DIExpression *Expr,
+                                           const DILocation *DI) {
+  return new DbgVariableRecord(ValueAsMetadata::get(Location), DV, Expr, DI,
+                               LocationType::Value);
 }
 
-DPValue *DPValue::createDPValue(Value *Location, DILocalVariable *DV,
-                                DIExpression *Expr, const DILocation *DI,
-                                DPValue &InsertBefore) {
-  auto *NewDPValue = createDPValue(Location, DV, Expr, DI);
-  NewDPValue->insertBefore(&InsertBefore);
-  return NewDPValue;
+DbgVariableRecord *DbgVariableRecord::createDbgVariableRecord(
+    Value *Location, DILocalVariable *DV, DIExpression *Expr,
+    const DILocation *DI, DbgVariableRecord &InsertBefore) {
+  auto *NewDbgVariableRecord = createDbgVariableRecord(Location, DV, Expr, DI);
+  NewDbgVariableRecord->insertBefore(&InsertBefore);
+  return NewDbgVariableRecord;
 }
 
-DPValue *DPValue::createDPVDeclare(Value *Address, DILocalVariable *DV,
-                                   DIExpression *Expr, const DILocation *DI) {
-  return new DPValue(ValueAsMetadata::get(Address), DV, Expr, DI,
-                     LocationType::Declare);
+DbgVariableRecord *DbgVariableRecord::createDVRDeclare(Value *Address,
+                                                       DILocalVariable *DV,
+                                                       DIExpression *Expr,
+                                                       const DILocation *DI) {
+  return new DbgVariableRecord(ValueAsMetadata::get(Address), DV, Expr, DI,
+                               LocationType::Declare);
 }
 
-DPValue *DPValue::createDPVDeclare(Value *Address, DILocalVariable *DV,
-                                   DIExpression *Expr, const DILocation *DI,
-                                   DPValue &InsertBefore) {
-  auto *NewDPVDeclare = createDPVDeclare(Address, DV, Expr, DI);
-  NewDPVDeclare->insertBefore(&InsertBefore);
-  return NewDPVDeclare;
+DbgVariableRecord *
+DbgVariableRecord::createDVRDeclare(Value *Address, DILocalVariable *DV,
+                                    DIExpression *Expr, const DILocation *DI,
+                                    DbgVariableRecord &InsertBefore) {
+  auto *NewDVRDeclare = createDVRDeclare(Address, DV, Expr, DI);
+  NewDVRDeclare->insertBefore(&InsertBefore);
+  return NewDVRDeclare;
 }
 
-DPValue *DPValue::createDPVAssign(Value *Val, DILocalVariable *Variable,
-                                  DIExpression *Expression,
-                                  DIAssignID *AssignID, Value *Address,
-                                  DIExpression *AddressExpression,
-                                  const DILocation *DI) {
-  return new DPValue(ValueAsMetadata::get(Val), Variable, Expression, AssignID,
-                     ValueAsMetadata::get(Address), AddressExpression, DI);
+DbgVariableRecord *DbgVariableRecord::createDVRAssign(
+    Value *Val, DILocalVariable *Variable, DIExpression *Expression,
+    DIAssignID *AssignID, Value *Address, DIExpression *AddressExpression,
+    const DILocation *DI) {
+  return new DbgVariableRecord(ValueAsMetadata::get(Val), Variable, Expression,
+                               AssignID, ValueAsMetadata::get(Address),
+                               AddressExpression, DI);
 }
 
-DPValue *DPValue::createLinkedDPVAssign(Instruction *LinkedInstr, Value *Val,
-                                        DILocalVariable *Variable,
-                                        DIExpression *Expression,
-                                        Value *Address,
-                                        DIExpression *AddressExpression,
-                                        const DILocation *DI) {
+DbgVariableRecord *DbgVariableRecord::createLinkedDVRAssign(
+    Instruction *LinkedInstr, Value *Val, DILocalVariable *Variable,
+    DIExpression *Expression, Value *Address, DIExpression *AddressExpression,
+    const DILocation *DI) {
   auto *Link = LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID);
   assert(Link && "Linked instruction must have DIAssign metadata attached");
-  auto *NewDPVAssign = DPValue::createDPVAssign(Val, Variable, Expression,
-                                                cast<DIAssignID>(Link), Address,
-                                                AddressExpression, DI);
-  LinkedInstr->getParent()->insertDbgRecordAfter(NewDPVAssign, LinkedInstr);
-  return NewDPVAssign;
+  auto *NewDVRAssign = DbgVariableRecord::createDVRAssign(
+      Val, Variable, Expression, cast<DIAssignID>(Link), Address,
+      AddressExpression, DI);
+  LinkedInstr->getParent()->insertDbgRecordAfter(NewDVRAssign, LinkedInstr);
+  return NewDVRAssign;
 }
 
-iterator_range<DPValue::location_op_iterator> DPValue::location_ops() const {
+iterator_range<DbgVariableRecord::location_op_iterator>
+DbgVariableRecord::location_ops() const {
   auto *MD = getRawLocation();
-  // If a Value has been deleted, the "location" for this DPValue will be
-  // replaced by nullptr. Return an empty range.
+  // If a Value has been deleted, the "location" for this DbgVariableRecord will
+  // be replaced by nullptr. Return an empty range.
   if (!MD)
     return {location_op_iterator(static_cast<ValueAsMetadata *>(nullptr)),
             location_op_iterator(static_cast<ValueAsMetadata *>(nullptr))};
@@ -245,13 +252,13 @@ iterator_range<DPValue::location_op_iterator> DPValue::location_ops() const {
           location_op_iterator(static_cast<ValueAsMetadata *>(nullptr))};
 }
 
-unsigned DPValue::getNumVariableLocationOps() const {
+unsigned DbgVariableRecord::getNumVariableLocationOps() const {
   if (hasArgList())
     return cast<DIArgList>(getRawLocation())->getArgs().size();
   return 1;
 }
 
-Value *DPValue::getVariableLocationOp(unsigned OpIdx) const {
+Value *DbgVariableRecord::getVariableLocationOp(unsigned OpIdx) const {
   auto *MD = getRawLocation();
   if (!MD)
     return nullptr;
@@ -261,7 +268,7 @@ Value *DPValue::getVariableLocationOp(unsigned OpIdx) const {
   if (isa<MDNode>(MD))
     return nullptr;
   assert(isa<ValueAsMetadata>(MD) &&
-         "Attempted to get location operand from DPValue with none.");
+         "Attempted to get location operand from DbgVariableRecord with none.");
   auto *V = cast<ValueAsMetadata>(MD);
   assert(OpIdx == 0 && "Operand Index must be 0 for a debug intrinsic with a "
                        "single location operand.");
@@ -274,8 +281,9 @@ static ValueAsMetadata *getAsMetadata(Value *V) {
                                  : ValueAsMetadata::get(V);
 }
 
-void DPValue::replaceVariableLocationOp(Value *OldValue, Value *NewValue,
-                                        bool AllowEmpty) {
+void DbgVariableRecord::replaceVariableLocationOp(Value *OldValue,
+                                                  Value *NewValue,
+                                                  bool AllowEmpty) {
   assert(NewValue && "Values must be non-null");
 
   bool DbgAssignAddrReplaced = isDbgAssign() && OldValue == getAddress();
@@ -307,7 +315,8 @@ void DPValue::replaceVariableLocationOp(Value *OldValue, Value *NewValue,
   setRawLocation(DIArgList::get(getVariableLocationOp(0)->getContext(), MDs));
 }
 
-void DPValue::replaceVariableLocationOp(unsigned OpIdx, Value *NewValue) {
+void DbgVariableRecord::replaceVariableLocationOp(unsigned OpIdx,
+                                                  Value *NewValue) {
   assert(OpIdx < getNumVariableLocationOps() && "Invalid Operand Index");
 
   if (!hasArgList()) {
@@ -326,8 +335,8 @@ void DPValue::replaceVariableLocationOp(unsigned OpIdx, Value *NewValue) {
   setRawLocation(DIArgList::get(getVariableLocationOp(0)->getContext(), MDs));
 }
 
-void DPValue::addVariableLocationOps(ArrayRef<Value *> NewValues,
-                                     DIExpression *NewExpr) {
+void DbgVariableRecord::addVariableLocationOps(ArrayRef<Value *> NewValues,
+                                               DIExpression *NewExpr) {
   assert(NewExpr->hasAllLocationOps(getNumVariableLocationOps() +
                                     NewValues.size()) &&
          "NewExpr for debug variable intrinsic does not reference every "
@@ -342,7 +351,7 @@ void DPValue::addVariableLocationOps(ArrayRef<Value *> NewValues,
   setRawLocation(DIArgList::get(getVariableLocationOp(0)->getContext(), MDs));
 }
 
-void DPValue::setKillLocation() {
+void DbgVariableRecord::setKillLocation() {
   // TODO: When/if we remove duplicate values from DIArgLists, we don't need
   // this set anymore.
   SmallPtrSet<Value *, 4> RemovedValues;
@@ -354,13 +363,13 @@ void DPValue::setKillLocation() {
   }
 }
 
-bool DPValue::isKillLocation() const {
+bool DbgVariableRecord::isKillLocation() const {
   return (getNumVariableLocationOps() == 0 &&
           !getExpression()->isComplex()) ||
          any_of(location_ops(), [](Value *V) { return isa<UndefValue>(V); });
 }
 
-std::optional<uint64_t> DPValue::getFragmentSizeInBits() const {
+std::optional<uint64_t> DbgVariableRecord::getFragmentSizeInBits() const {
   if (auto Fragment = getExpression()->getFragmentInfo())
     return Fragment->SizeInBits;
   return getVariable()->getSizeInBits();
@@ -369,21 +378,24 @@ std::optional<uint64_t> DPValue::getFragmentSizeInBits() const {
 DbgRecord *DbgRecord::clone() const {
   switch (RecordKind) {
   case ValueKind:
-    return cast<DPValue>(this)->clone();
+    return cast<DbgVariableRecord>(this)->clone();
   case LabelKind:
     return cast<DPLabel>(this)->clone();
   };
   llvm_unreachable("unsupported DbgRecord kind");
 }
 
-DPValue *DPValue::clone() const { return new DPValue(*this); }
+DbgVariableRecord *DbgVariableRecord::clone() const {
+  return new DbgVariableRecord(*this);
+}
 
 DPLabel *DPLabel::clone() const {
   return new DPLabel(getLabel(), getDebugLoc());
 }
 
 DbgVariableIntrinsic *
-DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
+DbgVariableRecord::createDebugIntrinsic(Module *M,
+                                        Instruction *InsertBefore) const {
   [[maybe_unused]] DICompileUnit *Unit =
       getDebugLoc().get()->getScope()->getSubprogram()->getUnit();
   assert(M && Unit &&
@@ -394,24 +406,25 @@ DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
 
   // Work out what sort of intrinsic we're going to produce.
   switch (getType()) {
-  case DPValue::LocationType::Declare:
+  case DbgVariableRecord::LocationType::Declare:
     IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare);
     break;
-  case DPValue::LocationType::Value:
+  case DbgVariableRecord::LocationType::Value:
     IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_value);
     break;
-  case DPValue::LocationType::Assign:
+  case DbgVariableRecord::LocationType::Assign:
     IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign);
     break;
-  case DPValue::LocationType::End:
-  case DPValue::LocationType::Any:
+  case DbgVariableRecord::LocationType::End:
+  case DbgVariableRecord::LocationType::Any:
     llvm_unreachable("Invalid LocationType");
   }
 
-  // Create the intrinsic from this DPValue's information, optionally insert
-  // into the target location.
+  // Create the intrinsic from this DbgVariableRecord's information, optionally
+  // insert into the target location.
   DbgVariableIntrinsic *DVI;
-  assert(getRawLocation() && "DPValue's RawLocation should be non-null.");
+  assert(getRawLocation() &&
+         "DbgVariableRecord's RawLocation should be non-null.");
   if (isDbgAssign()) {
     Value *AssignArgs[] = {
         MetadataAsValue::get(Context, getRawLocation()),
@@ -451,7 +464,7 @@ DbgLabelInst *DPLabel::createDebugIntrinsic(Module *M,
   return DbgLabel;
 }
 
-Value *DPValue::getAddress() const {
+Value *DbgVariableRecord::getAddress() const {
   auto *MD = getRawAddress();
   if (auto *V = dyn_cast<ValueAsMetadata>(MD))
     return V->getValue();
@@ -461,18 +474,20 @@ Value *DPValue::getAddress() const {
   return nullptr;
 }
 
-DIAssignID *DPValue::getAssignID() const {
+DIAssignID *DbgVariableRecord::getAssignID() const {
   return cast<DIAssignID>(DebugValues[2]);
 }
 
-void DPValue::setAssignId(DIAssignID *New) { resetDebugValue(2, New); }
+void DbgVariableRecord::setAssignId(DIAssignID *New) {
+  resetDebugValue(2, New);
+}
 
-void DPValue::setKillAddress() {
+void DbgVariableRecord::setKillAddress() {
   resetDebugValue(
       1, ValueAsMetadata::get(UndefValue::get(getAddress()->getType())));
 }
 
-bool DPValue::isKillAddress() const {
+bool DbgVariableRecord::isKillAddress() const {
   Value *Addr = getAddress();
   return !Addr || isa<UndefValue>(Addr);
 }
@@ -541,21 +556,21 @@ void DbgRecord::moveAfter(DbgRecord *MoveAfter) {
 ///////////////////////////////////////////////////////////////////////////////
 
 // An empty, global, DPMarker for the purpose of describing empty ranges of
-// DPValues.
+// DbgRecords.
 DPMarker DPMarker::EmptyDPMarker;
 
 void DPMarker::dropDbgRecords() {
-  while (!StoredDPValues.empty()) {
-    auto It = StoredDPValues.begin();
+  while (!StoredDbgRecords.empty()) {
+    auto It = StoredDbgRecords.begin();
     DbgRecord *DR = &*It;
-    StoredDPValues.erase(It);
+    StoredDbgRecords.erase(It);
     DR->deleteRecord();
   }
 }
 
 void DPMarker::dropOneDbgRecord(DbgRecord *DR) {
   assert(DR->getMarker() == this);
-  StoredDPValues.erase(DR->getIterator());
+  StoredDbgRecords.erase(DR->getIterator());
   DR->deleteRecord();
 }
 
@@ -566,15 +581,15 @@ const BasicBlock *DPMarker::getParent() const {
 BasicBlock *DPMarker::getParent() { return MarkedInstr->getParent(); }
 
 void DPMarker::removeMarker() {
-  // Are there any DPValues in this DPMarker? If not, nothing to preserve.
+  // Are there any DbgRecords in this DPMarker? If not, nothing to preserve.
   Instruction *Owner = MarkedInstr;
-  if (StoredDPValues.empty()) {
+  if (StoredDbgRecords.empty()) {
     eraseFromParent();
     Owner->DbgMarker = nullptr;
     return;
   }
 
-  // The attached DPValues need to be preserved; attach them to the next
+  // The attached DbgRecords need to be preserved; attach them to the next
   // instruction. If there isn't a next instruction, put them on the
   // "trailing" list.
   DPMarker *NextMarker = Owner->getParent()->getNextMarker(Owner);
@@ -610,15 +625,15 @@ void DPMarker::eraseFromParent() {
 }
 
 iterator_range<DbgRecord::self_iterator> DPMarker::getDbgRecordRange() {
-  return make_range(StoredDPValues.begin(), StoredDPValues.end());
+  return make_range(StoredDbgRecords.begin(), StoredDbgRecords.end());
 }
 iterator_range<DbgRecord::const_self_iterator>
 DPMarker::getDbgRecordRange() const {
-  return make_range(StoredDPValues.begin(), StoredDPValues.end());
+  return make_range(StoredDbgRecords.begin(), StoredDbgRecords.end());
 }
 
 void DbgRecord::removeFromParent() {
-  getMarker()->StoredDPValues.erase(getIterator());
+  getMarker()->StoredDbgRecords.erase(getIterator());
   Marker = nullptr;
 }
 
@@ -628,29 +643,29 @@ void DbgRecord::eraseFromParent() {
 }
 
 void DPMarker::insertDbgRecord(DbgRecord *New, bool InsertAtHead) {
-  auto It = InsertAtHead ? StoredDPValues.begin() : StoredDPValues.end();
-  StoredDPValues.insert(It, *New);
+  auto It = InsertAtHead ? StoredDbgRecords.begin() : StoredDbgRecords.end();
+  StoredDbgRecords.insert(It, *New);
   New->setMarker(this);
 }
 void DPMarker::insertDbgRecord(DbgRecord *New, DbgRecord *InsertBefore) {
   assert(InsertBefore->getMarker() == this &&
-         "DPValue 'InsertBefore' must be contained in this DPMarker!");
-  StoredDPValues.insert(InsertBefore->getIterator(), *New);
+         "DbgRecord 'InsertBefore' must be contained in this DPMarker!");
+  StoredDbgRecords.insert(InsertBefore->getIterator(), *New);
   New->setMarker(this);
 }
 void DPMarker::insertDbgRecordAfter(DbgRecord *New, DbgRecord *InsertAfter) {
   assert(InsertAfter->getMarker() == this &&
-         "DPValue 'InsertAfter' must be contained in this DPMarker!");
-  StoredDPValues.insert(++(InsertAfter->getIterator()), *New);
+         "DbgRecord 'InsertAfter' must be contained in this DPMarker!");
+  StoredDbgRecords.insert(++(InsertAfter->getIterator()), *New);
   New->setMarker(this);
 }
 
 void DPMarker::absorbDebugValues(DPMarker &Src, bool InsertAtHead) {
-  auto It = InsertAtHead ? StoredDPValues.begin() : StoredDPValues.end();
-  for (DbgRecord &DPV : Src.StoredDPValues)
-    DPV.setMarker(this);
+  auto It = InsertAtHead ? StoredDbgRecords.begin() : StoredDbgRecords.end();
+  for (DbgRecord &DVR : Src.StoredDbgRecords)
+    DVR.setMarker(this);
 
-  StoredDPValues.splice(It, Src.StoredDPValues);
+  StoredDbgRecords.splice(It, Src.StoredDbgRecords);
 }
 
 void DPMarker::absorbDebugValues(iterator_range<DbgRecord::self_iterator> Range,
@@ -659,45 +674,45 @@ void DPMarker::absorbDebugValues(iterator_range<DbgRecord::self_iterator> Range,
     DR.setMarker(this);
 
   auto InsertPos =
-      (InsertAtHead) ? StoredDPValues.begin() : StoredDPValues.end();
+      (InsertAtHead) ? StoredDbgRecords.begin() : StoredDbgRecords.end();
 
-  StoredDPValues.splice(InsertPos, Src.StoredDPValues, Range.begin(),
-                        Range.end());
+  StoredDbgRecords.splice(InsertPos, Src.StoredDbgRecords, Range.begin(),
+                          Range.end());
 }
 
 iterator_range<simple_ilist<DbgRecord>::iterator> DPMarker::cloneDebugInfoFrom(
     DPMarker *From, std::optional<simple_ilist<DbgRecord>::iterator> from_here,
     bool InsertAtHead) {
   DbgRecord *First = nullptr;
-  // Work out what range of DPValues to clone: normally all the contents of the
-  // "From" marker, optionally we can start from the from_here position down to
-  // end().
+  // Work out what range of DbgRecords to clone: normally all the contents of
+  // the "From" marker, optionally we can start from the from_here position down
+  // to end().
   auto Range =
-      make_range(From->StoredDPValues.begin(), From->StoredDPValues.end());
+      make_range(From->StoredDbgRecords.begin(), From->StoredDbgRecords.end());
   if (from_here.has_value())
-    Range = make_range(*from_here, From->StoredDPValues.end());
+    Range = make_range(*from_here, From->StoredDbgRecords.end());
 
-  // Clone each DPValue and insert into StoreDPValues; optionally place them at
-  // the start or the end of the list.
-  auto Pos = (InsertAtHead) ? StoredDPValues.begin() : StoredDPValues.end();
+  // Clone each DbgVariableRecord and insert into StoreDbgVariableRecords;
+  // optionally place them at the start or the end of the list.
+  auto Pos = (InsertAtHead) ? StoredDbgRecords.begin() : StoredDbgRecords.end();
   for (DbgRecord &DR : Range) {
     DbgRecord *New = DR.clone();
     New->setMarker(this);
-    StoredDPValues.insert(Pos, *New);
+    StoredDbgRecords.insert(Pos, *New);
     if (!First)
       First = New;
   }
 
   if (!First)
-    return {StoredDPValues.end(), StoredDPValues.end()};
+    return {StoredDbgRecords.end(), StoredDbgRecords.end()};
 
   if (InsertAtHead)
     // If InsertAtHead is set, we cloned a range onto the front of of the
-    // StoredDPValues collection, return that range.
-    return {StoredDPValues.begin(), Pos};
+    // StoredDbgRecords collection, return that range.
+    return {StoredDbgRecords.begin(), Pos};
   else
     // We inserted a block at the end, return that range.
-    return {First->getIterator(), StoredDPValues.end()};
+    return {First->getIterator(), StoredDbgRecords.end()};
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 13996a4a82150..f512c96cc9ce2 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -920,12 +920,14 @@ CallInst *IRBuilderBase::CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
   return createCallHelper(Fn, {V}, Name, FMFSource);
 }
 
-CallInst *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS,
-                                               Value *RHS,
-                                               Instruction *FMFSource,
-                                               const Twine &Name) {
+Value *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS,
+                                            Value *RHS, Instruction *FMFSource,
+                                            const Twine &Name) {
   Module *M = BB->getModule();
   Function *Fn = Intrinsic::getDeclaration(M, ID, { LHS->getType() });
+  if (Value *V = Folder.FoldBinaryIntrinsic(ID, LHS, RHS, Fn->getReturnType(),
+                                            FMFSource))
+    return V;
   return createCallHelper(Fn, {LHS, RHS}, Name, FMFSource);
 }
 
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index e0892398f4344..9744eb32d27a5 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -143,8 +143,8 @@ void Instruction::insertBefore(BasicBlock &BB,
     return;
 
   // We've inserted "this": if InsertAtHead is set then it comes before any
-  // DPValues attached to InsertPos. But if it's not set, then any DPValues
-  // should now come before "this".
+  // DbgVariableRecords attached to InsertPos. But if it's not set, then any
+  // DbgRecords should now come before "this".
   bool InsertAtHead = InsertPos.getHeadBit();
   if (!InsertAtHead) {
     DPMarker *SrcMarker = BB.getMarker(InsertPos);
@@ -166,10 +166,10 @@ void Instruction::insertBefore(BasicBlock &BB,
   }
 
   // If we're inserting a terminator, check if we need to flush out
-  // TrailingDPValues. Inserting instructions at the end of an incomplete
+  // TrailingDbgRecords. Inserting instructions at the end of an incomplete
   // block is handled by the code block above.
   if (isTerminator())
-    getParent()->flushTerminatorDbgValues();
+    getParent()->flushTerminatorDbgRecords();
 }
 
 /// Unlink this instruction from its current basic block and insert it into the
@@ -212,12 +212,12 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
   assert(I == BB.end() || I->getParent() == &BB);
   bool InsertAtHead = I.getHeadBit();
 
-  // If we've been given the "Preserve" flag, then just move the DPValues with
+  // If we've been given the "Preserve" flag, then just move the DbgRecords with
   // the instruction, no more special handling needed.
   if (BB.IsNewDbgInfoFormat && DbgMarker && !Preserve) {
     if (I != this->getIterator() || InsertAtHead) {
       // "this" is definitely moving in the list, or it's moving ahead of its
-      // attached DPValues. Detach any existing DPValues.
+      // attached DbgVariableRecords. Detach any existing DbgRecords.
       handleMarkerRemoval();
     }
   }
@@ -229,15 +229,15 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
   if (BB.IsNewDbgInfoFormat && !Preserve) {
     DPMarker *NextMarker = getParent()->getNextMarker(this);
 
-    // If we're inserting at point I, and not in front of the DPValues attached
-    // there, then we should absorb the DPValues attached to I.
+    // If we're inserting at point I, and not in front of the DbgRecords
+    // attached there, then we should absorb the DbgRecords attached to I.
     if (!InsertAtHead && NextMarker && !NextMarker->empty()) {
       adoptDbgRecords(&BB, I, false);
     }
   }
 
   if (isTerminator())
-    getParent()->flushTerminatorDbgValues();
+    getParent()->flushTerminatorDbgRecords();
 }
 
 iterator_range<DbgRecord::self_iterator> Instruction::cloneDebugInfoFrom(
@@ -263,11 +263,11 @@ Instruction::getDbgReinsertionPosition() {
   if (!NextMarker)
     return std::nullopt;
 
-  // Are there any DPValues in the next marker?
-  if (NextMarker->StoredDPValues.empty())
+  // Are there any DbgRecords in the next marker?
+  if (NextMarker->StoredDbgRecords.empty())
     return std::nullopt;
 
-  return NextMarker->StoredDPValues.begin();
+  return NextMarker->StoredDbgRecords.begin();
 }
 
 bool Instruction::hasDbgRecords() const { return !getDbgRecordRange().empty(); }
@@ -275,20 +275,20 @@ bool Instruction::hasDbgRecords() const { return !getDbgRecordRange().empty(); }
 void Instruction::adoptDbgRecords(BasicBlock *BB, BasicBlock::iterator It,
                                   bool InsertAtHead) {
   DPMarker *SrcMarker = BB->getMarker(It);
-  auto ReleaseTrailingDPValues = [BB, It, SrcMarker]() {
+  auto ReleaseTrailingDbgRecords = [BB, It, SrcMarker]() {
     if (BB->end() == It) {
       SrcMarker->eraseFromParent();
       BB->deleteTrailingDbgRecords();
     }
   };
 
-  if (!SrcMarker || SrcMarker->StoredDPValues.empty()) {
-    ReleaseTrailingDPValues();
+  if (!SrcMarker || SrcMarker->StoredDbgRecords.empty()) {
+    ReleaseTrailingDbgRecords();
     return;
   }
 
   // If we have DPMarkers attached to this instruction, we have to honour the
-  // ordering of DPValues between this and the other marker. Fall back to just
+  // ordering of DbgRecords between this and the other marker. Fall back to just
   // absorbing from the source.
   if (DbgMarker || It == BB->end()) {
     // Ensure we _do_ have a marker.
@@ -304,10 +304,11 @@ void Instruction::adoptDbgRecords(BasicBlock *BB, BasicBlock::iterator It,
     // block, it's important to not leave the empty marker trailing. It will
     // give a misleading impression that some debug records have been left
     // trailing.
-    ReleaseTrailingDPValues();
+    ReleaseTrailingDbgRecords();
   } else {
-    // Optimisation: we're transferring all the DPValues from the source marker
-    // onto this empty location: just adopt the other instructions marker.
+    // Optimisation: we're transferring all the DbgRecords from the source
+    // marker onto this empty location: just adopt the other instructions
+    // marker.
     DbgMarker = SrcMarker;
     DbgMarker->MarkedInstr = this;
     It->DbgMarker = nullptr;
@@ -319,8 +320,8 @@ void Instruction::dropDbgRecords() {
     DbgMarker->dropDbgRecords();
 }
 
-void Instruction::dropOneDbgRecord(DbgRecord *DPV) {
-  DbgMarker->dropOneDbgRecord(DPV);
+void Instruction::dropOneDbgRecord(DbgRecord *DVR) {
+  DbgMarker->dropOneDbgRecord(DVR);
 }
 
 bool Instruction::comesBefore(const Instruction *Other) const {
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index 21a2f229bff8f..29227cb80b335 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -48,9 +48,9 @@ LLVMContextImpl::~LLVMContextImpl() {
 #ifndef NDEBUG
   // Check that any variable location records that fell off the end of a block
   // when it's terminator was removed were eventually replaced. This assertion
-  // firing indicates that DPValues went missing during the lifetime of the
-  // LLVMContext.
-  assert(TrailingDPValues.empty() && "DPValue records in blocks not cleaned");
+  // firing indicates that DbgVariableRecords went missing during the lifetime
+  // of the LLVMContext.
+  assert(TrailingDbgRecords.empty() && "DbgRecords in blocks not cleaned");
 #endif
 
   // NOTE: We need to delete the contents of OwnedModules, but Module's dtor
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index c841b28ca4380..4542e16e59fd6 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -540,6 +540,7 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
   uint64_t OffsetInBits;
   uint32_t AlignInBits;
   std::optional<unsigned> DWARFAddressSpace;
+  std::optional<DIDerivedType::PtrAuthData> PtrAuthData;
   unsigned Flags;
   Metadata *ExtraData;
   Metadata *Annotations;
@@ -547,18 +548,21 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
   MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
                 Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
                 uint32_t AlignInBits, uint64_t OffsetInBits,
-                std::optional<unsigned> DWARFAddressSpace, unsigned Flags,
-                Metadata *ExtraData, Metadata *Annotations)
+                std::optional<unsigned> DWARFAddressSpace,
+                std::optional<DIDerivedType::PtrAuthData> PtrAuthData,
+                unsigned Flags, Metadata *ExtraData, Metadata *Annotations)
       : Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
         BaseType(BaseType), SizeInBits(SizeInBits), OffsetInBits(OffsetInBits),
         AlignInBits(AlignInBits), DWARFAddressSpace(DWARFAddressSpace),
-        Flags(Flags), ExtraData(ExtraData), Annotations(Annotations) {}
+        PtrAuthData(PtrAuthData), Flags(Flags), ExtraData(ExtraData),
+        Annotations(Annotations) {}
   MDNodeKeyImpl(const DIDerivedType *N)
       : Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
         Line(N->getLine()), Scope(N->getRawScope()),
         BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()),
         OffsetInBits(N->getOffsetInBits()), AlignInBits(N->getAlignInBits()),
-        DWARFAddressSpace(N->getDWARFAddressSpace()), Flags(N->getFlags()),
+        DWARFAddressSpace(N->getDWARFAddressSpace()),
+        PtrAuthData(N->getPtrAuthData()), Flags(N->getFlags()),
         ExtraData(N->getRawExtraData()), Annotations(N->getRawAnnotations()) {}
 
   bool isKeyOf(const DIDerivedType *RHS) const {
@@ -569,7 +573,8 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
            AlignInBits == RHS->getAlignInBits() &&
            OffsetInBits == RHS->getOffsetInBits() &&
            DWARFAddressSpace == RHS->getDWARFAddressSpace() &&
-           Flags == RHS->getFlags() && ExtraData == RHS->getRawExtraData() &&
+           PtrAuthData == RHS->getPtrAuthData() && Flags == RHS->getFlags() &&
+           ExtraData == RHS->getRawExtraData() &&
            Annotations == RHS->getRawAnnotations();
   }
 
@@ -1670,33 +1675,33 @@ class LLVMContextImpl {
   /// LLVMContext is used by compilation.
   void setOptPassGate(OptPassGate &);
 
-  /// Mapping of blocks to collections of "trailing" DPValues. As part of the
-  /// "RemoveDIs" project, debug-info variable location records are going to
-  /// cease being instructions... which raises the problem of where should they
-  /// be recorded when we remove the terminator of a blocks, such as:
+  /// Mapping of blocks to collections of "trailing" DbgVariableRecords. As part
+  /// of the "RemoveDIs" project, debug-info variable location records are going
+  /// to cease being instructions... which raises the problem of where should
+  /// they be recorded when we remove the terminator of a blocks, such as:
   ///
   ///    %foo = add i32 0, 0
   ///    br label %bar
   ///
   /// If the branch is removed, a legitimate transient state while editing a
   /// block, any debug-records between those two instructions will not have a
-  /// location. Each block thus records any DPValue records that "trail" in
-  /// such a way. These are stored in LLVMContext because typically LLVM only
-  /// edits a small number of blocks at a time, so there's no need to bloat
-  /// BasicBlock with such a data structure.
-  SmallDenseMap<BasicBlock *, DPMarker *> TrailingDPValues;
+  /// location. Each block thus records any DbgVariableRecord records that
+  /// "trail" in such a way. These are stored in LLVMContext because typically
+  /// LLVM only edits a small number of blocks at a time, so there's no need to
+  /// bloat BasicBlock with such a data structure.
+  SmallDenseMap<BasicBlock *, DPMarker *> TrailingDbgRecords;
 
-  // Set, get and delete operations for TrailingDPValues.
+  // Set, get and delete operations for TrailingDbgRecords.
   void setTrailingDbgRecords(BasicBlock *B, DPMarker *M) {
-    assert(!TrailingDPValues.count(B));
-    TrailingDPValues[B] = M;
+    assert(!TrailingDbgRecords.count(B));
+    TrailingDbgRecords[B] = M;
   }
 
   DPMarker *getTrailingDbgRecords(BasicBlock *B) {
-    return TrailingDPValues.lookup(B);
+    return TrailingDbgRecords.lookup(B);
   }
 
-  void deleteTrailingDbgRecords(BasicBlock *B) { TrailingDPValues.erase(B); }
+  void deleteTrailingDbgRecords(BasicBlock *B) { TrailingDbgRecords.erase(B); }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 8945c6dbc8d84..953f21ce74059 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -528,8 +528,9 @@ bool PassManagerImpl::run(Module &M) {
   dumpArguments();
   dumpPasses();
 
-  // RemoveDIs: if a command line flag is given, convert to the DPValue
-  // representation of debug-info for the duration of these passes.
+  // RemoveDIs: if a command line flag is given, convert to the
+  // DbgVariableRecord representation of debug-info for the duration of these
+  // passes.
   bool shouldConvertDbgInfo = UseNewDbgInfoFormat && !M.IsNewDbgInfoFormat;
   if (shouldConvertDbgInfo)
     M.convertToNewDbgValues();
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 06db91fe4f8e7..4472bf128c323 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -148,9 +148,11 @@ void MetadataAsValue::untrack() {
     MetadataTracking::untrack(MD);
 }
 
-DPValue *DebugValueUser::getUser() { return static_cast<DPValue *>(this); }
-const DPValue *DebugValueUser::getUser() const {
-  return static_cast<const DPValue *>(this);
+DbgVariableRecord *DebugValueUser::getUser() {
+  return static_cast<DbgVariableRecord *>(this);
+}
+const DbgVariableRecord *DebugValueUser::getUser() const {
+  return static_cast<const DbgVariableRecord *>(this);
 }
 
 void DebugValueUser::handleChangedValue(void *Old, Metadata *New) {
@@ -266,28 +268,29 @@ SmallVector<Metadata *> ReplaceableMetadataImpl::getAllArgListUsers() {
   return MDUsers;
 }
 
-SmallVector<DPValue *> ReplaceableMetadataImpl::getAllDPValueUsers() {
-  SmallVector<std::pair<OwnerTy, uint64_t> *> DPVUsersWithID;
+SmallVector<DbgVariableRecord *>
+ReplaceableMetadataImpl::getAllDbgVariableRecordUsers() {
+  SmallVector<std::pair<OwnerTy, uint64_t> *> DVRUsersWithID;
   for (auto Pair : UseMap) {
     OwnerTy Owner = Pair.second.first;
     if (Owner.isNull())
       continue;
     if (!Owner.is<DebugValueUser *>())
       continue;
-    DPVUsersWithID.push_back(&UseMap[Pair.first]);
+    DVRUsersWithID.push_back(&UseMap[Pair.first]);
   }
-  // Order DPValue users in reverse-creation order. Normal dbg.value users
-  // of MetadataAsValues are ordered by their UseList, i.e. reverse order of
-  // when they were added: we need to replicate that here. The structure of
+  // Order DbgVariableRecord users in reverse-creation order. Normal dbg.value
+  // users of MetadataAsValues are ordered by their UseList, i.e. reverse order
+  // of when they were added: we need to replicate that here. The structure of
   // debug-info output depends on the ordering of intrinsics, thus we need
   // to keep them consistent for comparisons sake.
-  llvm::sort(DPVUsersWithID, [](auto UserA, auto UserB) {
+  llvm::sort(DVRUsersWithID, [](auto UserA, auto UserB) {
     return UserA->second > UserB->second;
   });
-  SmallVector<DPValue *> DPVUsers;
-  for (auto UserWithID : DPVUsersWithID)
-    DPVUsers.push_back(UserWithID->first.get<DebugValueUser *>()->getUser());
-  return DPVUsers;
+  SmallVector<DbgVariableRecord *> DVRUsers;
+  for (auto UserWithID : DVRUsersWithID)
+    DVRUsers.push_back(UserWithID->first.get<DebugValueUser *>()->getUser());
+  return DVRUsers;
 }
 
 void ReplaceableMetadataImpl::addRef(void *Ref, OwnerTy Owner) {
diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp
index 42dec7c72328e..9b07bd8040492 100644
--- a/llvm/lib/IR/ReplaceConstant.cpp
+++ b/llvm/lib/IR/ReplaceConstant.cpp
@@ -22,11 +22,13 @@ static bool isExpandableUser(User *U) {
   return isa<ConstantExpr>(U) || isa<ConstantAggregate>(U);
 }
 
-static SmallVector<Instruction *, 4> expandUser(Instruction *InsertPt,
+static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt,
                                                 Constant *C) {
   SmallVector<Instruction *, 4> NewInsts;
   if (auto *CE = dyn_cast<ConstantExpr>(C)) {
-    NewInsts.push_back(CE->getAsInstruction(InsertPt));
+    Instruction *ConstInst = CE->getAsInstruction();
+    ConstInst->insertBefore(*InsertPt->getParent(), InsertPt);
+    NewInsts.push_back(ConstInst);
   } else if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
     Value *V = PoisonValue::get(C->getType());
     for (auto [Idx, Op] : enumerate(C->operands())) {
@@ -80,12 +82,11 @@ bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts) {
     Instruction *I = InstructionWorklist.pop_back_val();
     DebugLoc Loc = I->getDebugLoc();
     for (Use &U : I->operands()) {
-      auto *BI = I;
+      BasicBlock::iterator BI = I->getIterator();
       if (auto *Phi = dyn_cast<PHINode>(I)) {
         BasicBlock *BB = Phi->getIncomingBlock(U);
-        BasicBlock::iterator It = BB->getFirstInsertionPt();
-        assert(It != BB->end() && "Unexpected empty basic block");
-        BI = &*It;
+        BI = BB->getFirstInsertionPt();
+        assert(BI != BB->end() && "Unexpected empty basic block");
       }
 
       if (auto *C = dyn_cast<Constant>(U.get())) {
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index c4e47975b1822..61e1c35ba4a3a 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -574,16 +574,16 @@ void Value::replaceUsesWithIf(Value *New,
 /// with New.
 static void replaceDbgUsesOutsideBlock(Value *V, Value *New, BasicBlock *BB) {
   SmallVector<DbgVariableIntrinsic *> DbgUsers;
-  SmallVector<DPValue *> DPUsers;
+  SmallVector<DbgVariableRecord *> DPUsers;
   findDbgUsers(DbgUsers, V, &DPUsers);
   for (auto *DVI : DbgUsers) {
     if (DVI->getParent() != BB)
       DVI->replaceVariableLocationOp(V, New);
   }
-  for (auto *DPV : DPUsers) {
-    DPMarker *Marker = DPV->getMarker();
+  for (auto *DVR : DPUsers) {
+    DPMarker *Marker = DVR->getMarker();
     if (Marker->getParent() != BB)
-      DPV->replaceVariableLocationOp(V, New);
+      DVR->replaceVariableLocationOp(V, New);
   }
 }
 
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c3b65e63346fd..ea1b708b0467f 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -180,21 +180,21 @@ struct VerifierSupport {
     }
   }
 
-  void Write(DPValue::LocationType Type) {
+  void Write(DbgVariableRecord::LocationType Type) {
     switch (Type) {
-    case DPValue::LocationType::Value:
+    case DbgVariableRecord::LocationType::Value:
       *OS << "value";
       break;
-    case DPValue::LocationType::Declare:
+    case DbgVariableRecord::LocationType::Declare:
       *OS << "declare";
       break;
-    case DPValue::LocationType::Assign:
+    case DbgVariableRecord::LocationType::Assign:
       *OS << "assign";
       break;
-    case DPValue::LocationType::End:
+    case DbgVariableRecord::LocationType::End:
       *OS << "end";
       break;
-    case DPValue::LocationType::Any:
+    case DbgVariableRecord::LocationType::Any:
       *OS << "any";
       break;
     };
@@ -545,7 +545,7 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   void visitTemplateParams(const MDNode &N, const Metadata &RawParams);
 
   void visit(DPLabel &DPL);
-  void visit(DPValue &DPV);
+  void visit(DbgVariableRecord &DVR);
   // InstVisitor overrides...
   using InstVisitor<Verifier>::visit;
   void visitDbgRecords(Instruction &I);
@@ -632,15 +632,15 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   void verifySiblingFuncletUnwinds();
 
   void verifyFragmentExpression(const DbgVariableIntrinsic &I);
-  void verifyFragmentExpression(const DPValue &I);
+  void verifyFragmentExpression(const DbgVariableRecord &I);
   template <typename ValueOrMetadata>
   void verifyFragmentExpression(const DIVariable &V,
                                 DIExpression::FragmentInfo Fragment,
                                 ValueOrMetadata *Desc);
   void verifyFnArgs(const DbgVariableIntrinsic &I);
-  void verifyFnArgs(const DPValue &DPV);
+  void verifyFnArgs(const DbgVariableRecord &DVR);
   void verifyNotEntryValue(const DbgVariableIntrinsic &I);
-  void verifyNotEntryValue(const DPValue &I);
+  void verifyNotEntryValue(const DbgVariableRecord &I);
 
   /// Module-level debug info verification...
   void verifyCompileUnits();
@@ -690,12 +690,12 @@ void Verifier::visitDbgRecords(Instruction &I) {
     if (auto *Loc =
             dyn_cast_or_null<DILocation>(DR.getDebugLoc().getAsMDNode()))
       visitMDNode(*Loc, AreDebugLocsAllowed::Yes);
-    if (auto *DPV = dyn_cast<DPValue>(&DR)) {
-      visit(*DPV);
+    if (auto *DVR = dyn_cast<DbgVariableRecord>(&DR)) {
+      visit(*DVR);
       // These have to appear after `visit` for consistency with existing
       // intrinsic behaviour.
-      verifyFragmentExpression(*DPV);
-      verifyNotEntryValue(*DPV);
+      verifyFragmentExpression(*DVR);
+      verifyNotEntryValue(*DVR);
     } else if (auto *DPL = dyn_cast<DPLabel>(&DR)) {
       visit(*DPL);
     }
@@ -1235,6 +1235,7 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) {
               N.getTag() == dwarf::DW_TAG_volatile_type ||
               N.getTag() == dwarf::DW_TAG_restrict_type ||
               N.getTag() == dwarf::DW_TAG_atomic_type ||
+              N.getTag() == dwarf::DW_TAG_LLVM_ptrauth_type ||
               N.getTag() == dwarf::DW_TAG_member ||
               (N.getTag() == dwarf::DW_TAG_variable && N.isStaticMember()) ||
               N.getTag() == dwarf::DW_TAG_inheritance ||
@@ -4806,11 +4807,12 @@ void Verifier::visitDIAssignIDMetadata(Instruction &I, MDNode *MD) {
                 "dbg.assign not in same function as inst", DAI, &I);
     }
   }
-  for (DPValue *DPV : cast<DIAssignID>(MD)->getAllDPValueUsers()) {
-    CheckDI(DPV->isDbgAssign(),
-            "!DIAssignID should only be used by Assign DPVs.", MD, DPV);
-    CheckDI(DPV->getFunction() == I.getFunction(),
-            "DPVAssign not in same function as inst", DPV, &I);
+  for (DbgVariableRecord *DVR :
+       cast<DIAssignID>(MD)->getAllDbgVariableRecordUsers()) {
+    CheckDI(DVR->isDbgAssign(),
+            "!DIAssignID should only be used by Assign DVRs.", MD, DVR);
+    CheckDI(DVR->getFunction() == I.getFunction(),
+            "DVRAssign not in same function as inst", DVR, &I);
   }
 }
 
@@ -5264,6 +5266,29 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     }
     break;
   }
+  case Intrinsic::ucmp:
+  case Intrinsic::scmp: {
+    Type *SrcTy = Call.getOperand(0)->getType();
+    Type *DestTy = Call.getType();
+
+    Check(DestTy->getScalarSizeInBits() >= 2,
+          "result type must be at least 2 bits wide", Call);
+
+    bool IsDestTypeVector = DestTy->isVectorTy();
+    Check(SrcTy->isVectorTy() == IsDestTypeVector,
+          "ucmp/scmp argument and result types must both be either vector or "
+          "scalar types",
+          Call);
+    if (IsDestTypeVector) {
+      auto SrcVecLen = cast<VectorType>(SrcTy)->getElementCount();
+      auto DestVecLen = cast<VectorType>(DestTy)->getElementCount();
+      Check(SrcVecLen == DestVecLen,
+            "return type and arguments must have the same number of "
+            "elements",
+            Call);
+    }
+    break;
+  }
   case Intrinsic::coro_id: {
     auto *InfoArg = Call.getArgOperand(3)->stripPointerCasts();
     if (isa<ConstantPointerNull>(InfoArg))
@@ -6246,71 +6271,72 @@ void Verifier::visit(DPLabel &DPL) {
           Loc->getScope()->getSubprogram());
 }
 
-void Verifier::visit(DPValue &DPV) {
-  BasicBlock *BB = DPV.getParent();
+void Verifier::visit(DbgVariableRecord &DVR) {
+  BasicBlock *BB = DVR.getParent();
   Function *F = BB->getParent();
 
-  CheckDI(DPV.getType() == DPValue::LocationType::Value ||
-              DPV.getType() == DPValue::LocationType::Declare ||
-              DPV.getType() == DPValue::LocationType::Assign,
-          "invalid #dbg record type", &DPV, DPV.getType());
+  CheckDI(DVR.getType() == DbgVariableRecord::LocationType::Value ||
+              DVR.getType() == DbgVariableRecord::LocationType::Declare ||
+              DVR.getType() == DbgVariableRecord::LocationType::Assign,
+          "invalid #dbg record type", &DVR, DVR.getType());
 
-  // The location for a DPValue must be either a ValueAsMetadata, DIArgList, or
-  // an empty MDNode (which is a legacy representation for an "undef" location).
-  auto *MD = DPV.getRawLocation();
+  // The location for a DbgVariableRecord must be either a ValueAsMetadata,
+  // DIArgList, or an empty MDNode (which is a legacy representation for an
+  // "undef" location).
+  auto *MD = DVR.getRawLocation();
   CheckDI(MD && (isa<ValueAsMetadata>(MD) || isa<DIArgList>(MD) ||
                  (isa<MDNode>(MD) && !cast<MDNode>(MD)->getNumOperands())),
-          "invalid #dbg record address/value", &DPV, MD);
+          "invalid #dbg record address/value", &DVR, MD);
   if (auto *VAM = dyn_cast<ValueAsMetadata>(MD))
     visitValueAsMetadata(*VAM, F);
   else if (auto *AL = dyn_cast<DIArgList>(MD))
     visitDIArgList(*AL, F);
 
-  CheckDI(isa_and_nonnull<DILocalVariable>(DPV.getRawVariable()),
-          "invalid #dbg record variable", &DPV, DPV.getRawVariable());
-  visitMDNode(*DPV.getRawVariable(), AreDebugLocsAllowed::No);
+  CheckDI(isa_and_nonnull<DILocalVariable>(DVR.getRawVariable()),
+          "invalid #dbg record variable", &DVR, DVR.getRawVariable());
+  visitMDNode(*DVR.getRawVariable(), AreDebugLocsAllowed::No);
 
-  CheckDI(isa_and_nonnull<DIExpression>(DPV.getRawExpression()),
-          "invalid #dbg record expression", &DPV, DPV.getRawExpression());
-  visitMDNode(*DPV.getExpression(), AreDebugLocsAllowed::No);
+  CheckDI(isa_and_nonnull<DIExpression>(DVR.getRawExpression()),
+          "invalid #dbg record expression", &DVR, DVR.getRawExpression());
+  visitMDNode(*DVR.getExpression(), AreDebugLocsAllowed::No);
 
-  if (DPV.isDbgAssign()) {
-    CheckDI(isa_and_nonnull<DIAssignID>(DPV.getRawAssignID()),
-            "invalid #dbg_assign DIAssignID", &DPV, DPV.getRawAssignID());
-    visitMDNode(*cast<DIAssignID>(DPV.getRawAssignID()),
+  if (DVR.isDbgAssign()) {
+    CheckDI(isa_and_nonnull<DIAssignID>(DVR.getRawAssignID()),
+            "invalid #dbg_assign DIAssignID", &DVR, DVR.getRawAssignID());
+    visitMDNode(*cast<DIAssignID>(DVR.getRawAssignID()),
                 AreDebugLocsAllowed::No);
 
-    const auto *RawAddr = DPV.getRawAddress();
-    // Similarly to the location above, the address for an assign DPValue must
-    // be a ValueAsMetadata or an empty MDNode, which represents an undef
-    // address.
+    const auto *RawAddr = DVR.getRawAddress();
+    // Similarly to the location above, the address for an assign
+    // DbgVariableRecord must be a ValueAsMetadata or an empty MDNode, which
+    // represents an undef address.
     CheckDI(
         isa<ValueAsMetadata>(RawAddr) ||
             (isa<MDNode>(RawAddr) && !cast<MDNode>(RawAddr)->getNumOperands()),
-        "invalid #dbg_assign address", &DPV, DPV.getRawAddress());
+        "invalid #dbg_assign address", &DVR, DVR.getRawAddress());
     if (auto *VAM = dyn_cast<ValueAsMetadata>(RawAddr))
       visitValueAsMetadata(*VAM, F);
 
-    CheckDI(isa_and_nonnull<DIExpression>(DPV.getRawAddressExpression()),
-            "invalid #dbg_assign address expression", &DPV,
-            DPV.getRawAddressExpression());
-    visitMDNode(*DPV.getAddressExpression(), AreDebugLocsAllowed::No);
+    CheckDI(isa_and_nonnull<DIExpression>(DVR.getRawAddressExpression()),
+            "invalid #dbg_assign address expression", &DVR,
+            DVR.getRawAddressExpression());
+    visitMDNode(*DVR.getAddressExpression(), AreDebugLocsAllowed::No);
 
-    // All of the linked instructions should be in the same function as DPV.
-    for (Instruction *I : at::getAssignmentInsts(&DPV))
-      CheckDI(DPV.getFunction() == I->getFunction(),
-              "inst not in same function as #dbg_assign", I, &DPV);
+    // All of the linked instructions should be in the same function as DVR.
+    for (Instruction *I : at::getAssignmentInsts(&DVR))
+      CheckDI(DVR.getFunction() == I->getFunction(),
+              "inst not in same function as #dbg_assign", I, &DVR);
   }
 
   // This check is redundant with one in visitLocalVariable().
-  DILocalVariable *Var = DPV.getVariable();
+  DILocalVariable *Var = DVR.getVariable();
   CheckDI(isType(Var->getRawType()), "invalid type ref", Var,
           Var->getRawType());
 
-  auto *DLNode = DPV.getDebugLoc().getAsMDNode();
+  auto *DLNode = DVR.getDebugLoc().getAsMDNode();
   CheckDI(isa_and_nonnull<DILocation>(DLNode), "invalid #dbg record DILocation",
-          &DPV, DLNode);
-  DILocation *Loc = DPV.getDebugLoc();
+          &DVR, DLNode);
+  DILocation *Loc = DVR.getDebugLoc();
 
   // The scopes for variables and !dbg attachments must agree.
   DISubprogram *VarSP = getSubprogram(Var->getRawScope());
@@ -6320,10 +6346,10 @@ void Verifier::visit(DPValue &DPV) {
 
   CheckDI(VarSP == LocSP,
           "mismatched subprogram between #dbg record variable and DILocation",
-          &DPV, BB, F, Var, Var->getScope()->getSubprogram(), Loc,
+          &DVR, BB, F, Var, Var->getScope()->getSubprogram(), Loc,
           Loc->getScope()->getSubprogram());
 
-  verifyFnArgs(DPV);
+  verifyFnArgs(DVR);
 }
 
 void Verifier::visitVPIntrinsic(VPIntrinsic &VPI) {
@@ -6684,9 +6710,9 @@ void Verifier::verifyFragmentExpression(const DbgVariableIntrinsic &I) {
 
   verifyFragmentExpression(*V, *Fragment, &I);
 }
-void Verifier::verifyFragmentExpression(const DPValue &DPV) {
-  DILocalVariable *V = dyn_cast_or_null<DILocalVariable>(DPV.getRawVariable());
-  DIExpression *E = dyn_cast_or_null<DIExpression>(DPV.getRawExpression());
+void Verifier::verifyFragmentExpression(const DbgVariableRecord &DVR) {
+  DILocalVariable *V = dyn_cast_or_null<DILocalVariable>(DVR.getRawVariable());
+  DIExpression *E = dyn_cast_or_null<DIExpression>(DVR.getRawExpression());
 
   // We don't know whether this intrinsic verified correctly.
   if (!V || !E || !E->isValid())
@@ -6706,7 +6732,7 @@ void Verifier::verifyFragmentExpression(const DPValue &DPV) {
   if (V->isArtificial())
     return;
 
-  verifyFragmentExpression(*V, *Fragment, &DPV);
+  verifyFragmentExpression(*V, *Fragment, &DVR);
 }
 
 template <typename ValueOrMetadata>
@@ -6754,7 +6780,7 @@ void Verifier::verifyFnArgs(const DbgVariableIntrinsic &I) {
   CheckDI(!Prev || (Prev == Var), "conflicting debug info for argument", &I,
           Prev, Var);
 }
-void Verifier::verifyFnArgs(const DPValue &DPV) {
+void Verifier::verifyFnArgs(const DbgVariableRecord &DVR) {
   // This function does not take the scope of noninlined function arguments into
   // account. Don't run it if current function is nodebug, because it may
   // contain inlined debug intrinsics.
@@ -6762,10 +6788,10 @@ void Verifier::verifyFnArgs(const DPValue &DPV) {
     return;
 
   // For performance reasons only check non-inlined ones.
-  if (DPV.getDebugLoc()->getInlinedAt())
+  if (DVR.getDebugLoc()->getInlinedAt())
     return;
 
-  DILocalVariable *Var = DPV.getVariable();
+  DILocalVariable *Var = DVR.getVariable();
   CheckDI(Var, "#dbg record without variable");
 
   unsigned ArgNo = Var->getArg();
@@ -6779,7 +6805,7 @@ void Verifier::verifyFnArgs(const DPValue &DPV) {
 
   auto *Prev = DebugFnArgs[ArgNo - 1];
   DebugFnArgs[ArgNo - 1] = Var;
-  CheckDI(!Prev || (Prev == Var), "conflicting debug info for argument", &DPV,
+  CheckDI(!Prev || (Prev == Var), "conflicting debug info for argument", &DVR,
           Prev, Var);
 }
 
@@ -6806,15 +6832,15 @@ void Verifier::verifyNotEntryValue(const DbgVariableIntrinsic &I) {
           "swiftasync Argument",
           &I);
 }
-void Verifier::verifyNotEntryValue(const DPValue &DPV) {
-  DIExpression *E = dyn_cast_or_null<DIExpression>(DPV.getRawExpression());
+void Verifier::verifyNotEntryValue(const DbgVariableRecord &DVR) {
+  DIExpression *E = dyn_cast_or_null<DIExpression>(DVR.getRawExpression());
 
   // We don't know whether this intrinsic verified correctly.
   if (!E || !E->isValid())
     return;
 
-  if (isa<ValueAsMetadata>(DPV.getRawLocation())) {
-    Value *VarValue = DPV.getVariableLocationOp(0);
+  if (isa<ValueAsMetadata>(DVR.getRawLocation())) {
+    Value *VarValue = DVR.getVariableLocationOp(0);
     if (isa<UndefValue>(VarValue) || isa<PoisonValue>(VarValue))
       return;
     // We allow EntryValues for swift async arguments, as they have an
@@ -6827,7 +6853,7 @@ void Verifier::verifyNotEntryValue(const DPValue &DPV) {
   CheckDI(!E->isEntryValue(),
           "Entry values are only allowed in MIR unless they target a "
           "swiftasync Argument",
-          &DPV);
+          &DVR);
 }
 
 void Verifier::verifyCompileUnits() {
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index b4e8a99c87dfe..cb4bb3b7465a3 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/Linker/IRMover.h"
 #include "LinkDiagnosticInfo.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
@@ -1545,7 +1546,23 @@ Error IRLinker::run() {
     if (Error Err = SrcM->getMaterializer()->materializeMetadata())
       return Err;
 
-  DstM.IsNewDbgInfoFormat = SrcM->IsNewDbgInfoFormat;
+  // Convert source module to match dest for the duration of the link.
+  bool SrcModuleNewDbgFormat = SrcM->IsNewDbgInfoFormat;
+  if (DstM.IsNewDbgInfoFormat != SrcM->IsNewDbgInfoFormat) {
+    if (DstM.IsNewDbgInfoFormat)
+      SrcM->convertToNewDbgValues();
+    else
+      SrcM->convertFromNewDbgValues();
+  }
+  // Undo debug mode conversion afterwards.
+  auto Cleanup = make_scope_exit([&]() {
+    if (SrcModuleNewDbgFormat != SrcM->IsNewDbgInfoFormat) {
+      if (SrcModuleNewDbgFormat)
+        SrcM->convertToNewDbgValues();
+      else
+        SrcM->convertFromNewDbgValues();
+    }
+  });
 
   // Inherit the target data from the source module if the destination module
   // doesn't have one already.
@@ -1605,11 +1622,6 @@ Error IRLinker::run() {
   // Loop over all of the linked values to compute type mappings.
   computeTypeMapping();
 
-  // Convert module level attributes to function level attributes because
-  // after merging modules the attributes might change and would have different
-  // effect on the functions as the original module would have.
-  CopyModuleAttrToFunctions(*SrcM);
-
   std::reverse(Worklist.begin(), Worklist.end());
   while (!Worklist.empty()) {
     GlobalValue *GV = Worklist.back();
@@ -1779,8 +1791,6 @@ IRMover::IRMover(Module &M) : Composite(M) {
 Error IRMover::move(std::unique_ptr<Module> Src,
                     ArrayRef<GlobalValue *> ValuesToLink,
                     LazyCallback AddLazyFor, bool IsPerformingImport) {
-  if (getModule().IsNewDbgInfoFormat)
-    Src->convertToNewDbgValues();
   IRLinker TheIRLinker(Composite, SharedMDs, IdentifiedStructTypes,
                        std::move(Src), ValuesToLink, std::move(AddLazyFor),
                        IsPerformingImport);
diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp
index 39856e96e9be5..d72d6e07f2e6f 100644
--- a/llvm/lib/MC/SPIRVObjectWriter.cpp
+++ b/llvm/lib/MC/SPIRVObjectWriter.cpp
@@ -43,18 +43,14 @@ class SPIRVObjectWriter : public MCObjectWriter {
 
 void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) {
   constexpr uint32_t MagicNumber = 0x07230203;
-
-  // TODO: set the version on a min-necessary basis (just like the translator
-  // does) requires some refactoring of MCAssembler::VersionInfoType.
-  constexpr uint32_t Major = 1;
-  constexpr uint32_t Minor = 0;
-  constexpr uint32_t VersionNumber = 0 | (Major << 16) | (Minor << 8);
-  // TODO: check if we could use anything other than 0 (spec allows).
   constexpr uint32_t GeneratorMagicNumber = 0;
-  // TODO: do not hardcode this as well.
-  constexpr uint32_t Bound = 900;
   constexpr uint32_t Schema = 0;
 
+  // Construct SPIR-V version and Bound
+  const MCAssembler::VersionInfoType &VIT = Asm.getVersionInfo();
+  uint32_t VersionNumber = 0 | (VIT.Major << 16) | (VIT.Minor << 8);
+  uint32_t Bound = VIT.Update;
+
   W.write<uint32_t>(MagicNumber);
   W.write<uint32_t>(VersionNumber);
   W.write<uint32_t>(GeneratorMagicNumber);
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index f52bcb74938d1..e4d6e02f3aa60 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -214,33 +214,32 @@ static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
                            SecName.str().c_str());
 }
 
-static bool isCompressable(const SectionBase &Sec) {
-  return !(Sec.Flags & ELF::SHF_COMPRESSED) &&
-         StringRef(Sec.Name).starts_with(".debug");
-}
-
-static Error replaceDebugSections(
-    Object &Obj, function_ref<bool(const SectionBase &)> ShouldReplace,
-    function_ref<Expected<SectionBase *>(const SectionBase *)> AddSection) {
+Error Object::compressOrDecompressSections(const CommonConfig &Config) {
   // Build a list of the debug sections we are going to replace.
   // We can't call `AddSection` while iterating over sections,
   // because it would mutate the sections array.
-  SmallVector<SectionBase *, 13> ToReplace;
-  for (auto &Sec : Obj.sections())
-    if (ShouldReplace(Sec))
-      ToReplace.push_back(&Sec);
-
-  // Build a mapping from original section to a new one.
-  DenseMap<SectionBase *, SectionBase *> FromTo;
-  for (SectionBase *S : ToReplace) {
-    Expected<SectionBase *> NewSection = AddSection(S);
-    if (!NewSection)
-      return NewSection.takeError();
-
-    FromTo[S] = *NewSection;
+  SmallVector<std::pair<SectionBase *, std::function<SectionBase *()>>, 0>
+      ToReplace;
+  for (SectionBase &Sec : sections()) {
+    if ((Sec.Flags & SHF_ALLOC) || !StringRef(Sec.Name).starts_with(".debug"))
+      continue;
+    if (auto *CS = dyn_cast<CompressedSection>(&Sec)) {
+      if (Config.DecompressDebugSections) {
+        ToReplace.emplace_back(
+            &Sec, [=] { return &addSection<DecompressedSection>(*CS); });
+      }
+    } else if (Config.CompressionType != DebugCompressionType::None) {
+      ToReplace.emplace_back(&Sec, [&, S = &Sec] {
+        return &addSection<CompressedSection>(
+            CompressedSection(*S, Config.CompressionType, Is64Bits));
+      });
+    }
   }
 
-  return Obj.replaceSections(FromTo);
+  DenseMap<SectionBase *, SectionBase *> FromTo;
+  for (auto [S, Func] : ToReplace)
+    FromTo[S] = Func();
+  return replaceSections(FromTo);
 }
 
 static bool isAArch64MappingSymbol(const Symbol &Sym) {
@@ -534,24 +533,8 @@ static Error replaceAndRemoveSections(const CommonConfig &Config,
   if (Error E = Obj.removeSections(ELFConfig.AllowBrokenLinks, RemovePred))
     return E;
 
-  if (Config.CompressionType != DebugCompressionType::None) {
-    if (Error Err = replaceDebugSections(
-            Obj, isCompressable,
-            [&Config, &Obj](const SectionBase *S) -> Expected<SectionBase *> {
-              return &Obj.addSection<CompressedSection>(
-                  CompressedSection(*S, Config.CompressionType, Obj.Is64Bits));
-            }))
-      return Err;
-  } else if (Config.DecompressDebugSections) {
-    if (Error Err = replaceDebugSections(
-            Obj,
-            [](const SectionBase &S) { return isa<CompressedSection>(&S); },
-            [&Obj](const SectionBase *S) {
-              const CompressedSection *CS = cast<CompressedSection>(S);
-              return &Obj.addSection<DecompressedSection>(*CS);
-            }))
-      return Err;
-  }
+  if (Error E = Obj.compressOrDecompressSections(Config))
+    return E;
 
   return Error::success();
 }
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h
index 7a2e20d82d115..f72c109b6009e 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.h
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.h
@@ -1210,6 +1210,7 @@ class Object {
 
   Error removeSections(bool AllowBrokenLinks,
                        std::function<bool(const SectionBase &)> ToRemove);
+  Error compressOrDecompressSections(const CommonConfig &Config);
   Error replaceSections(const DenseMap<SectionBase *, SectionBase *> &FromTo);
   Error removeSymbols(function_ref<bool(const Symbol &)> ToRemove);
   template <class T, class... Ts> T &addSection(Ts &&...Args) {
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index aa57e55de70c8..97d0e4f75be8e 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -795,23 +795,31 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
       Entry.second = Entry.second > 1 ? 1 : 0;
   }
 
+  std::vector<std::unique_ptr<SymbolicFile>> SymFiles;
+
+  if (NeedSymbols != SymtabWritingMode::NoSymtab || isAIXBigArchive(Kind)) {
+    for (const NewArchiveMember &M : NewMembers) {
+      Expected<std::unique_ptr<SymbolicFile>> SymFileOrErr =
+          getSymbolicFile(M.Buf->getMemBufferRef(), Context);
+      if (!SymFileOrErr)
+        return createFileError(M.MemberName, SymFileOrErr.takeError());
+      SymFiles.push_back(std::move(*SymFileOrErr));
+    }
+  }
+
   // The big archive format needs to know the offset of the previous member
   // header.
   uint64_t PrevOffset = 0;
   uint64_t NextMemHeadPadSize = 0;
-  std::unique_ptr<SymbolicFile> CurSymFile;
-  std::unique_ptr<SymbolicFile> NextSymFile;
-  uint16_t Index = 0;
 
-  for (auto M = NewMembers.begin(); M < NewMembers.end(); ++M) {
+  for (uint32_t Index = 0; Index < NewMembers.size(); ++Index) {
+    const NewArchiveMember *M = &NewMembers[Index];
     std::string Header;
     raw_string_ostream Out(Header);
 
     MemoryBufferRef Buf = M->Buf->getMemBufferRef();
     StringRef Data = Thin ? "" : Buf.getBuffer();
 
-    Index++;
-
     // ld64 expects the members to be 8-byte aligned for 64-bit content and at
     // least 4-byte aligned for 32-bit content.  Opt for the larger encoding
     // uniformly.  This matches the behaviour with cctools and ensures that ld64
@@ -837,29 +845,9 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
           std::move(StringMsg), object::object_error::parse_failed);
     }
 
-    if (NeedSymbols != SymtabWritingMode::NoSymtab || isAIXBigArchive(Kind)) {
-      auto SetNextSymFile = [&NextSymFile,
-                             &Context](MemoryBufferRef Buf,
-                                       StringRef MemberName) -> Error {
-        Expected<std::unique_ptr<SymbolicFile>> SymFileOrErr =
-            getSymbolicFile(Buf, Context);
-        if (!SymFileOrErr)
-          return createFileError(MemberName, SymFileOrErr.takeError());
-        NextSymFile = std::move(*SymFileOrErr);
-        return Error::success();
-      };
-
-      if (M == NewMembers.begin())
-        if (Error Err = SetNextSymFile(Buf, M->MemberName))
-          return std::move(Err);
-
-      CurSymFile = std::move(NextSymFile);
-
-      if ((M + 1) != NewMembers.end())
-        if (Error Err = SetNextSymFile((M + 1)->Buf->getMemBufferRef(),
-                                       (M + 1)->MemberName))
-          return std::move(Err);
-    }
+    std::unique_ptr<SymbolicFile> CurSymFile;
+    if (!SymFiles.empty())
+      CurSymFile = std::move(SymFiles[Index]);
 
     // In the big archive file format, we need to calculate and include the next
     // member offset and previous member offset in the file member header.
@@ -880,13 +868,13 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
 
       // If there is another member file after this, we need to calculate the
       // padding before the header.
-      if ((M + 1) != NewMembers.end()) {
-        uint64_t OffsetToNextMemData = NextOffset +
-                                       sizeof(object::BigArMemHdrType) +
-                                       alignTo((M + 1)->MemberName.size(), 2);
+      if (Index + 1 != SymFiles.size()) {
+        uint64_t OffsetToNextMemData =
+            NextOffset + sizeof(object::BigArMemHdrType) +
+            alignTo(NewMembers[Index + 1].MemberName.size(), 2);
         NextMemHeadPadSize =
             alignToPowerOf2(OffsetToNextMemData,
-                            getMemberAlignment(NextSymFile.get())) -
+                            getMemberAlignment(SymFiles[Index + 1].get())) -
             OffsetToNextMemData;
         NextOffset += NextMemHeadPadSize;
       }
@@ -902,7 +890,7 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
     std::vector<unsigned> Symbols;
     if (NeedSymbols != SymtabWritingMode::NoSymtab) {
       Expected<std::vector<unsigned>> SymbolsOrErr =
-          getSymbols(CurSymFile.get(), Index, SymNames, SymMap);
+          getSymbols(CurSymFile.get(), Index + 1, SymNames, SymMap);
       if (!SymbolsOrErr)
         return createFileError(M->MemberName, SymbolsOrErr.takeError());
       Symbols = std::move(*SymbolsOrErr);
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index 137f606dd2d46..55dd0c8e06c09 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -251,7 +251,10 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
     }
     break;
   case ELF::EM_HEXAGON:
-    switch (Type) { STRINGIFY_ENUM_CASE(ELF, SHT_HEX_ORDERED); }
+    switch (Type) {
+      STRINGIFY_ENUM_CASE(ELF, SHT_HEX_ORDERED);
+      STRINGIFY_ENUM_CASE(ELF, SHT_HEXAGON_ATTRIBUTES);
+    }
     break;
   case ELF::EM_X86_64:
     switch (Type) { STRINGIFY_ENUM_CASE(ELF, SHT_X86_64_UNWIND); }
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 33be48196ae7d..efec612957de3 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Support/ARMAttributeParser.h"
 #include "llvm/Support/ARMBuildAttributes.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/HexagonAttributeParser.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/RISCVAttributeParser.h"
 #include "llvm/Support/RISCVAttributes.h"
@@ -287,6 +288,81 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
   return Features;
 }
 
+static std::optional<std::string> hexagonAttrToFeatureString(unsigned Attr) {
+  switch (Attr) {
+  case 5:
+    return "v5";
+  case 55:
+    return "v55";
+  case 60:
+    return "v60";
+  case 62:
+    return "v62";
+  case 65:
+    return "v65";
+  case 67:
+    return "v67";
+  case 68:
+    return "v68";
+  case 69:
+    return "v69";
+  case 71:
+    return "v71";
+  case 73:
+    return "v73";
+  default:
+    return {};
+  }
+}
+
+SubtargetFeatures ELFObjectFileBase::getHexagonFeatures() const {
+  SubtargetFeatures Features;
+  HexagonAttributeParser Parser;
+  if (Error E = getBuildAttributes(Parser)) {
+    // Return no attributes if none can be read.
+    // This behavior is important for backwards compatibility.
+    consumeError(std::move(E));
+    return Features;
+  }
+  std::optional<unsigned> Attr;
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::ARCH))) {
+    if (std::optional<std::string> FeatureString =
+            hexagonAttrToFeatureString(*Attr))
+      Features.AddFeature(*FeatureString);
+  }
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::HVXARCH))) {
+    std::optional<std::string> FeatureString =
+        hexagonAttrToFeatureString(*Attr);
+    // There is no corresponding hvx arch for v5 and v55.
+    if (FeatureString && *Attr >= 60)
+      Features.AddFeature("hvx" + *FeatureString);
+  }
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::HVXIEEEFP)))
+    if (*Attr)
+      Features.AddFeature("hvx-ieee-fp");
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::HVXQFLOAT)))
+    if (*Attr)
+      Features.AddFeature("hvx-qfloat");
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::ZREG)))
+    if (*Attr)
+      Features.AddFeature("zreg");
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::AUDIO)))
+    if (*Attr)
+      Features.AddFeature("audio");
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::CABAC)))
+    if (*Attr)
+      Features.AddFeature("cabac");
+
+  return Features;
+}
+
 Expected<SubtargetFeatures> ELFObjectFileBase::getRISCVFeatures() const {
   SubtargetFeatures Features;
   unsigned PlatformFlags = getPlatformFlags();
@@ -349,6 +425,8 @@ Expected<SubtargetFeatures> ELFObjectFileBase::getFeatures() const {
     return getRISCVFeatures();
   case ELF::EM_LOONGARCH:
     return getLoongArchFeatures();
+  case ELF::EM_HEXAGON:
+    return getHexagonFeatures();
   default:
     return SubtargetFeatures();
   }
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
index 7dc9822bdd221..a6871e7855e4e 100644
--- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
@@ -24,14 +24,14 @@ static_assert((uint64_t)dxbc::FeatureFlags::NextUnusedBit <= 1ull << 63,
               "Shader flag bits exceed enum size.");
 
 DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) {
-#define SHADER_FEATURE_FLAG(Num, Val, Str)                                     \
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str)                      \
   Val = (FlagData & (uint64_t)dxbc::FeatureFlags::Val) > 0;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 }
 
 uint64_t DXContainerYAML::ShaderFeatureFlags::getEncodedFlags() {
   uint64_t Flag = 0;
-#define SHADER_FEATURE_FLAG(Num, Val, Str)                                     \
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str)                      \
   if (Val)                                                                     \
     Flag |= (uint64_t)dxbc::FeatureFlags::Val;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
@@ -105,7 +105,8 @@ void MappingTraits<DXContainerYAML::DXILProgram>::mapping(
 
 void MappingTraits<DXContainerYAML::ShaderFeatureFlags>::mapping(
     IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags) {
-#define SHADER_FEATURE_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val);
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str)                      \
+  IO.mapRequired(#Val, Flags.Val);
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 }
 
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 9c1a28db592a1..045211c44b907 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -716,6 +716,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
     break;
   case ELF::EM_HEXAGON:
     ECase(SHT_HEX_ORDERED);
+    ECase(SHT_HEXAGON_ATTRIBUTES);
     break;
   case ELF::EM_X86_64:
     ECase(SHT_X86_64_UNWIND);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 6b24ba56ccc54..a63d653494612 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -370,7 +370,7 @@ class TriggerVerifierErrorPass
     // Intentionally break the Function by inserting a terminator
     // instruction in the middle of a basic block.
     BasicBlock &BB = F.getEntryBlock();
-    new UnreachableInst(F.getContext(), BB.getTerminator());
+    new UnreachableInst(F.getContext(), BB.getTerminator()->getIterator());
     return PreservedAnalyses::none();
   }
 
@@ -515,15 +515,6 @@ static std::optional<int> parseDevirtPassName(StringRef Name) {
   return Count;
 }
 
-static bool checkParametrizedPassName(StringRef Name, StringRef PassName) {
-  if (!Name.consume_front(PassName))
-    return false;
-  // normal pass name w/o parameters == default parameters
-  if (Name.empty())
-    return true;
-  return Name.starts_with("<") && Name.ends_with(">");
-}
-
 static std::optional<OptimizationLevel> parseOptLevel(StringRef S) {
   return StringSwitch<std::optional<OptimizationLevel>>(S)
       .Case("O0", OptimizationLevel::O0)
@@ -537,42 +528,6 @@ static std::optional<OptimizationLevel> parseOptLevel(StringRef S) {
 
 namespace {
 
-/// This performs customized parsing of pass name with parameters.
-///
-/// We do not need parametrization of passes in textual pipeline very often,
-/// yet on a rare occasion ability to specify parameters right there can be
-/// useful.
-///
-/// \p Name - parameterized specification of a pass from a textual pipeline
-/// is a string in a form of :
-///      PassName '<' parameter-list '>'
-///
-/// Parameter list is being parsed by the parser callable argument, \p Parser,
-/// It takes a string-ref of parameters and returns either StringError or a
-/// parameter list in a form of a custom parameters type, all wrapped into
-/// Expected<> template class.
-///
-template <typename ParametersParseCallableT>
-auto parsePassParameters(ParametersParseCallableT &&Parser, StringRef Name,
-                         StringRef PassName) -> decltype(Parser(StringRef{})) {
-  using ParametersT = typename decltype(Parser(StringRef{}))::value_type;
-
-  StringRef Params = Name;
-  if (!Params.consume_front(PassName)) {
-    assert(false &&
-           "unable to strip pass name from parametrized pass specification");
-  }
-  if (!Params.empty() &&
-      (!Params.consume_front("<") || !Params.consume_back(">"))) {
-    assert(false && "invalid format for parametrized pass name");
-  }
-
-  Expected<ParametersT> Result = Parser(Params);
-  assert((Result || Result.template errorIsA<StringError>()) &&
-         "Pass parameter parser can only return StringErrors.");
-  return Result;
-}
-
 /// Parser of parameters for HardwareLoops  pass.
 Expected<HardwareLoopOptions> parseHardwareLoopOptions(StringRef Params) {
   HardwareLoopOptions HardwareLoopOpts;
@@ -1208,7 +1163,7 @@ static bool isModulePassName(StringRef Name, CallbacksT &Callbacks) {
   if (Name == NAME)                                                            \
     return true;
 #define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)      \
-  if (checkParametrizedPassName(Name, NAME))                                   \
+  if (PassBuilder::checkParametrizedPassName(Name, NAME))                      \
     return true;
 #define MODULE_ANALYSIS(NAME, CREATE_PASS)                                     \
   if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
@@ -1237,7 +1192,7 @@ static bool isCGSCCPassName(StringRef Name, CallbacksT &Callbacks) {
   if (Name == NAME)                                                            \
     return true;
 #define CGSCC_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)       \
-  if (checkParametrizedPassName(Name, NAME))                                   \
+  if (PassBuilder::checkParametrizedPassName(Name, NAME))                      \
     return true;
 #define CGSCC_ANALYSIS(NAME, CREATE_PASS)                                      \
   if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
@@ -1264,7 +1219,7 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
   if (Name == NAME)                                                            \
     return true;
 #define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)    \
-  if (checkParametrizedPassName(Name, NAME))                                   \
+  if (PassBuilder::checkParametrizedPassName(Name, NAME))                      \
     return true;
 #define FUNCTION_ANALYSIS(NAME, CREATE_PASS)                                   \
   if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
@@ -1305,7 +1260,7 @@ static bool isLoopNestPassName(StringRef Name, CallbacksT &Callbacks,
   if (parseRepeatPassName(Name))
     return true;
 
-  if (checkParametrizedPassName(Name, "lnicm")) {
+  if (PassBuilder::checkParametrizedPassName(Name, "lnicm")) {
     UseMemorySSA = true;
     return true;
   }
@@ -1327,7 +1282,7 @@ static bool isLoopPassName(StringRef Name, CallbacksT &Callbacks,
   if (parseRepeatPassName(Name))
     return true;
 
-  if (checkParametrizedPassName(Name, "licm")) {
+  if (PassBuilder::checkParametrizedPassName(Name, "licm")) {
     UseMemorySSA = true;
     return true;
   }
@@ -1336,7 +1291,7 @@ static bool isLoopPassName(StringRef Name, CallbacksT &Callbacks,
   if (Name == NAME)                                                            \
     return true;
 #define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)        \
-  if (checkParametrizedPassName(Name, NAME))                                   \
+  if (PassBuilder::checkParametrizedPassName(Name, NAME))                      \
     return true;
 #define LOOP_ANALYSIS(NAME, CREATE_PASS)                                       \
   if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index e757c735719b1..d9fe88a00bdfc 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -60,16 +60,16 @@ class ProfOStream {
   // \c patch can only be called when all data is written and flushed.
   // For raw_string_ostream, the patch is done on the target string
   // directly and it won't be reflected in the stream's internal buffer.
-  void patch(PatchItem *P, int NItems) {
+  void patch(ArrayRef<PatchItem> P) {
     using namespace support;
 
     if (IsFDOStream) {
       raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
       const uint64_t LastPos = FDOStream.tell();
-      for (int K = 0; K < NItems; K++) {
-        FDOStream.seek(P[K].Pos);
-        for (int I = 0; I < P[K].N; I++)
-          write(P[K].D[I]);
+      for (const auto &K : P) {
+        FDOStream.seek(K.Pos);
+        for (int I = 0; I < K.N; I++)
+          write(K.D[I]);
       }
       // Reset the stream to the last position after patching so that users
       // don't accidentally overwrite data. This makes it consistent with
@@ -78,11 +78,11 @@ class ProfOStream {
     } else {
       raw_string_ostream &SOStream = static_cast<raw_string_ostream &>(OS);
       std::string &Data = SOStream.str(); // with flush
-      for (int K = 0; K < NItems; K++) {
-        for (int I = 0; I < P[K].N; I++) {
+      for (const auto &K : P) {
+        for (int I = 0; I < K.N; I++) {
           uint64_t Bytes =
-              endian::byte_swap<uint64_t, llvm::endianness::little>(P[K].D[I]);
-          Data.replace(P[K].Pos + I * sizeof(uint64_t), sizeof(uint64_t),
+              endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
+          Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
                        (const char *)&Bytes, sizeof(uint64_t));
         }
       }
@@ -575,7 +575,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
         {MemProfSectionStart + sizeof(uint64_t), &FramePayloadOffset, 1},
         {MemProfSectionStart + 2 * sizeof(uint64_t), &FrameTableOffset, 1},
     };
-    OS.patch(PatchItems, 3);
+    OS.patch(PatchItems);
   }
 
   // BinaryIdSection has two parts:
@@ -693,7 +693,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
         {CSSummaryOffset, reinterpret_cast<uint64_t *>(TheCSSummary.get()),
          (int)CSSummarySize}};
 
-    OS.patch(PatchItems, std::size(PatchItems));
+    OS.patch(PatchItems);
   } else {
     // Now do the final patch:
     PatchItem PatchItems[] = {
@@ -713,7 +713,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
         {CSSummaryOffset, reinterpret_cast<uint64_t *>(TheCSSummary.get()),
          (int)CSSummarySize}};
 
-    OS.patch(PatchItems, std::size(PatchItems));
+    OS.patch(PatchItems);
   }
 
   for (const auto &I : FunctionData)
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index e686b97652330..7a383c3236d9b 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -2022,6 +2022,13 @@ APInt APInt::ushl_ov(unsigned ShAmt, bool &Overflow) const {
   return *this << ShAmt;
 }
 
+APInt APInt::sfloordiv_ov(const APInt &RHS, bool &Overflow) const {
+  APInt quotient = sdiv_ov(RHS, Overflow);
+  if ((quotient * RHS != *this) && (isNegative() != RHS.isNegative()))
+    return quotient - 1;
+  return quotient;
+}
+
 APInt APInt::sadd_sat(const APInt &RHS) const {
   bool Overflow;
   APInt Res = sadd_ov(RHS, Overflow);
@@ -3094,3 +3101,23 @@ void llvm::LoadIntFromMemory(APInt &IntVal, const uint8_t *Src,
     memcpy(Dst + sizeof(uint64_t) - LoadBytes, Src, LoadBytes);
   }
 }
+
+APInt APIntOps::avgFloorS(const APInt &C1, const APInt &C2) {
+  // Return floor((C1 + C2) / 2)
+  return (C1 & C2) + (C1 ^ C2).ashr(1);
+}
+
+APInt APIntOps::avgFloorU(const APInt &C1, const APInt &C2) {
+  // Return floor((C1 + C2) / 2)
+  return (C1 & C2) + (C1 ^ C2).lshr(1);
+}
+
+APInt APIntOps::avgCeilS(const APInt &C1, const APInt &C2) {
+  // Return ceil((C1 + C2) / 2)
+  return (C1 | C2) - (C1 ^ C2).ashr(1);
+}
+
+APInt APIntOps::avgCeilU(const APInt &C1, const APInt &C2) {
+  // Return ceil((C1 + C2) / 2)
+  return (C1 | C2) - (C1 ^ C2).lshr(1);
+}
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 90d0d25e99e2c..9d4e48db8b405 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -188,6 +188,8 @@ add_llvm_component_library(LLVMSupport
   GlobPattern.cpp
   GraphWriter.cpp
   Hashing.cpp
+  HexagonAttributeParser.cpp
+  HexagonAttributes.cpp
   InitLLVM.cpp
   InstructionCost.cpp
   IntEqClasses.cpp
diff --git a/llvm/lib/Support/HexagonAttributeParser.cpp b/llvm/lib/Support/HexagonAttributeParser.cpp
new file mode 100644
index 0000000000000..2143162d11c79
--- /dev/null
+++ b/llvm/lib/Support/HexagonAttributeParser.cpp
@@ -0,0 +1,55 @@
+//===-- HexagonAttributeParser.cpp - Hexagon Attribute Parser -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/HexagonAttributeParser.h"
+
+using namespace llvm;
+
+const HexagonAttributeParser::DisplayHandler
+    HexagonAttributeParser::DisplayRoutines[] = {
+        {
+            HexagonAttrs::ARCH,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::HVXARCH,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::HVXIEEEFP,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::HVXQFLOAT,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::ZREG,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::AUDIO,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::CABAC,
+            &ELFAttributeParser::integerAttribute,
+        }};
+
+Error HexagonAttributeParser::handler(uint64_t Tag, bool &Handled) {
+  Handled = false;
+  for (const auto &R : DisplayRoutines) {
+    if (uint64_t(R.Attribute) == Tag) {
+      if (Error E = (this->*R.Routine)(Tag))
+        return E;
+      Handled = true;
+      break;
+    }
+  }
+  return Error::success();
+}
diff --git a/llvm/lib/Support/HexagonAttributes.cpp b/llvm/lib/Support/HexagonAttributes.cpp
new file mode 100644
index 0000000000000..165215c8fb676
--- /dev/null
+++ b/llvm/lib/Support/HexagonAttributes.cpp
@@ -0,0 +1,27 @@
+//===-- HexagonAttributes.cpp - Qualcomm Hexagon Attributes ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/HexagonAttributes.h"
+
+using namespace llvm;
+using namespace llvm::HexagonAttrs;
+
+static constexpr TagNameItem TagData[] = {
+    {ARCH, "Tag_arch"},
+    {HVXARCH, "Tag_hvx_arch"},
+    {HVXIEEEFP, "Tag_hvx_ieeefp"},
+    {HVXQFLOAT, "Tag_hvx_qfloat"},
+    {ZREG, "Tag_zreg"},
+    {AUDIO, "Tag_audio"},
+    {CABAC, "Tag_cabac"},
+};
+
+constexpr TagNameMap HexagonAttributeTags{TagData};
+const TagNameMap &llvm::HexagonAttrs::getHexagonAttributeTags() {
+  return HexagonAttributeTags;
+}
diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp
index 6eec03fd6f708..39235ace47248 100644
--- a/llvm/lib/Support/RISCVISAInfo.cpp
+++ b/llvm/lib/Support/RISCVISAInfo.cpp
@@ -90,11 +90,14 @@ static const RISCVSupportedExtension SupportedExtensions[] = {
     {"xcvmac", {1, 0}},
     {"xcvmem", {1, 0}},
     {"xcvsimd", {1, 0}},
+    {"xsfcease", {1, 0}},
     {"xsfvcp", {1, 0}},
     {"xsfvfnrclipxfqf", {1, 0}},
     {"xsfvfwmaccqqq", {1, 0}},
     {"xsfvqmaccdod", {1, 0}},
     {"xsfvqmaccqoq", {1, 0}},
+    {"xsifivecdiscarddlone", {1, 0}},
+    {"xsifivecflushdlone", {1, 0}},
     {"xtheadba", {1, 0}},
     {"xtheadbb", {1, 0}},
     {"xtheadbs", {1, 0}},
@@ -258,7 +261,7 @@ static void PrintExtension(StringRef Name, StringRef Version,
                            StringRef Description) {
   outs().indent(4);
   unsigned VersionWidth = Description.empty() ? 0 : 10;
-  outs() << left_justify(Name, 20) << left_justify(Version, VersionWidth)
+  outs() << left_justify(Name, 21) << left_justify(Version, VersionWidth)
          << Description << "\n";
 }
 
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 4d625b3eb5b17..092028dd2a5b3 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/TimeProfiler.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/JSON.h"
@@ -20,6 +21,7 @@
 #include <algorithm>
 #include <cassert>
 #include <chrono>
+#include <memory>
 #include <mutex>
 #include <string>
 #include <vector>
@@ -64,17 +66,19 @@ using CountAndDurationType = std::pair<size_t, DurationType>;
 using NameAndCountAndDurationType =
     std::pair<std::string, CountAndDurationType>;
 
+} // anonymous namespace
+
 /// Represents an open or completed time section entry to be captured.
-struct TimeTraceProfilerEntry {
+struct llvm::TimeTraceProfilerEntry {
   const TimePointType Start;
   TimePointType End;
   const std::string Name;
   const std::string Detail;
-
+  const bool AsyncEvent = false;
   TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N,
-                         std::string &&Dt)
+                         std::string &&Dt, bool Ae)
       : Start(std::move(S)), End(std::move(E)), Name(std::move(N)),
-        Detail(std::move(Dt)) {}
+        Detail(std::move(Dt)), AsyncEvent(Ae) {}
 
   // Calculate timings for FlameGraph. Cast time points to microsecond precision
   // rather than casting duration. This avoids truncation issues causing inner
@@ -92,8 +96,6 @@ struct TimeTraceProfilerEntry {
   }
 };
 
-} // anonymous namespace
-
 struct llvm::TimeTraceProfiler {
   TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "")
       : BeginningOfTime(system_clock::now()), StartTime(ClockType::now()),
@@ -102,22 +104,23 @@ struct llvm::TimeTraceProfiler {
     llvm::get_thread_name(ThreadName);
   }
 
-  void begin(std::string Name, llvm::function_ref<std::string()> Detail) {
-    Stack.emplace_back(ClockType::now(), TimePointType(), std::move(Name),
-                       Detail());
+  TimeTraceProfilerEntry *begin(std::string Name,
+                                llvm::function_ref<std::string()> Detail,
+                                bool AsyncEvent = false) {
+    Stack.emplace_back(std::make_unique<TimeTraceProfilerEntry>(
+        ClockType::now(), TimePointType(), std::move(Name), Detail(),
+        AsyncEvent));
+    return Stack.back().get();
   }
 
   void end() {
     assert(!Stack.empty() && "Must call begin() first");
-    TimeTraceProfilerEntry &E = Stack.back();
-    E.End = ClockType::now();
+    end(*Stack.back().get());
+  }
 
-    // Check that end times monotonically increase.
-    assert((Entries.empty() ||
-            (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
-             Entries.back().getFlameGraphStartUs(StartTime) +
-                 Entries.back().getFlameGraphDurUs())) &&
-           "TimeProfiler scope ended earlier than previous scope");
+  void end(TimeTraceProfilerEntry &E) {
+    assert(!Stack.empty() && "Must call begin() first");
+    E.End = ClockType::now();
 
     // Calculate duration at full precision for overall counts.
     DurationType Duration = E.End - E.Start;
@@ -132,15 +135,18 @@ struct llvm::TimeTraceProfiler {
     // happens to be the ones that don't have any currently open entries above
     // itself.
     if (llvm::none_of(llvm::drop_begin(llvm::reverse(Stack)),
-                      [&](const TimeTraceProfilerEntry &Val) {
-                        return Val.Name == E.Name;
+                      [&](const std::unique_ptr<TimeTraceProfilerEntry> &Val) {
+                        return Val->Name == E.Name;
                       })) {
       auto &CountAndTotal = CountAndTotalPerName[E.Name];
       CountAndTotal.first++;
       CountAndTotal.second += Duration;
-    }
+    };
 
-    Stack.pop_back();
+    llvm::erase_if(Stack,
+                   [&](const std::unique_ptr<TimeTraceProfilerEntry> &Val) {
+                     return Val.get() == &E;
+                   });
   }
 
   // Write events from this TimeTraceProfilerInstance and
@@ -168,14 +174,32 @@ struct llvm::TimeTraceProfiler {
       J.object([&] {
         J.attribute("pid", Pid);
         J.attribute("tid", int64_t(Tid));
-        J.attribute("ph", "X");
         J.attribute("ts", StartUs);
-        J.attribute("dur", DurUs);
+        if (E.AsyncEvent) {
+          J.attribute("cat", E.Name);
+          J.attribute("ph", "b");
+          J.attribute("id", 0);
+        } else {
+          J.attribute("ph", "X");
+          J.attribute("dur", DurUs);
+        }
         J.attribute("name", E.Name);
         if (!E.Detail.empty()) {
           J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
         }
       });
+
+      if (E.AsyncEvent) {
+        J.object([&] {
+          J.attribute("pid", Pid);
+          J.attribute("tid", int64_t(Tid));
+          J.attribute("ts", StartUs + DurUs);
+          J.attribute("cat", E.Name);
+          J.attribute("ph", "e");
+          J.attribute("id", 0);
+          J.attribute("name", E.Name);
+        });
+      }
     };
     for (const TimeTraceProfilerEntry &E : Entries)
       writeEvent(E, this->Tid);
@@ -269,7 +293,7 @@ struct llvm::TimeTraceProfiler {
     J.objectEnd();
   }
 
-  SmallVector<TimeTraceProfilerEntry, 16> Stack;
+  SmallVector<std::unique_ptr<TimeTraceProfilerEntry>, 16> Stack;
   SmallVector<TimeTraceProfilerEntry, 128> Entries;
   StringMap<CountAndDurationType> CountAndTotalPerName;
   // System clock time when the session was begun.
@@ -341,19 +365,36 @@ Error llvm::timeTraceProfilerWrite(StringRef PreferredFileName,
   return Error::success();
 }
 
-void llvm::timeTraceProfilerBegin(StringRef Name, StringRef Detail) {
+TimeTraceProfilerEntry *llvm::timeTraceProfilerBegin(StringRef Name,
+                                                     StringRef Detail) {
   if (TimeTraceProfilerInstance != nullptr)
-    TimeTraceProfilerInstance->begin(std::string(Name),
-                                     [&]() { return std::string(Detail); });
+    return TimeTraceProfilerInstance->begin(
+        std::string(Name), [&]() { return std::string(Detail); }, false);
+  return nullptr;
 }
 
-void llvm::timeTraceProfilerBegin(StringRef Name,
-                                  llvm::function_ref<std::string()> Detail) {
+TimeTraceProfilerEntry *
+llvm::timeTraceProfilerBegin(StringRef Name,
+                             llvm::function_ref<std::string()> Detail) {
   if (TimeTraceProfilerInstance != nullptr)
-    TimeTraceProfilerInstance->begin(std::string(Name), Detail);
+    return TimeTraceProfilerInstance->begin(std::string(Name), Detail, false);
+  return nullptr;
+}
+
+TimeTraceProfilerEntry *llvm::timeTraceAsyncProfilerBegin(StringRef Name,
+                                                          StringRef Detail) {
+  if (TimeTraceProfilerInstance != nullptr)
+    return TimeTraceProfilerInstance->begin(
+        std::string(Name), [&]() { return std::string(Detail); }, true);
+  return nullptr;
 }
 
 void llvm::timeTraceProfilerEnd() {
   if (TimeTraceProfilerInstance != nullptr)
     TimeTraceProfilerInstance->end();
 }
+
+void llvm::timeTraceProfilerEnd(TimeTraceProfilerEntry *E) {
+  if (TimeTraceProfilerInstance != nullptr)
+    TimeTraceProfilerInstance->end(*E);
+}
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index 051dd2a67d120..057f8eae0552c 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -1344,7 +1344,7 @@ std::error_code RedirectingFileSystem::isLocal(const Twine &Path_,
   SmallString<256> Path;
   Path_.toVector(Path);
 
-  if (makeCanonical(Path))
+  if (makeAbsolute(Path))
     return {};
 
   return ExternalFS->isLocal(Path, Result);
@@ -1411,7 +1411,7 @@ directory_iterator RedirectingFileSystem::dir_begin(const Twine &Dir,
   SmallString<256> Path;
   Dir.toVector(Path);
 
-  EC = makeCanonical(Path);
+  EC = makeAbsolute(Path);
   if (EC)
     return {};
 
@@ -2261,8 +2261,8 @@ void RedirectingFileSystem::LookupResult::getPath(
   llvm::sys::path::append(Result, E->getName());
 }
 
-std::error_code
-RedirectingFileSystem::makeCanonical(SmallVectorImpl<char> &Path) const {
+std::error_code RedirectingFileSystem::makeCanonicalForLookup(
+    SmallVectorImpl<char> &Path) const {
   if (std::error_code EC = makeAbsolute(Path))
     return EC;
 
@@ -2277,12 +2277,16 @@ RedirectingFileSystem::makeCanonical(SmallVectorImpl<char> &Path) const {
 
 ErrorOr<RedirectingFileSystem::LookupResult>
 RedirectingFileSystem::lookupPath(StringRef Path) const {
+  llvm::SmallString<128> CanonicalPath(Path);
+  if (std::error_code EC = makeCanonicalForLookup(CanonicalPath))
+    return EC;
+
   // RedirectOnly means the VFS is always used.
   if (UsageTrackingActive && Redirection == RedirectKind::RedirectOnly)
     HasBeenUsed = true;
 
-  sys::path::const_iterator Start = sys::path::begin(Path);
-  sys::path::const_iterator End = sys::path::end(Path);
+  sys::path::const_iterator Start = sys::path::begin(CanonicalPath);
+  sys::path::const_iterator End = sys::path::end(CanonicalPath);
   llvm::SmallVector<Entry *, 32> Entries;
   for (const auto &Root : Roots) {
     ErrorOr<RedirectingFileSystem::LookupResult> Result =
@@ -2358,14 +2362,14 @@ static Status getRedirectedFileStatus(const Twine &OriginalPath,
 }
 
 ErrorOr<Status> RedirectingFileSystem::status(
-    const Twine &CanonicalPath, const Twine &OriginalPath,
+    const Twine &LookupPath, const Twine &OriginalPath,
     const RedirectingFileSystem::LookupResult &Result) {
   if (std::optional<StringRef> ExtRedirect = Result.getExternalRedirect()) {
-    SmallString<256> CanonicalRemappedPath((*ExtRedirect).str());
-    if (std::error_code EC = makeCanonical(CanonicalRemappedPath))
+    SmallString<256> RemappedPath((*ExtRedirect).str());
+    if (std::error_code EC = makeAbsolute(RemappedPath))
       return EC;
 
-    ErrorOr<Status> S = ExternalFS->status(CanonicalRemappedPath);
+    ErrorOr<Status> S = ExternalFS->status(RemappedPath);
     if (!S)
       return S;
     S = Status::copyWithNewName(*S, *ExtRedirect);
@@ -2375,13 +2379,13 @@ ErrorOr<Status> RedirectingFileSystem::status(
   }
 
   auto *DE = cast<RedirectingFileSystem::DirectoryEntry>(Result.E);
-  return Status::copyWithNewName(DE->getStatus(), CanonicalPath);
+  return Status::copyWithNewName(DE->getStatus(), LookupPath);
 }
 
 ErrorOr<Status>
-RedirectingFileSystem::getExternalStatus(const Twine &CanonicalPath,
+RedirectingFileSystem::getExternalStatus(const Twine &LookupPath,
                                          const Twine &OriginalPath) const {
-  auto Result = ExternalFS->status(CanonicalPath);
+  auto Result = ExternalFS->status(LookupPath);
 
   // The path has been mapped by some nested VFS, don't override it with the
   // original path.
@@ -2391,38 +2395,37 @@ RedirectingFileSystem::getExternalStatus(const Twine &CanonicalPath,
 }
 
 ErrorOr<Status> RedirectingFileSystem::status(const Twine &OriginalPath) {
-  SmallString<256> CanonicalPath;
-  OriginalPath.toVector(CanonicalPath);
+  SmallString<256> Path;
+  OriginalPath.toVector(Path);
 
-  if (std::error_code EC = makeCanonical(CanonicalPath))
+  if (std::error_code EC = makeAbsolute(Path))
     return EC;
 
   if (Redirection == RedirectKind::Fallback) {
     // Attempt to find the original file first, only falling back to the
     // mapped file if that fails.
-    ErrorOr<Status> S = getExternalStatus(CanonicalPath, OriginalPath);
+    ErrorOr<Status> S = getExternalStatus(Path, OriginalPath);
     if (S)
       return S;
   }
 
-  ErrorOr<RedirectingFileSystem::LookupResult> Result =
-      lookupPath(CanonicalPath);
+  ErrorOr<RedirectingFileSystem::LookupResult> Result = lookupPath(Path);
   if (!Result) {
     // Was not able to map file, fallthrough to using the original path if
     // that was the specified redirection type.
     if (Redirection == RedirectKind::Fallthrough &&
         isFileNotFound(Result.getError()))
-      return getExternalStatus(CanonicalPath, OriginalPath);
+      return getExternalStatus(Path, OriginalPath);
     return Result.getError();
   }
 
-  ErrorOr<Status> S = status(CanonicalPath, OriginalPath, *Result);
+  ErrorOr<Status> S = status(Path, OriginalPath, *Result);
   if (!S && Redirection == RedirectKind::Fallthrough &&
       isFileNotFound(S.getError(), Result->E)) {
     // Mapped the file but it wasn't found in the underlying filesystem,
     // fallthrough to using the original path if that was the specified
     // redirection type.
-    return getExternalStatus(CanonicalPath, OriginalPath);
+    return getExternalStatus(Path, OriginalPath);
   }
 
   return S;
@@ -2471,30 +2474,27 @@ File::getWithPath(ErrorOr<std::unique_ptr<File>> Result, const Twine &P) {
 
 ErrorOr<std::unique_ptr<File>>
 RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) {
-  SmallString<256> CanonicalPath;
-  OriginalPath.toVector(CanonicalPath);
+  SmallString<256> Path;
+  OriginalPath.toVector(Path);
 
-  if (std::error_code EC = makeCanonical(CanonicalPath))
+  if (std::error_code EC = makeAbsolute(Path))
     return EC;
 
   if (Redirection == RedirectKind::Fallback) {
     // Attempt to find the original file first, only falling back to the
     // mapped file if that fails.
-    auto F = File::getWithPath(ExternalFS->openFileForRead(CanonicalPath),
-                               OriginalPath);
+    auto F = File::getWithPath(ExternalFS->openFileForRead(Path), OriginalPath);
     if (F)
       return F;
   }
 
-  ErrorOr<RedirectingFileSystem::LookupResult> Result =
-      lookupPath(CanonicalPath);
+  ErrorOr<RedirectingFileSystem::LookupResult> Result = lookupPath(Path);
   if (!Result) {
     // Was not able to map file, fallthrough to using the original path if
     // that was the specified redirection type.
     if (Redirection == RedirectKind::Fallthrough &&
         isFileNotFound(Result.getError()))
-      return File::getWithPath(ExternalFS->openFileForRead(CanonicalPath),
-                               OriginalPath);
+      return File::getWithPath(ExternalFS->openFileForRead(Path), OriginalPath);
     return Result.getError();
   }
 
@@ -2502,22 +2502,21 @@ RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) {
     return make_error_code(llvm::errc::invalid_argument);
 
   StringRef ExtRedirect = *Result->getExternalRedirect();
-  SmallString<256> CanonicalRemappedPath(ExtRedirect.str());
-  if (std::error_code EC = makeCanonical(CanonicalRemappedPath))
+  SmallString<256> RemappedPath(ExtRedirect.str());
+  if (std::error_code EC = makeAbsolute(RemappedPath))
     return EC;
 
   auto *RE = cast<RedirectingFileSystem::RemapEntry>(Result->E);
 
-  auto ExternalFile = File::getWithPath(
-      ExternalFS->openFileForRead(CanonicalRemappedPath), ExtRedirect);
+  auto ExternalFile =
+      File::getWithPath(ExternalFS->openFileForRead(RemappedPath), ExtRedirect);
   if (!ExternalFile) {
     if (Redirection == RedirectKind::Fallthrough &&
         isFileNotFound(ExternalFile.getError(), Result->E)) {
       // Mapped the file but it wasn't found in the underlying filesystem,
       // fallthrough to using the original path if that was the specified
       // redirection type.
-      return File::getWithPath(ExternalFS->openFileForRead(CanonicalPath),
-                               OriginalPath);
+      return File::getWithPath(ExternalFS->openFileForRead(Path), OriginalPath);
     }
     return ExternalFile;
   }
@@ -2537,28 +2536,27 @@ RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) {
 std::error_code
 RedirectingFileSystem::getRealPath(const Twine &OriginalPath,
                                    SmallVectorImpl<char> &Output) const {
-  SmallString<256> CanonicalPath;
-  OriginalPath.toVector(CanonicalPath);
+  SmallString<256> Path;
+  OriginalPath.toVector(Path);
 
-  if (std::error_code EC = makeCanonical(CanonicalPath))
+  if (std::error_code EC = makeAbsolute(Path))
     return EC;
 
   if (Redirection == RedirectKind::Fallback) {
     // Attempt to find the original file first, only falling back to the
     // mapped file if that fails.
-    std::error_code EC = ExternalFS->getRealPath(CanonicalPath, Output);
+    std::error_code EC = ExternalFS->getRealPath(Path, Output);
     if (!EC)
       return EC;
   }
 
-  ErrorOr<RedirectingFileSystem::LookupResult> Result =
-      lookupPath(CanonicalPath);
+  ErrorOr<RedirectingFileSystem::LookupResult> Result = lookupPath(Path);
   if (!Result) {
     // Was not able to map file, fallthrough to using the original path if
     // that was the specified redirection type.
     if (Redirection == RedirectKind::Fallthrough &&
         isFileNotFound(Result.getError()))
-      return ExternalFS->getRealPath(CanonicalPath, Output);
+      return ExternalFS->getRealPath(Path, Output);
     return Result.getError();
   }
 
@@ -2571,7 +2569,7 @@ RedirectingFileSystem::getRealPath(const Twine &OriginalPath,
       // Mapped the file but it wasn't found in the underlying filesystem,
       // fallthrough to using the original path if that was the specified
       // redirection type.
-      return ExternalFS->getRealPath(CanonicalPath, Output);
+      return ExternalFS->getRealPath(Path, Output);
     }
     return P;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 402c7292d7f81..6425aa9b091f7 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -873,6 +873,12 @@ def TuneA520    : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
                                    FeatureFuseAdrpAdd,
                                    FeaturePostRAScheduler]>;
 
+def TuneA520AE  : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
+                                   "Cortex-A520AE ARM processors", [
+                                   FeatureFuseAES,
+                                   FeatureFuseAdrpAdd,
+                                   FeaturePostRAScheduler]>;
+
 def TuneA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
                                    "Cortex-A57 ARM processors", [
                                    FeatureFuseAES,
@@ -1001,6 +1007,17 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720",
                                  FeatureEnableSelectOptimize,
                                  FeaturePredictableSelectIsExpensive]>;
 
+def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720",
+                                "Cortex-A720AE ARM processors", [
+                                 FeatureFuseAES,
+                                 FeaturePostRAScheduler,
+                                 FeatureCmpBccFusion,
+                                 FeatureAddrLSLFast,
+                                 FeatureALULSLFast,
+                                 FeatureFuseAdrpAdd,
+                                 FeatureEnableSelectOptimize,
+                                 FeaturePredictableSelectIsExpensive]>;
+
 def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
                                "CortexR82",
                                "Cortex-R82 ARM processors", [
@@ -1423,6 +1440,9 @@ def ProcessorFeatures {
   list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                                  FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
                                  FeatureFP16FML];
+  list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
+                                 FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
+                                 FeatureFP16FML];
   list<SubtargetFeature> A65  = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
                                  FeatureNEON, FeatureFullFP16, FeatureDotProd,
                                  FeatureRCPC, FeatureSSBS, FeatureRAS,
@@ -1456,6 +1476,9 @@ def ProcessorFeatures {
   list<SubtargetFeature> A720 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML,
                                  FeatureTRBE, FeatureSVE2BitPerm, FeatureETE,
                                  FeaturePerfMon, FeatureSPE, FeatureSPE_EEF];
+  list<SubtargetFeature> A720AE = [HasV9_2aOps, FeatureMTE, FeatureFP16FML,
+                                 FeatureTRBE, FeatureSVE2BitPerm, FeatureETE,
+                                 FeaturePerfMon, FeatureSPE, FeatureSPE_EEF];
   list<SubtargetFeature> R82  = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16,
                                  FeatureFP16FML, FeatureSSBS, FeaturePredRes,
                                  FeatureSB, FeatureRDM, FeatureDotProd,
@@ -1598,6 +1621,8 @@ def : ProcessorModel<"cortex-a510", CortexA510Model, ProcessorFeatures.A510,
                      [TuneA510]>;
 def : ProcessorModel<"cortex-a520", CortexA510Model, ProcessorFeatures.A520,
                      [TuneA520]>;
+def : ProcessorModel<"cortex-a520ae", CortexA510Model, ProcessorFeatures.A520AE,
+                     [TuneA520AE]>;
 def : ProcessorModel<"cortex-a57", CortexA57Model, ProcessorFeatures.A53,
                      [TuneA57]>;
 def : ProcessorModel<"cortex-a65", CortexA53Model, ProcessorFeatures.A65,
@@ -1628,6 +1653,8 @@ def : ProcessorModel<"cortex-a715", NeoverseN2Model, ProcessorFeatures.A715,
                      [TuneA715]>;
 def : ProcessorModel<"cortex-a720", NeoverseN2Model, ProcessorFeatures.A720,
                      [TuneA720]>;
+def : ProcessorModel<"cortex-a720ae", NeoverseN2Model, ProcessorFeatures.A720AE,
+                     [TuneA720AE]>;
 def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82,
                      [TuneR82]>;
 def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1,
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index b2c52b443753d..03f0778bae59d 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -987,7 +987,7 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB,
   // Expand the pseudo into smstart or smstop instruction. The pseudo has the
   // following operands:
   //
-  //   MSRpstatePseudo <za|sm|both>, <0|1>, pstate.sm, expectedval, <regmask>
+  //   MSRpstatePseudo <za|sm|both>, <0|1>, condition[, pstate.sm], <regmask>
   //
   // The pseudo is expanded into a conditional smstart/smstop, with a
   // check if pstate.sm (register) equals the expected value, and if not,
@@ -997,9 +997,9 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB,
   // streaming-compatible function:
   //
   // OrigBB:
-  //   MSRpstatePseudo 3, 0, %0, 0, <regmask>             <- Conditional SMSTOP
+  //   MSRpstatePseudo 3, 0, IfCallerIsStreaming, %0, <regmask>  <- Cond SMSTOP
   //   bl @normal_callee
-  //   MSRpstatePseudo 3, 1, %0, 0, <regmask>             <- Conditional SMSTART
+  //   MSRpstatePseudo 3, 1, IfCallerIsStreaming, %0, <regmask>  <- Cond SMSTART
   //
   // ...which will be transformed into:
   //
@@ -1022,11 +1022,20 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB,
   // We test the live value of pstate.sm and toggle pstate.sm if this is not the
   // expected value for the callee (0 for a normal callee and 1 for a streaming
   // callee).
-  auto PStateSM = MI.getOperand(2).getReg();
+  unsigned Opc;
+  switch (MI.getOperand(2).getImm()) {
+  case AArch64SME::Always:
+    llvm_unreachable("Should have matched to instruction directly");
+  case AArch64SME::IfCallerIsStreaming:
+    Opc = AArch64::TBNZW;
+    break;
+  case AArch64SME::IfCallerIsNonStreaming:
+    Opc = AArch64::TBZW;
+    break;
+  }
+  auto PStateSM = MI.getOperand(3).getReg();
   auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
   unsigned SMReg32 = TRI->getSubReg(PStateSM, AArch64::sub_32);
-  bool IsStreamingCallee = MI.getOperand(3).getImm();
-  unsigned Opc = IsStreamingCallee ? AArch64::TBZW : AArch64::TBNZW;
   MachineInstrBuilder Tbx =
       BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(SMReg32).addImm(0);
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9665ae5ceb903..92f59dbf1e9e5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1131,6 +1131,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
           ISD::FMUL,           ISD::FDIV,           ISD::FMA,
           ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
           ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
+          ISD::FSIN,           ISD::FCOS,           ISD::FPOW,
+          ISD::FLOG,           ISD::FLOG2,          ISD::FLOG10,
+          ISD::FEXP,           ISD::FEXP2,          ISD::FEXP10,
           ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
           ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
           ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
@@ -5270,13 +5273,13 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         AArch64ISD::SMSTART, DL, MVT::Other,
         Op->getOperand(0), // Chain
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
   case Intrinsic::aarch64_sme_za_disable:
     return DAG.getNode(
         AArch64ISD::SMSTOP, DL, MVT::Other,
         Op->getOperand(0), // Chain
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
   }
 }
 
@@ -7197,11 +7200,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
           getRegClassFor(PStateSM.getValueType().getSimpleVT()));
       FuncInfo->setPStateSMReg(Reg);
       Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
-    } else {
-      PStateSM = DAG.getConstant(0, DL, MVT::i64);
-    }
-    Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, PStateSM,
-                                /*Entry*/ true);
+      Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
+                                  AArch64SME::IfCallerIsNonStreaming, PStateSM);
+    } else
+      Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
+                                  AArch64SME::Always);
 
     // Ensure that the SMSTART happens after the CopyWithChain such that its
     // chain result is used.
@@ -7786,9 +7789,11 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   }
 }
 
-SDValue AArch64TargetLowering::changeStreamingMode(
-    SelectionDAG &DAG, SDLoc DL, bool Enable,
-    SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {
+SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
+                                                   bool Enable, SDValue Chain,
+                                                   SDValue InGlue,
+                                                   unsigned Condition,
+                                                   SDValue PStateSM) const {
   MachineFunction &MF = DAG.getMachineFunction();
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   FuncInfo->setHasStreamingModeChanges(true);
@@ -7797,10 +7802,13 @@ SDValue AArch64TargetLowering::changeStreamingMode(
   SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
   SDValue MSROp =
       DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
-
-  SDValue ExpectedSMVal =
-      DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
-  SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};
+  SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
+  SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
+  if (Condition != AArch64SME::Always) {
+    assert(PStateSM && "PStateSM should be defined");
+    Ops.push_back(PStateSM);
+  }
+  Ops.push_back(RegMask);
 
   if (InGlue)
     Ops.push_back(InGlue);
@@ -7809,6 +7817,19 @@ SDValue AArch64TargetLowering::changeStreamingMode(
   return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
 }
 
+static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
+                               const SMEAttrs &CalleeAttrs) {
+  if (!CallerAttrs.hasStreamingCompatibleInterface() ||
+      CallerAttrs.hasStreamingBody())
+    return AArch64SME::Always;
+  if (CalleeAttrs.hasNonStreamingInterface())
+    return AArch64SME::IfCallerIsStreaming;
+  if (CalleeAttrs.hasStreamingInterface())
+    return AArch64SME::IfCallerIsNonStreaming;
+
+  llvm_unreachable("Unsupported attributes");
+}
+
 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
 /// and add input and output parameter nodes.
 SDValue
@@ -8028,7 +8049,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     Chain = DAG.getNode(
         AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
@@ -8299,9 +8320,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   SDValue InGlue;
   if (RequiresSMChange) {
-    SDValue NewChain =
-        changeStreamingMode(DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain,
-                            InGlue, PStateSM, true);
+    SDValue NewChain = changeStreamingMode(
+        DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
+        getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
     Chain = NewChain.getValue(0);
     InGlue = NewChain.getValue(1);
   }
@@ -8455,8 +8476,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (RequiresSMChange) {
     assert(PStateSM && "Expected a PStateSM to be set");
-    Result = changeStreamingMode(DAG, DL, !CalleeAttrs.hasStreamingInterface(),
-                                 Result, InGlue, PStateSM, false);
+    Result = changeStreamingMode(
+        DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
+        getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
   }
 
   if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
@@ -8464,7 +8486,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     Result = DAG.getNode(
         AArch64ISD::SMSTART, DL, MVT::Other, Result,
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
 
   if (ShouldPreserveZT0)
     Result =
@@ -8599,13 +8621,12 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       Register Reg = FuncInfo->getPStateSMReg();
       assert(Reg.isValid() && "PStateSM Register is invalid");
       SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
-      Chain =
-          changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
-                              /*Glue*/ SDValue(), PStateSM, /*Entry*/ false);
+      Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
+                                  /*Glue*/ SDValue(),
+                                  AArch64SME::IfCallerIsNonStreaming, PStateSM);
     } else
-      Chain = changeStreamingMode(
-          DAG, DL, /*Enable*/ false, Chain,
-          /*Glue*/ SDValue(), DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true);
+      Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
+                                  /*Glue*/ SDValue(), AArch64SME::Always);
     Glue = Chain.getValue(1);
   }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 68341c199e0a2..89016cbf56e39 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -968,12 +968,12 @@ class AArch64TargetLowering : public TargetLowering {
   bool shouldExpandCttzElements(EVT VT) const override;
 
   /// If a change in streaming mode is required on entry to/return from a
-  /// function call it emits and returns the corresponding SMSTART or SMSTOP node.
-  /// \p Entry tells whether this is before/after the Call, which is necessary
-  /// because PSTATE.SM is only queried once.
+  /// function call it emits and returns the corresponding SMSTART or SMSTOP
+  /// node. \p Condition should be one of the enum values from
+  /// AArch64SME::ToggleCondition.
   SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable,
-                              SDValue Chain, SDValue InGlue,
-                              SDValue PStateSM, bool Entry) const;
+                              SDValue Chain, SDValue InGlue, unsigned Condition,
+                              SDValue PStateSM = SDValue()) const;
 
   bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
 
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 0ab2c401b1749..d0adb78b231a7 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -2362,7 +2362,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
       // Get the needed alignments to check them if
       // ldp-aligned-only/stp-aligned-only features are opted.
       uint64_t MemAlignment = MemOp->getAlign().value();
-      uint64_t TypeAlignment = Align(MemOp->getSize()).value();
+      uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();
 
       if (MemAlignment < 2 * TypeAlignment) {
         NumFailedAlignmentCheck++;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index ec59778cbb0ef..ad29003f1e817 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1072,3 +1072,8 @@ bool AArch64RegisterInfo::shouldCoalesce(
 
   return true;
 }
+
+bool AArch64RegisterInfo::shouldAnalyzePhysregInMachineLoopInfo(
+    MCRegister R) const {
+  return R == AArch64::VG;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index e93352c13b3c7..5c8a5e029584f 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -145,6 +145,8 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
 
   void getOffsetOpcodes(const StackOffset &Offset,
                         SmallVectorImpl<uint64_t> &Ops) const override;
+
+  bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2907ba74ff810..b65c67e70a4e0 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -10,12 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-def AArch64_smstart : SDNode<"AArch64ISD::SMSTART", SDTypeProfile<0, 3,
-                             [SDTCisInt<0>, SDTCisInt<0>, SDTCisInt<0>]>,
+def AArch64_smstart : SDNode<"AArch64ISD::SMSTART", SDTypeProfile<0, 2,
+                             [SDTCisInt<0>, SDTCisInt<0>]>,
                              [SDNPHasChain, SDNPSideEffect, SDNPVariadic,
                               SDNPOptInGlue, SDNPOutGlue]>;
-def AArch64_smstop  : SDNode<"AArch64ISD::SMSTOP", SDTypeProfile<0, 3,
-                             [SDTCisInt<0>, SDTCisInt<0>, SDTCisInt<0>]>,
+def AArch64_smstop  : SDNode<"AArch64ISD::SMSTOP", SDTypeProfile<0, 2,
+                             [SDTCisInt<0>, SDTCisInt<0>]>,
                              [SDNPHasChain, SDNPSideEffect, SDNPVariadic,
                               SDNPOptInGlue, SDNPOutGlue]>;
 def AArch64_restore_za : SDNode<"AArch64ISD::RESTORE_ZA", SDTypeProfile<0, 3,
@@ -158,40 +158,14 @@ def : Pat<(AArch64_restore_za
             (i64 GPR64:$tpidr2_el0), (i64 GPR64sp:$tpidr2obj), (i64 texternalsym:$restore_routine)),
           (RestoreZAPseudo GPR64:$tpidr2_el0, GPR64sp:$tpidr2obj, texternalsym:$restore_routine)>;
 
-// Scenario A:
-//
-//   %pstate.before.call = 1
-//   if (%pstate.before.call != 0)
-//     smstop (pstate_za|pstate_sm)
-//   call fn()
-//   if (%pstate.before.call != 0)
-//     smstart (pstate_za|pstate_sm)
-//
-def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 1), (i64 0)),   // before call
-          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
-def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 1), (i64 0)),  // after call
-          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b1)>;
-
-// Scenario B:
-//
-//   %pstate.before.call = 0
-//   if (%pstate.before.call != 1)
-//     smstart (pstate_za|pstate_sm)
-//   call fn()
-//   if (%pstate.before.call != 1)
-//     smstop (pstate_za|pstate_sm)
-//
-def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 0), (i64 1)),  // before call
-          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b1)>;
-def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 0), (i64 1)),   // after call
-          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
-
 // Read and write TPIDR2_EL0
 def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
           (MSR 0xde85, GPR64:$val)>;
 def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
           (MRS 0xde85)>;
 
+} // End let Predicates = [HasSME]
+
 multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> {
   def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> {
     let Constraints = "$dst = $src";
@@ -211,8 +185,6 @@ multiclass CoalescerBarriers {
 
 defm COALESCER_BARRIER : CoalescerBarriers;
 
-} // End let Predicates = [HasSME]
-
 // Pseudo to match to smstart/smstop. This expands:
 //
 //  pseudonode (pstate_za|pstate_sm), before_call, expected_value
@@ -230,17 +202,24 @@ defm COALESCER_BARRIER : CoalescerBarriers;
 // SME instructions.
 def MSRpstatePseudo :
   Pseudo<(outs),
-           (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>,
+           (ins svcr_op:$pstatefield, timm0_1:$imm, timm0_31:$condition, variable_ops), []>,
     Sched<[WriteSys]> {
   let hasPostISelHook = 1;
   let Uses = [VG];
   let Defs = [VG];
 }
 
-def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
-          (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
-def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
-          (MSRpstatePseudo svcr_op:$pstate, 0b0, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
+def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 timm0_31:$condition)),
+          (MSRpstatePseudo svcr_op:$pstate, 0b1, timm0_31:$condition)>;
+def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 timm0_31:$condition)),
+          (MSRpstatePseudo svcr_op:$pstate, 0b0, timm0_31:$condition)>;
+
+// Unconditional start/stop
+def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)),
+          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b1)>;
+def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)),
+          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
+
 
 //===----------------------------------------------------------------------===//
 // SME2 Instructions
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index e0a010af41553..dd5e11c0f5e35 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2042,9 +2042,8 @@ let Predicates = [HasSVEorSME] in {
   defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;
 
   def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)),
-                 (i64 (!cast<Instruction>(CNTP_XPP_B)
-                        (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1)),
-                        (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1))))>;
+            (CNTP_XPP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op1),
+                        (BRKB_PPzP (PTRUE_B 31), PPR:$Op1))>;
 }
 
   defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
@@ -2131,15 +2130,12 @@ let Predicates = [HasSVEorSME] in {
   defm DECP_ZP     : sve_int_count_v<0b10100, "decp">;
 
   def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv16i1:$Op2)))),
-                 (i64 (!cast<Instruction>(INCP_XP_B)
-                        (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)),
-                        GPR64:$Op1))>;
+            (INCP_XP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op2), GPR64:$Op1)>;
 
   def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv16i1:$Op2))))),
-                 (i32 (EXTRACT_SUBREG (i64 (!cast<Instruction>(INCP_XP_B)
-                        (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)),
-                        (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Op1, sub_32))),
-                      sub_32))>;
+            (EXTRACT_SUBREG (INCP_XP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op2),
+                                       (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+                            sub_32)>;
 
   defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>;
   defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>;
@@ -2521,9 +2517,9 @@ let Predicates = [HasSVEorSME] in {
               (ADDVL_XXI GPR64:$op, $imm)>;
 
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                             GPR32:$op, sub_32), $imm),
-                                   sub_32))>;
+              (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
+                                        GPR32:$op, sub_32), $imm),
+                              sub_32)>;
 
     def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
               (INCH_XPiI GPR64:$op, 31, $imm)>;
@@ -2540,30 +2536,30 @@ let Predicates = [HasSVEorSME] in {
               (DECD_XPiI GPR64:$op, 31, $imm)>;
 
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (INCW_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (INCW_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (INCD_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (INCD_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
 
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm_neg i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (DECW_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (DECW_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm_neg i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
   }
 
   // FIXME: BigEndian requires an additional REV instruction to satisfy the
@@ -3266,58 +3262,58 @@ let Predicates = [HasSVEorSME] in {
   // Extract element from vector with immediate index that's within the bottom 128-bits.
   let Predicates = [IsNeonAvailable], AddedComplexity = 1 in {
   def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)),
-            (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+            (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)),
-            (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+            (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
   def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
-            (i32 (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
+            (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
   def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)),
-            (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>;
+            (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>;
   } // End IsNeonAvailable
 
   let Predicates = [IsNeonAvailable] in {
   def : Pat<(sext_inreg (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index), i8),
-            (i32 (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+            (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index))), i8),
-            (i64 (SMOVvi8to64 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+            (SMOVvi8to64 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
 
   def : Pat<(sext_inreg (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index), i16),
-            (i32 (SMOVvi16to32 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+            (SMOVvi16to32 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
   def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index))), i16),
-            (i64 (SMOVvi16to64 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+            (SMOVvi16to64 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
 
   def : Pat<(sext (i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index))),
-            (i64 (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
+            (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
   } // End IsNeonAvailable
 
   // Extract first element from vector.
   let AddedComplexity = 2 in {
-  def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)),
-            (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv8i16 ZPR:$Zs), (i64 0)),
-            (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv4i32 ZPR:$Zs), (i64 0)),
-            (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv2i64 ZPR:$Zs), (i64 0)),
-            (i64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
-  def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
-            (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv4f16 ZPR:$Zs), (i64 0)),
-            (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv2f16 ZPR:$Zs), (i64 0)),
-            (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv8bf16 ZPR:$Zs), (i64 0)),
-            (bf16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv4bf16 ZPR:$Zs), (i64 0)),
-            (bf16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv2bf16 ZPR:$Zs), (i64 0)),
-            (bf16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
-            (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv2f32 ZPR:$Zs), (i64 0)),
-            (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
-            (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
+  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, dsub)>;
+  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(bf16 (vector_extract (nxv8bf16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(bf16 (vector_extract (nxv4bf16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(bf16 (vector_extract (nxv2bf16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, dsub)>;
   }
 
   multiclass sve_predicated_add<SDNode extend, int value> {
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index ef7c517732ef3..aabc5d5d22e2d 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -532,8 +532,10 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
                               ConstantInt::get(IRB.getInt64Ty(), Tag)});
     if (Info.AI->hasName())
       TagPCall->setName(Info.AI->getName() + ".tag");
-    // Does not replace metadata, so we don't have to handle DPValues.
-    Info.AI->replaceNonMetadataUsesWith(TagPCall);
+    // Does not replace metadata, so we don't have to handle DbgVariableRecords.
+    Info.AI->replaceUsesWithIf(TagPCall, [&](const Use &U) {
+      return !memtag::isLifetimeIntrinsic(U.getUser());
+    });
     TagPCall->setOperand(0, Info.AI);
 
     // Calls to functions that may return twice (e.g. setjmp) confuse the
@@ -550,7 +552,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
       uint64_t Size =
           cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
       Size = alignTo(Size, kTagGranuleSize);
-      tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size);
+      tagAlloca(AI, Start->getNextNode(), TagPCall, Size);
 
       auto TagEnd = [&](Instruction *Node) { untagAlloca(AI, Node, Size); };
       if (!DT || !PDT ||
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 755b034764ed2..7a86c5c608812 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1129,7 +1129,7 @@ static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
       auto *NewRHS =
           IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
       auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
-          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
+          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
       return IC.replaceInstUsesWith(II, NewBinOp);
     }
   }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 7a49422c064b7..677dd0b502b95 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2852,8 +2852,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       return false;
     }
 
-    uint64_t MemSizeInBytes = LdSt.getMemSize();
-    unsigned MemSizeInBits = LdSt.getMemSizeInBits();
+    uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
+    unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
 
     // Need special instructions for atomics that affect ordering.
@@ -3276,7 +3276,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
       if (LoadMI && IsGPR) {
         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
-        unsigned BytesLoaded = MemOp->getSize();
+        unsigned BytesLoaded = MemOp->getSize().getValue();
         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
           return selectCopy(I, TII, MRI, TRI, RBI);
       }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 36adada279653..996abe8e47396 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -373,6 +373,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalForTypesWithMemDesc(
           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
       .widenScalarToNextPow2(0, /* MinSize = */ 8)
+      .clampMaxNumElements(0, s8, 16)
+      .clampMaxNumElements(0, s16, 8)
+      .clampMaxNumElements(0, s32, 4)
+      .clampMaxNumElements(0, s64, 2)
+      .clampMaxNumElements(0, p0, 2)
       .lowerIfMemSizeNotByteSizePow2()
       .clampScalar(0, s8, s64)
       .narrowScalarIf(
@@ -383,11 +388,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                    Query.Types[0].getSizeInBits() > 32;
           },
           changeTo(0, s32))
-      .clampMaxNumElements(0, s8, 16)
-      .clampMaxNumElements(0, s16, 8)
-      .clampMaxNumElements(0, s32, 4)
-      .clampMaxNumElements(0, s64, 2)
-      .clampMaxNumElements(0, p0, 2)
       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
       .bitcastIf(typeInSet(0, {v4s8}),
                  [=](const LegalityQuery &Query) {
@@ -600,7 +600,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalIf(ExtLegalFunc)
       .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
       .clampScalar(0, s64, s64) // Just for s128, others are handled above.
-      .moreElementsToNextPow2(1)
+      .moreElementsToNextPow2(0)
       .clampMaxNumElements(1, s8, 8)
       .clampMaxNumElements(1, s16, 4)
       .clampMaxNumElements(1, s32, 2)
@@ -628,7 +628,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
                DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
       })
-
+      .clampMinNumElements(0, s8, 8)
+      .clampMinNumElements(0, s16, 4)
+      .clampMinNumElements(0, s32, 2)
       .alwaysLegal();
 
   getActionDefinitionsBuilder(G_SEXT_INREG)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 51c52aad35949..d8ca5494ba50a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -311,7 +311,7 @@ bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
   LLT ValTy = MRI.getType(Store.getValueReg());
   if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
     return false;
-  if (ValTy.getSizeInBits() != Store.getMemSizeInBits())
+  if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
     return false; // Don't split truncating stores.
   if (!MRI.hasOneNonDBGUse(Store.getValueReg()))
     return false;
@@ -658,7 +658,7 @@ bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
         APInt Offset;
         LLT StoredValTy = MRI.getType(St->getValueReg());
         unsigned ValSize = StoredValTy.getSizeInBits();
-        if (ValSize < 32 || ValSize != St->getMMO().getSizeInBits())
+        if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
           continue;
 
         Register PtrReg = St->getPointerReg();
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index c19e02bb03d1f..1e76d58669da1 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -958,39 +958,39 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm,
 
   // combine_op(x, trunc(cntp(all_active, p))) ==> inst p, x
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv16i1 (SVEAllActive)), (nxv16i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv8i1 (SVEAllActive)), (nxv8i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv4i1 (SVEAllActive)), (nxv4i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv2i1 (SVEAllActive)), (nxv2i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
 
   // combine_op(x, trunc(cntp(p, p))) ==> inst p, x
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv16i1 PPRAny:$pred), (nxv16i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv8i1 PPRAny:$pred), (nxv8i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv4i1 PPRAny:$pred), (nxv4i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv2i1 PPRAny:$pred), (nxv2i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
 }
 
 class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
@@ -1195,19 +1195,19 @@ multiclass sve_int_pred_pattern_a<bits<3> opc, string asm,
               (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>;
 
     def : Pat<(i32 (op GPR32:$Rdn, (i32 (trunc (opcnt (sve_pred_enum:$pattern)))))),
-              (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+              (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF),
                                                GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, 1),
-                                    sub_32))>;
+                                    sub_32)>;
 
     def : Pat<(i32 (op GPR32:$Rdn, (mul (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_mul_imm_i32 i32:$imm)))),
-              (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+              (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF),
                                                GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm),
-                                    sub_32))>;
+                                    sub_32)>;
 
     def : Pat<(i32 (op GPR32:$Rdn, (shl (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_shl_imm i32:$imm)))),
-              (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+              (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF),
                                                GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm),
-                                    sub_32))>;
+                                    sub_32)>;
   }
 }
 
@@ -5160,9 +5160,9 @@ multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt,
             (cmp $Op1, $Op2, $Op3)>;
   def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)),
             (cmp $Op1, $Op3, $Op2)>;
-  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, cc))),
+  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (SVEAllActive)), intvt:$Op2, intvt:$Op3, cc))),
             (cmp $Pg, $Op2, $Op3)>;
-  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, invcc))),
+  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (SVEAllActive)), intvt:$Op2, intvt:$Op3, invcc))),
             (cmp $Pg, $Op3, $Op2)>;
 }
 
@@ -5172,9 +5172,9 @@ multiclass SVE_SETCC_Pat_With_Zero<CondCode cc, CondCode invcc, ValueType predvt
             (cmp $Op1, $Op2)>;
   def : Pat<(predvt (AArch64setcc_z predvt:$Op1, (SVEDup0), intvt:$Op2, invcc)),
             (cmp $Op1, $Op2)>;
-  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)), intvt:$Op1, (SVEDup0), cc))),
+  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (SVEAllActive)), intvt:$Op1, (SVEDup0), cc))),
             (cmp $Pg, $Op1)>;
-  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)), (SVEDup0), intvt:$Op1, invcc))),
+  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (SVEAllActive)), (SVEDup0), intvt:$Op1, invcc))),
             (cmp $Pg, $Op1)>;
 }
 
@@ -5258,13 +5258,13 @@ multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc,
                                     commuted_cc)),
             (cmp $Pg, $Zs1, immtype:$imm)>;
   def : Pat<(predvt (and predvt:$Pg,
-                         (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)),
+                         (AArch64setcc_z_oneuse (predvt (SVEAllActive)),
                                          (intvt ZPR:$Zs1),
                                          (intvt (splat_vector (immtype:$imm))),
                                          cc))),
             (cmp $Pg, $Zs1, immtype:$imm)>;
   def : Pat<(predvt (and predvt:$Pg,
-                         (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)),
+                         (AArch64setcc_z_oneuse (predvt (SVEAllActive)),
                                          (intvt (splat_vector (immtype:$imm))),
                                          (intvt ZPR:$Zs1),
                                          commuted_cc))),
@@ -5743,23 +5743,23 @@ multiclass sve_int_index_ir<string asm, SDPatternOperator mulop, SDPatternOperat
             (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
 
   // mul(step_vector(1), dup(Y)) -> index(0, Y).
-  def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))),
+  def : Pat<(mulop (nxv16i1 (SVEAllActive)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_B") (i32 0), GPR32:$Rm)>;
-  def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),
+  def : Pat<(mulop (nxv8i1 (SVEAllActive)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_H") (i32 0), GPR32:$Rm)>;
-  def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),
+  def : Pat<(mulop (nxv4i1 (SVEAllActive)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_S") (i32 0), GPR32:$Rm)>;
-  def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),
+  def : Pat<(mulop (nxv2i1 (SVEAllActive)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),
             (!cast<Instruction>(NAME # "_D") (i64 0), GPR64:$Rm)>;
 
   // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
-  def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(simm5_8b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv16i1 (SVEAllActive)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(simm5_8b:$imm5)))),
             (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
-  def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (nxv8i16 (splat_vector(simm5_16b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv8i1 (SVEAllActive)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (nxv8i16 (splat_vector(simm5_16b:$imm5)))),
             (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
-  def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (nxv4i32 (splat_vector(simm5_32b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv4i1 (SVEAllActive)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (nxv4i32 (splat_vector(simm5_32b:$imm5)))),
             (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
-  def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (nxv2i64 (splat_vector(simm5_64b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv2i1 (SVEAllActive)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (nxv2i64 (splat_vector(simm5_64b:$imm5)))),
             (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
 }
 
@@ -5837,13 +5837,13 @@ multiclass sve_int_index_rr<string asm, SDPatternOperator mulop> {
             (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
 
   // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
-  def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))),
+  def : Pat<(add (mulop (nxv16i1 (SVEAllActive)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))),
             (!cast<Instruction>(NAME # "_B") GPR32:$Rn, GPR32:$Rm)>;
-  def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),(nxv8i16 (splat_vector(i32 GPR32:$Rn)))),
+  def : Pat<(add (mulop (nxv8i1 (SVEAllActive)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),(nxv8i16 (splat_vector(i32 GPR32:$Rn)))),
             (!cast<Instruction>(NAME # "_H") GPR32:$Rn, GPR32:$Rm)>;
-  def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),(nxv4i32 (splat_vector(i32 GPR32:$Rn)))),
+  def : Pat<(add (mulop (nxv4i1 (SVEAllActive)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),(nxv4i32 (splat_vector(i32 GPR32:$Rn)))),
             (!cast<Instruction>(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>;
-  def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),(nxv2i64 (splat_vector(i64 GPR64:$Rn)))),
+  def : Pat<(add (mulop (nxv2i1 (SVEAllActive)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),(nxv2i64 (splat_vector(i64 GPR64:$Rn)))),
             (!cast<Instruction>(NAME # "_D") GPR64:$Rn, GPR64:$Rm)>;
 }
 
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index ed8336a2e8ad3..f821bb527aedb 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -591,6 +591,14 @@ namespace AArch64BTIHint {
   #include "AArch64GenSystemOperands.inc"
 }
 
+namespace AArch64SME {
+enum ToggleCondition : unsigned {
+  Always,
+  IfCallerIsStreaming,
+  IfCallerIsNonStreaming
+};
+}
+
 namespace AArch64SE {
     enum ShiftExtSpecifiers {
         Invalid = -1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d9970a200804a..72e8b59e0a409 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1033,6 +1033,27 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
   OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
 }
 
+// Helper function to add common PAL Metadata 3.0+
+static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
+                                  const SIProgramInfo &CurrentProgramInfo,
+                                  CallingConv::ID CC, const GCNSubtarget &ST) {
+  if (ST.hasIEEEMode())
+    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
+
+  MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
+  MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
+
+  if (AMDGPU::isCompute(CC)) {
+    MD->setHwStage(CC, ".trap_present",
+                   (bool)CurrentProgramInfo.TrapHandlerEnable);
+    MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
+
+    MD->setHwStage(CC, ".lds_size",
+                   (unsigned)(CurrentProgramInfo.LdsSize *
+                              getLdsDwGranularity(ST) * sizeof(uint32_t)));
+  }
+}
+
 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
 // is AMDPAL.  It stores each compute/SPI register setting and other PAL
 // metadata items into the PALMD::Metadata, combining with any provided by the
@@ -1064,24 +1085,8 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
     }
   } else {
     MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
-    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
-    MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
-    MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
-
-    if (AMDGPU::isCompute(CC)) {
-      MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
-      MD->setHwStage(CC, ".trap_present",
-                     (bool)CurrentProgramInfo.TrapHandlerEnable);
-
-      // EXCPEnMSB?
-      const unsigned LdsDwGranularity = 128;
-      MD->setHwStage(CC, ".lds_size",
-                     (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
-                                sizeof(uint32_t)));
-      MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
-    } else {
-      MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
-    }
+    MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
   }
 
   // ScratchSize is in bytes, 16 aligned.
@@ -1135,10 +1140,15 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
   MD->setFunctionScratchSize(FnName, MFI.getStackSize());
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
-  // Set compute registers
-  MD->setRsrc1(CallingConv::AMDGPU_CS,
-               CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
-  MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
+  if (MD->getPALMajorVersion() < 3) {
+    // Set compute registers
+    MD->setRsrc1(CallingConv::AMDGPU_CS,
+                 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
+    MD->setRsrc2(CallingConv::AMDGPU_CS,
+                 CurrentProgramInfo.getComputePGMRSrc2());
+  } else {
+    EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
+  }
 
   // Set optional info
   MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 6d05c3678bf09..7e1f041fa1093 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1301,6 +1301,9 @@ bool AMDGPUCallLowering::lowerTailCall(
   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
     return false;
 
+  if (Info.ConvergenceCtrlToken) {
+    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
+  }
   handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
                               ImplicitArgRegs);
 
@@ -1483,6 +1486,9 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
+  if (Info.ConvergenceCtrlToken) {
+    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
+  }
   handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
                               ImplicitArgRegs);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 0edbbf7cb0af5..bddf3d958a1ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1749,7 +1749,7 @@ static bool isInterestingPHIIncomingValue(const Value *V) {
     // Non constant index/out of bounds index -> folding is unlikely.
     // The latter is more of a sanity check because canonical IR should just
     // have replaced those with poison.
-    if (!Idx || Idx->getSExtValue() >= FVT->getNumElements())
+    if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
       return false;
 
     const auto *VecSrc = IE->getOperand(0);
@@ -1761,7 +1761,7 @@ static bool isInterestingPHIIncomingValue(const Value *V) {
       return false;
 
     CurVal = VecSrc;
-    EltsCovered.set(Idx->getSExtValue());
+    EltsCovered.set(Idx->getZExtValue());
 
     // All elements covered.
     if (EltsCovered.all())
@@ -1987,7 +1987,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
   for (VectorSlice &S : Slices) {
     // We need to reset the build on each iteration, because getSlicedVal may
     // have inserted something into I's BB.
-    B.SetInsertPoint(I.getParent()->getFirstNonPHI());
+    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
     S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
 
     for (const auto &[Idx, BB] : enumerate(I.blocks())) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c99e490014668..bba7682cd7a0d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3556,7 +3556,10 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
   if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
     return false;
 
-  return Ld->getAlign() >= Align(std::min(MMO->getSize(), uint64_t(4))) &&
+  return MMO->getSize().hasValue() &&
+         Ld->getAlign() >=
+             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
+                            uint64_t(4))) &&
          ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
            Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
           (Subtarget->getScalarizeGlobalBehavior() &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8a71550e5532c..bee43b6c18c88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -582,6 +582,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setMaxAtomicSizeInBitsSupported(64);
   setMaxDivRemBitWidthSupported(64);
+  setMaxLargeFPConvertBitWidthSupported(64);
 }
 
 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index fb829fab0a2c1..5b7fa13f2e835 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -767,19 +767,21 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     // Checking for NaN before canonicalization provides better fidelity when
     // mapping other operations onto fmed3 since the order of operands is
     // unchanged.
-    CallInst *NewCall = nullptr;
+    Value *V = nullptr;
     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
-      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
+      V = IC.Builder.CreateMinNum(Src1, Src2);
     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
-      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
+      V = IC.Builder.CreateMinNum(Src0, Src2);
     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
-      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
+      V = IC.Builder.CreateMaxNum(Src0, Src1);
     }
 
-    if (NewCall) {
-      NewCall->copyFastMathFlags(&II);
-      NewCall->takeName(&II);
-      return IC.replaceInstUsesWith(II, NewCall);
+    if (V) {
+      if (auto *CI = dyn_cast<CallInst>(V)) {
+        CI->copyFastMathFlags(&II);
+        CI->takeName(&II);
+      }
+      return IC.replaceInstUsesWith(II, V);
     }
 
     bool Swap = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0029c51231f28..90872516dd6db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5661,7 +5661,7 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
   Register RSrc = MI.getOperand(2).getReg();
 
   MachineMemOperand *MMO = *MI.memoperands_begin();
-  const int MemSize = MMO->getSize();
+  const int MemSize = MMO->getSize().getValue();
 
   unsigned ImmOffset;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 2da896edc79a7..84b4ccc1ae7ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -470,9 +470,11 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
     nf.setId(AMDGPULibFunc::EI_COS);
     FunctionCallee cosExpr = getFunction(M, nf);
     if (sinExpr && cosExpr) {
-      Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
-      Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
-      new StoreInst(cosval, aCI->getArgOperand(1), aCI);
+      Value *sinval =
+          CallInst::Create(sinExpr, opr0, "splitsin", aCI->getIterator());
+      Value *cosval =
+          CallInst::Create(cosExpr, opr0, "splitcos", aCI->getIterator());
+      new StoreInst(cosval, aCI->getArgOperand(1), aCI->getIterator());
 
       DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                           << " with native version of sin/cos");
@@ -1655,7 +1657,7 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
     // sincos
     assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
            "math function with ptr arg not supported yet");
-    new StoreInst(nval1, aCI->getArgOperand(1), aCI);
+    new StoreInst(nval1, aCI->getArgOperand(1), aCI->getIterator());
   }
 
   replaceCall(aCI, nval0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 7b5dc3795b022..aef0ade6d9be6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -290,8 +290,8 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
     Value *sumC = ConstantInt::get(SizetTy, Sum, false);
     SmallVector<Value *, 1> alloc_args;
     alloc_args.push_back(sumC);
-    CallInst *pcall =
-        CallInst::Create(PrintfAllocFn, alloc_args, "printf_alloc_fn", CI);
+    CallInst *pcall = CallInst::Create(PrintfAllocFn, alloc_args,
+                                       "printf_alloc_fn", CI->getIterator());
 
     //
     // Insert code to split basicblock with a
@@ -309,25 +309,27 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
     SplitBlock(CI->getParent(), cmp);
     Instruction *Brnch =
         SplitBlockAndInsertIfThen(cmp, cmp->getNextNode(), false);
+    BasicBlock::iterator BrnchPoint = Brnch->getIterator();
 
     Builder.SetInsertPoint(Brnch);
 
     // store unique printf id in the buffer
     //
     GetElementPtrInst *BufferIdx = GetElementPtrInst::Create(
-        I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID", Brnch);
+        I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID",
+        BrnchPoint);
 
     Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS);
     Value *id_gep_cast =
-        new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch);
+        new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", BrnchPoint);
 
-    new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch);
+    new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, BrnchPoint);
 
     // 1st 4 bytes hold the printf_id
     // the following GEP is the buffer pointer
     BufferIdx = GetElementPtrInst::Create(I8Ty, pcall,
                                           ConstantInt::get(Ctx, APInt(32, 4)),
-                                          "PrintBuffGep", Brnch);
+                                          "PrintBuffGep", BrnchPoint);
 
     Type *Int32Ty = Type::getInt32Ty(Ctx);
     for (unsigned ArgCount = 1;
@@ -405,7 +407,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
       for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) {
         Value *TheBtCast = WhatToStore[I];
         unsigned ArgSize = TD->getTypeAllocSize(TheBtCast->getType());
-        StoreInst *StBuff = new StoreInst(TheBtCast, BufferIdx, Brnch);
+        StoreInst *StBuff = new StoreInst(TheBtCast, BufferIdx, BrnchPoint);
         LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n"
                           << *StBuff << '\n');
         (void)StBuff;
@@ -413,7 +415,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
           break;
         BufferIdx = GetElementPtrInst::Create(
             I8Ty, BufferIdx, {ConstantInt::get(I32Ty, ArgSize)},
-            "PrintBuffNextPtr", Brnch);
+            "PrintBuffNextPtr", BrnchPoint);
         LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
                           << *BufferIdx << '\n');
       }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index b1b15e9915aea..6f3cdf54dceec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
@@ -39,6 +40,7 @@
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -64,10 +66,17 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
     cl::desc("Maximum byte size to consider promote alloca to vector"),
     cl::init(0));
 
+static cl::opt<unsigned>
+    LoopUserWeight("promote-alloca-vector-loop-user-weight",
+                   cl::desc("The bonus weight of users of allocas within loop "
+                            "when sorting profitable allocas"),
+                   cl::init(4));
+
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
 private:
   const TargetMachine &TM;
+  LoopInfo &LI;
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
 
@@ -101,8 +110,11 @@ class AMDGPUPromoteAllocaImpl {
   bool tryPromoteAllocaToVector(AllocaInst &I);
   bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
 
+  void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
+
 public:
-  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
+  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
+
     const Triple &TT = TM.getTargetTriple();
     IsAMDGCN = TT.getArch() == Triple::amdgcn;
     IsAMDHSA = TT.getOS() == Triple::AMDHSA;
@@ -122,7 +134,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
     if (skipFunction(F))
       return false;
     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
-      return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+      return AMDGPUPromoteAllocaImpl(
+                 TPC->getTM<TargetMachine>(),
+                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
           .run(F, /*PromoteToLDS*/ true);
     return false;
   }
@@ -131,6 +145,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -145,7 +160,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
     if (skipFunction(F))
       return false;
     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
-      return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+      return AMDGPUPromoteAllocaImpl(
+                 TPC->getTM<TargetMachine>(),
+                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
           .run(F, /*PromoteToLDS*/ false);
     return false;
   }
@@ -156,6 +173,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -186,18 +204,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
 // Move LDS uses from functions to kernels before promote alloca for accurate
 // estimation of LDS available
 INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                     "AMDGPU promote alloca to vector or LDS", false, false)
 
-INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
-                "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                      "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                    "AMDGPU promote alloca to vector", false, false)
 
 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
 char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
 
 PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
-  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -208,7 +231,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
 
 PreservedAnalyses
 AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
-  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -225,6 +249,55 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
   return new AMDGPUPromoteAllocaToVector();
 }
 
+static void collectAllocaUses(AllocaInst &Alloca,
+                              SmallVectorImpl<Use *> &Uses) {
+  SmallVector<Instruction *, 4> WorkList({&Alloca});
+  while (!WorkList.empty()) {
+    auto *Cur = WorkList.pop_back_val();
+    for (auto &U : Cur->uses()) {
+      Uses.push_back(&U);
+
+      if (isa<GetElementPtrInst>(U.getUser()))
+        WorkList.push_back(cast<Instruction>(U.getUser()));
+    }
+  }
+}
+
+void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
+    SmallVectorImpl<AllocaInst *> &Allocas) {
+  DenseMap<AllocaInst *, unsigned> Scores;
+
+  for (auto *Alloca : Allocas) {
+    LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
+    unsigned &Score = Scores[Alloca];
+    // Increment score by one for each user + a bonus for users within loops.
+    SmallVector<Use *, 8> Uses;
+    collectAllocaUses(*Alloca, Uses);
+    for (auto *U : Uses) {
+      Instruction *Inst = cast<Instruction>(U->getUser());
+      if (isa<GetElementPtrInst>(Inst))
+        continue;
+      unsigned UserScore =
+          1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
+      LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
+      Score += UserScore;
+    }
+    LLVM_DEBUG(dbgs() << "  => Final Score:" << Score << "\n");
+  }
+
+  stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
+    return Scores.at(A) > Scores.at(B);
+  });
+
+  // clang-format off
+  LLVM_DEBUG(
+    dbgs() << "Sorted Worklist:\n";
+    for (auto *A: Allocas)
+      dbgs() << "  " << *A << "\n";
+  );
+  // clang-format on
+}
+
 bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   Mod = F.getParent();
   DL = &Mod->getDataLayout();
@@ -237,6 +310,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
 
   bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
 
+  // Use up to 1/4 of available register budget for vectorization.
+  // FIXME: Increase the limit for whole function budgets? Perhaps x2?
+  unsigned VectorizationBudget =
+      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+                                  : (MaxVGPRs * 32)) /
+      4;
+
   SmallVector<AllocaInst *, 16> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
@@ -248,11 +328,27 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     }
   }
 
+  sortAllocasToPromote(Allocas);
+
   bool Changed = false;
   for (AllocaInst *AI : Allocas) {
-    if (tryPromoteAllocaToVector(*AI))
+    const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
+    if (AllocaCost > VectorizationBudget) {
+      LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization: " << *AI
+                        << "\n");
+      return false;
+    }
+
+    if (tryPromoteAllocaToVector(*AI)) {
       Changed = true;
-    else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+      assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
+             "Underflow!");
+      VectorizationBudget -= AllocaCost;
+      LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
+                        << VectorizationBudget << "\n");
+      if (VectorizationBudget == 0)
+        break;
+    } else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
       Changed = true;
   }
 
@@ -641,16 +737,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
                                       ArrayTy->getNumElements());
   }
 
-  // Use up to 1/4 of available register budget for vectorization.
-  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
-                                              : (MaxVGPRs * 32);
-
-  if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
-    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with " << MaxVGPRs
-                      << " registers available\n");
-    return false;
-  }
-
   // FIXME: There is no reason why we can't support larger arrays, we
   // are just being conservative for now.
   // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
@@ -671,7 +757,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
-  SmallVector<Use *, 8> Uses;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
 
   const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
@@ -680,15 +765,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     return false;
   };
 
-  for (Use &U : Alloca.uses())
-    Uses.push_back(&U);
+  SmallVector<Use *, 8> Uses;
+  collectAllocaUses(Alloca, Uses);
 
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
   unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
-  while (!Uses.empty()) {
-    Use *U = Uses.pop_back_val();
+  for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
     if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
@@ -724,14 +808,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       continue;
     }
 
-    if (isa<BitCastInst>(Inst)) {
-      // Look through bitcasts.
-      for (Use &U : Inst->uses())
-        Uses.push_back(&U);
-      UsersToRemove.push_back(Inst);
-      continue;
-    }
-
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
@@ -740,8 +816,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
       GEPVectorIdx[GEP] = Index;
-      for (Use &U : Inst->uses())
-        Uses.push_back(&U);
       UsersToRemove.push_back(Inst);
       continue;
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b174d57bd5765..0037825ce0893 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -449,7 +449,7 @@ bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
   const unsigned AS = MMO->getAddrSpace();
   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
-  const unsigned MemSize = 8 * MMO->getSize();
+  const unsigned MemSize = 8 * MMO->getSize().getValue();
 
   // Require 4-byte alignment.
   return (MMO->getAlign() >= Align(4) ||
@@ -1070,7 +1070,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
       return false;
 
     MachineMemOperand *MMO = *MI.memoperands_begin();
-    const unsigned MemSize = 8 * MMO->getSize();
+    const unsigned MemSize = 8 * MMO->getSize().getValue();
     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
     // scalar loads should have a load size of 32 but memory access size of less
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 027dd0f2c2244..529705479646f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDKernelCodeT.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "SIDefines.h"
@@ -1816,6 +1817,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
 
 public:
   void onBeginOfFile() override;
+  bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
 
   ParseStatus parseCustomOperand(OperandVector &Operands, unsigned MCK);
 
@@ -8330,6 +8332,59 @@ void AMDGPUAsmParser::onBeginOfFile() {
     getTargetStreamer().EmitDirectiveAMDGCNTarget();
 }
 
+/// Parse AMDGPU specific expressions.
+///
+///  expr ::= or(expr, ...) |
+///           max(expr, ...)
+///
+bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+  using AGVK = AMDGPUVariadicMCExpr::VariadicKind;
+
+  if (isToken(AsmToken::Identifier)) {
+    StringRef TokenId = getTokenStr();
+    AGVK VK = StringSwitch<AGVK>(TokenId)
+                  .Case("max", AGVK::AGVK_Max)
+                  .Case("or", AGVK::AGVK_Or)
+                  .Default(AGVK::AGVK_None);
+
+    if (VK != AGVK::AGVK_None && peekToken().is(AsmToken::LParen)) {
+      SmallVector<const MCExpr *, 4> Exprs;
+      uint64_t CommaCount = 0;
+      lex(); // Eat 'max'/'or'
+      lex(); // Eat '('
+      while (true) {
+        if (trySkipToken(AsmToken::RParen)) {
+          if (Exprs.empty()) {
+            Error(getToken().getLoc(),
+                  "empty " + Twine(TokenId) + " expression");
+            return true;
+          }
+          if (CommaCount + 1 != Exprs.size()) {
+            Error(getToken().getLoc(),
+                  "mismatch of commas in " + Twine(TokenId) + " expression");
+            return true;
+          }
+          Res = AMDGPUVariadicMCExpr::create(VK, Exprs, getContext());
+          return false;
+        }
+        const MCExpr *Expr;
+        if (getParser().parseExpression(Expr, EndLoc))
+          return true;
+        Exprs.push_back(Expr);
+        bool LastTokenWasComma = trySkipToken(AsmToken::Comma);
+        if (LastTokenWasComma)
+          CommaCount++;
+        if (!LastTokenWasComma && !isToken(AsmToken::RParen)) {
+          Error(getToken().getLoc(),
+                "unexpected token in " + Twine(TokenId) + " expression");
+          return true;
+        }
+      }
+    }
+  }
+  return getParser().parsePrimaryExpr(Res, EndLoc, nullptr);
+}
+
 ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) {
   StringRef Name = getTokenStr();
   if (Name == "mul") {
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 87ace01a6d0e5..e944dde159904 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1735,14 +1735,12 @@ def DS_WRITE_B128_vi      : DS_Real_vi<0xdf, DS_WRITE_B128>;
 def DS_READ_B96_vi        : DS_Real_vi<0xfe, DS_READ_B96>;
 def DS_READ_B128_vi       : DS_Real_vi<0xff, DS_READ_B128>;
 
-let SubtargetPredicate = isGFX90APlus in {
-  def DS_ADD_F64_vi     : DS_Real_vi<0x5c, DS_ADD_F64>;
-  def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
-} // End SubtargetPredicate = isGFX90APlus
-
-let SubtargetPredicate = isGFX940Plus in {
-  def DS_PK_ADD_F16_vi     : DS_Real_vi<0x17, DS_PK_ADD_F16>;
-  def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
-  def DS_PK_ADD_BF16_vi     : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
-  def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
-} // End SubtargetPredicate = isGFX940Plus
+// GFX90A+.
+def DS_ADD_F64_vi     : DS_Real_vi<0x5c, DS_ADD_F64>;
+def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
+
+// GFX940+.
+def DS_PK_ADD_F16_vi     : DS_Real_vi<0x17, DS_PK_ADD_F16>;
+def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
+def DS_PK_ADD_BF16_vi     : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
+def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f42d4ae416bd7..3c0a97e3d050c 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -115,8 +115,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   bits<1> lds = ps.lds; // LDS DMA for global and scratch
 
   // Segment, 00=flat, 01=scratch, 10=global, 11=reserved
-  bits<2> seg = !if(ps.is_flat_global, 0b10,
-                  !if(ps.is_flat_scratch, 0b01, 0));
+  bits<2> seg = {ps.is_flat_global, ps.is_flat_scratch};
 
   // Signed offset. Highest bit ignored for flat and treated as 12-bit
   // unsigned for flat accesses.
@@ -174,7 +173,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   bits<8> vaddr;
   bits<24> offset;
 
-  let Inst{6-0} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0);
+  let Inst{6-0} = !if(ps.enabled_saddr, saddr, SGPR_NULL_gfx11plus.Index);
   let Inst{21-14} = op;
   let Inst{31-26} = 0x3b;
   let Inst{39-32} = !if(ps.has_vdst, vdst, ?);
@@ -240,7 +239,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
 }
 
 multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
-  let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
+  let is_flat_global = 1 in {
     def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
       GlobalSaddrTable<0, opName>;
     def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
@@ -277,7 +276,7 @@ multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass regClass,
 }
 
 multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
-  let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
+  let is_flat_global = 1 in {
     def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
       GlobalSaddrTable<0, opName>;
     def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
@@ -390,6 +389,7 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
      !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in),
                         (ins CPol_0:$cpol))),
   " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
+  let is_flat_scratch = 1;
   let has_data = 0;
   let mayLoad = 1;
   let has_saddr = 1;
@@ -417,6 +417,7 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
         (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
         (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol)))),
   " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
+  let is_flat_scratch = 1;
   let mayLoad  = 0;
   let mayStore = 1;
   let has_vdst = 0;
@@ -429,37 +430,33 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
 }
 
 multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedOutput = 0> {
-  let is_flat_scratch = 1 in {
-    def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
-             FlatScratchInst<opName, "SV">;
-    def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
-                 FlatScratchInst<opName, "SS">;
-
-    let SubtargetPredicate = HasFlatScratchSVSMode in
-    def _SVS : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1, 1>,
-               FlatScratchInst<opName, "SVS">;
+  def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
+           FlatScratchInst<opName, "SV">;
+  def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
+               FlatScratchInst<opName, "SS">;
 
-    let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST  : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 0, 0, 0>,
-               FlatScratchInst<opName, "ST">;
-  }
+  let SubtargetPredicate = HasFlatScratchSVSMode in
+  def _SVS : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1, 1>,
+             FlatScratchInst<opName, "SVS">;
+
+  let SubtargetPredicate = HasFlatScratchSTMode in
+  def _ST  : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 0, 0, 0>,
+             FlatScratchInst<opName, "ST">;
 }
 
 multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
-  let is_flat_scratch = 1 in {
-    def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>,
-             FlatScratchInst<opName, "SV">;
-    def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
-                 FlatScratchInst<opName, "SS">;
-
-    let SubtargetPredicate = HasFlatScratchSVSMode in
-    def _SVS : FLAT_Scratch_Store_Pseudo<opName, regClass, 1, 1>,
-               FlatScratchInst<opName, "SVS">;
+  def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>,
+           FlatScratchInst<opName, "SV">;
+  def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
+               FlatScratchInst<opName, "SS">;
 
-    let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST  : FLAT_Scratch_Store_Pseudo<opName, regClass, 0, 0, 0>,
-               FlatScratchInst<opName, "ST">;
-  }
+  let SubtargetPredicate = HasFlatScratchSVSMode in
+  def _SVS : FLAT_Scratch_Store_Pseudo<opName, regClass, 1, 1>,
+             FlatScratchInst<opName, "SVS">;
+
+  let SubtargetPredicate = HasFlatScratchSTMode in
+  def _ST  : FLAT_Scratch_Store_Pseudo<opName, regClass, 0, 0, 0>,
+             FlatScratchInst<opName, "ST">;
 }
 
 class FLAT_Scratch_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0,
@@ -584,25 +581,27 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
   RegisterClass data_rc = vdst_rc,
   RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
 
-  def "" : FLAT_AtomicNoRet_Pseudo <opName,
-    (outs),
-    (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
-    " $vaddr, $vdata, off$offset$cpol">,
-    GlobalSaddrTable<0, opName> {
-    let has_saddr = 1;
-    let PseudoInstr = NAME;
-    let FPAtomic = data_vt.isFP;
-  }
+  let is_flat_global = 1 in {
+    def "" : FLAT_AtomicNoRet_Pseudo <opName,
+      (outs),
+      (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
+      " $vaddr, $vdata, off$offset$cpol">,
+      GlobalSaddrTable<0, opName> {
+      let has_saddr = 1;
+      let PseudoInstr = NAME;
+      let FPAtomic = data_vt.isFP;
+    }
 
-  def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
-    (outs),
-    (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
-    " $vaddr, $vdata, $saddr$offset$cpol">,
-    GlobalSaddrTable<1, opName> {
-    let has_saddr = 1;
-    let enabled_saddr = 1;
-    let PseudoInstr = NAME#"_SADDR";
-    let FPAtomic = data_vt.isFP;
+    def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
+      (outs),
+      (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
+      " $vaddr, $vdata, $saddr$offset$cpol">,
+      GlobalSaddrTable<1, opName> {
+      let has_saddr = 1;
+      let enabled_saddr = 1;
+      let PseudoInstr = NAME#"_SADDR";
+      let FPAtomic = data_vt.isFP;
+    }
   }
 }
 
@@ -615,24 +614,26 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
   RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
   RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
 
-  def _RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_op:$vdst),
-      (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
-    " $vdst, $vaddr, $vdata, off$offset$cpol">,
-    GlobalSaddrTable<0, opName#"_rtn"> {
-    let has_saddr = 1;
-    let FPAtomic = data_vt.isFP;
-  }
+  let is_flat_global = 1 in {
+    def _RTN : FLAT_AtomicRet_Pseudo <opName,
+      (outs vdst_op:$vdst),
+        (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+      " $vdst, $vaddr, $vdata, off$offset$cpol">,
+      GlobalSaddrTable<0, opName#"_rtn"> {
+      let has_saddr = 1;
+      let FPAtomic = data_vt.isFP;
+    }
 
-  def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_op:$vdst),
-      (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
-    " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
-    GlobalSaddrTable<1, opName#"_rtn"> {
-     let has_saddr = 1;
-     let enabled_saddr = 1;
-     let PseudoInstr = NAME#"_SADDR_RTN";
-     let FPAtomic = data_vt.isFP;
+    def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
+      (outs vdst_op:$vdst),
+        (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+      " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
+      GlobalSaddrTable<1, opName#"_rtn"> {
+       let has_saddr = 1;
+       let enabled_saddr = 1;
+       let PseudoInstr = NAME#"_SADDR_RTN";
+       let FPAtomic = data_vt.isFP;
+    }
   }
 }
 
@@ -642,10 +643,8 @@ multiclass FLAT_Global_Atomic_Pseudo<
   ValueType vt,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc> {
-  let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
-    defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
-    defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
-  }
+  defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
+  defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -849,7 +848,6 @@ defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_
 defm GLOBAL_STORE_BYTE_D16_HI  : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
 defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
 
-let is_flat_global = 1 in {
 defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap",
                                VGPR_32, i32, v2i32, VReg_64>;
 
@@ -948,9 +946,6 @@ let SubtargetPredicate = isGFX12Plus in {
   def GLOBAL_WBINV  : FLAT_Global_Invalidate_Writeback<"global_wbinv">;
 } // End SubtargetPredicate = isGFX12Plus
 
-} // End is_flat_global = 1
-
-let SubtargetPredicate = HasFlatScratchInsts in {
 defm SCRATCH_LOAD_UBYTE    : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>;
 defm SCRATCH_LOAD_SBYTE    : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte", VGPR_32>;
 defm SCRATCH_LOAD_USHORT   : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort", VGPR_32>;
@@ -985,8 +980,6 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u
 defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">;
 defm SCRATCH_LOAD_LDS_DWORD  : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
 
-} // End SubtargetPredicate = HasFlatScratchInsts
-
 let SubtargetPredicate = isGFX12Plus in {
   let WaveSizePredicate = isWave32 in {
     defm GLOBAL_LOAD_TR_B128_w32  : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
@@ -998,7 +991,7 @@ let SubtargetPredicate = isGFX12Plus in {
   }
 } // End SubtargetPredicate = isGFX12Plus
 
-let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
+let SubtargetPredicate = isGFX10Plus in {
   defm GLOBAL_ATOMIC_FCMPSWAP :
     FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
   defm GLOBAL_ATOMIC_FMIN :
@@ -1011,9 +1004,8 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
     FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
   defm GLOBAL_ATOMIC_FMAX_X2 :
     FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
-} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
+} // End SubtargetPredicate = isGFX10Plus
 
-let is_flat_global = 1 in {
 let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
   defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
     "global_atomic_add_f32", VGPR_32, f32
@@ -1030,7 +1022,6 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
   defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
     "global_atomic_pk_add_f16", VGPR_32, v2f16
   >;
-} // End is_flat_global = 1
 
 //===----------------------------------------------------------------------===//
 // Flat Patterns
@@ -1787,45 +1778,46 @@ def FLAT_STORE_DWORDX2_ci      : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>;
 def FLAT_STORE_DWORDX4_ci      : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>;
 def FLAT_STORE_DWORDX3_ci      : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>;
 
-multiclass FLAT_Real_Atomics_ci <bits<7> op, FLAT_Pseudo ps> {
+multiclass FLAT_Real_Atomics_ci <bits<7> op> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
   def _ci     : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
   def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
 }
 
-defm FLAT_ATOMIC_SWAP          : FLAT_Real_Atomics_ci <0x30, FLAT_ATOMIC_SWAP>;
-defm FLAT_ATOMIC_CMPSWAP       : FLAT_Real_Atomics_ci <0x31, FLAT_ATOMIC_CMPSWAP>;
-defm FLAT_ATOMIC_ADD           : FLAT_Real_Atomics_ci <0x32, FLAT_ATOMIC_ADD>;
-defm FLAT_ATOMIC_SUB           : FLAT_Real_Atomics_ci <0x33, FLAT_ATOMIC_SUB>;
-defm FLAT_ATOMIC_SMIN          : FLAT_Real_Atomics_ci <0x35, FLAT_ATOMIC_SMIN>;
-defm FLAT_ATOMIC_UMIN          : FLAT_Real_Atomics_ci <0x36, FLAT_ATOMIC_UMIN>;
-defm FLAT_ATOMIC_SMAX          : FLAT_Real_Atomics_ci <0x37, FLAT_ATOMIC_SMAX>;
-defm FLAT_ATOMIC_UMAX          : FLAT_Real_Atomics_ci <0x38, FLAT_ATOMIC_UMAX>;
-defm FLAT_ATOMIC_AND           : FLAT_Real_Atomics_ci <0x39, FLAT_ATOMIC_AND>;
-defm FLAT_ATOMIC_OR            : FLAT_Real_Atomics_ci <0x3a, FLAT_ATOMIC_OR>;
-defm FLAT_ATOMIC_XOR           : FLAT_Real_Atomics_ci <0x3b, FLAT_ATOMIC_XOR>;
-defm FLAT_ATOMIC_INC           : FLAT_Real_Atomics_ci <0x3c, FLAT_ATOMIC_INC>;
-defm FLAT_ATOMIC_DEC           : FLAT_Real_Atomics_ci <0x3d, FLAT_ATOMIC_DEC>;
-defm FLAT_ATOMIC_SWAP_X2       : FLAT_Real_Atomics_ci <0x50, FLAT_ATOMIC_SWAP_X2>;
-defm FLAT_ATOMIC_CMPSWAP_X2    : FLAT_Real_Atomics_ci <0x51, FLAT_ATOMIC_CMPSWAP_X2>;
-defm FLAT_ATOMIC_ADD_X2        : FLAT_Real_Atomics_ci <0x52, FLAT_ATOMIC_ADD_X2>;
-defm FLAT_ATOMIC_SUB_X2        : FLAT_Real_Atomics_ci <0x53, FLAT_ATOMIC_SUB_X2>;
-defm FLAT_ATOMIC_SMIN_X2       : FLAT_Real_Atomics_ci <0x55, FLAT_ATOMIC_SMIN_X2>;
-defm FLAT_ATOMIC_UMIN_X2       : FLAT_Real_Atomics_ci <0x56, FLAT_ATOMIC_UMIN_X2>;
-defm FLAT_ATOMIC_SMAX_X2       : FLAT_Real_Atomics_ci <0x57, FLAT_ATOMIC_SMAX_X2>;
-defm FLAT_ATOMIC_UMAX_X2       : FLAT_Real_Atomics_ci <0x58, FLAT_ATOMIC_UMAX_X2>;
-defm FLAT_ATOMIC_AND_X2        : FLAT_Real_Atomics_ci <0x59, FLAT_ATOMIC_AND_X2>;
-defm FLAT_ATOMIC_OR_X2         : FLAT_Real_Atomics_ci <0x5a, FLAT_ATOMIC_OR_X2>;
-defm FLAT_ATOMIC_XOR_X2        : FLAT_Real_Atomics_ci <0x5b, FLAT_ATOMIC_XOR_X2>;
-defm FLAT_ATOMIC_INC_X2        : FLAT_Real_Atomics_ci <0x5c, FLAT_ATOMIC_INC_X2>;
-defm FLAT_ATOMIC_DEC_X2        : FLAT_Real_Atomics_ci <0x5d, FLAT_ATOMIC_DEC_X2>;
+defm FLAT_ATOMIC_SWAP          : FLAT_Real_Atomics_ci <0x30>;
+defm FLAT_ATOMIC_CMPSWAP       : FLAT_Real_Atomics_ci <0x31>;
+defm FLAT_ATOMIC_ADD           : FLAT_Real_Atomics_ci <0x32>;
+defm FLAT_ATOMIC_SUB           : FLAT_Real_Atomics_ci <0x33>;
+defm FLAT_ATOMIC_SMIN          : FLAT_Real_Atomics_ci <0x35>;
+defm FLAT_ATOMIC_UMIN          : FLAT_Real_Atomics_ci <0x36>;
+defm FLAT_ATOMIC_SMAX          : FLAT_Real_Atomics_ci <0x37>;
+defm FLAT_ATOMIC_UMAX          : FLAT_Real_Atomics_ci <0x38>;
+defm FLAT_ATOMIC_AND           : FLAT_Real_Atomics_ci <0x39>;
+defm FLAT_ATOMIC_OR            : FLAT_Real_Atomics_ci <0x3a>;
+defm FLAT_ATOMIC_XOR           : FLAT_Real_Atomics_ci <0x3b>;
+defm FLAT_ATOMIC_INC           : FLAT_Real_Atomics_ci <0x3c>;
+defm FLAT_ATOMIC_DEC           : FLAT_Real_Atomics_ci <0x3d>;
+defm FLAT_ATOMIC_SWAP_X2       : FLAT_Real_Atomics_ci <0x50>;
+defm FLAT_ATOMIC_CMPSWAP_X2    : FLAT_Real_Atomics_ci <0x51>;
+defm FLAT_ATOMIC_ADD_X2        : FLAT_Real_Atomics_ci <0x52>;
+defm FLAT_ATOMIC_SUB_X2        : FLAT_Real_Atomics_ci <0x53>;
+defm FLAT_ATOMIC_SMIN_X2       : FLAT_Real_Atomics_ci <0x55>;
+defm FLAT_ATOMIC_UMIN_X2       : FLAT_Real_Atomics_ci <0x56>;
+defm FLAT_ATOMIC_SMAX_X2       : FLAT_Real_Atomics_ci <0x57>;
+defm FLAT_ATOMIC_UMAX_X2       : FLAT_Real_Atomics_ci <0x58>;
+defm FLAT_ATOMIC_AND_X2        : FLAT_Real_Atomics_ci <0x59>;
+defm FLAT_ATOMIC_OR_X2         : FLAT_Real_Atomics_ci <0x5a>;
+defm FLAT_ATOMIC_XOR_X2        : FLAT_Real_Atomics_ci <0x5b>;
+defm FLAT_ATOMIC_INC_X2        : FLAT_Real_Atomics_ci <0x5c>;
+defm FLAT_ATOMIC_DEC_X2        : FLAT_Real_Atomics_ci <0x5d>;
 
 // CI Only flat instructions
-defm FLAT_ATOMIC_FCMPSWAP      : FLAT_Real_Atomics_ci <0x3e, FLAT_ATOMIC_FCMPSWAP>;
-defm FLAT_ATOMIC_FMIN          : FLAT_Real_Atomics_ci <0x3f, FLAT_ATOMIC_FMIN>;
-defm FLAT_ATOMIC_FMAX          : FLAT_Real_Atomics_ci <0x40, FLAT_ATOMIC_FMAX>;
-defm FLAT_ATOMIC_FCMPSWAP_X2   : FLAT_Real_Atomics_ci <0x5e, FLAT_ATOMIC_FCMPSWAP_X2>;
-defm FLAT_ATOMIC_FMIN_X2       : FLAT_Real_Atomics_ci <0x5f, FLAT_ATOMIC_FMIN_X2>;
-defm FLAT_ATOMIC_FMAX_X2       : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2>;
+defm FLAT_ATOMIC_FCMPSWAP      : FLAT_Real_Atomics_ci <0x3e>;
+defm FLAT_ATOMIC_FMIN          : FLAT_Real_Atomics_ci <0x3f>;
+defm FLAT_ATOMIC_FMAX          : FLAT_Real_Atomics_ci <0x40>;
+defm FLAT_ATOMIC_FCMPSWAP_X2   : FLAT_Real_Atomics_ci <0x5e>;
+defm FLAT_ATOMIC_FMIN_X2       : FLAT_Real_Atomics_ci <0x5f>;
+defm FLAT_ATOMIC_FMAX_X2       : FLAT_Real_Atomics_ci <0x60>;
 
 
 //===----------------------------------------------------------------------===//
@@ -1925,8 +1917,9 @@ def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
 def FLAT_LOAD_SHORT_D16_vi    : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
 def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
 
-multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps,
+multiclass FLAT_Real_Atomics_vi <bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
   def _vi     : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
   def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
 }
@@ -1939,32 +1932,32 @@ multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
 }
 
 
-defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40, FLAT_ATOMIC_SWAP>;
-defm FLAT_ATOMIC_CMPSWAP    : FLAT_Real_Atomics_vi <0x41, FLAT_ATOMIC_CMPSWAP>;
-defm FLAT_ATOMIC_ADD        : FLAT_Real_Atomics_vi <0x42, FLAT_ATOMIC_ADD>;
-defm FLAT_ATOMIC_SUB        : FLAT_Real_Atomics_vi <0x43, FLAT_ATOMIC_SUB>;
-defm FLAT_ATOMIC_SMIN       : FLAT_Real_Atomics_vi <0x44, FLAT_ATOMIC_SMIN>;
-defm FLAT_ATOMIC_UMIN       : FLAT_Real_Atomics_vi <0x45, FLAT_ATOMIC_UMIN>;
-defm FLAT_ATOMIC_SMAX       : FLAT_Real_Atomics_vi <0x46, FLAT_ATOMIC_SMAX>;
-defm FLAT_ATOMIC_UMAX       : FLAT_Real_Atomics_vi <0x47, FLAT_ATOMIC_UMAX>;
-defm FLAT_ATOMIC_AND        : FLAT_Real_Atomics_vi <0x48, FLAT_ATOMIC_AND>;
-defm FLAT_ATOMIC_OR         : FLAT_Real_Atomics_vi <0x49, FLAT_ATOMIC_OR>;
-defm FLAT_ATOMIC_XOR        : FLAT_Real_Atomics_vi <0x4a, FLAT_ATOMIC_XOR>;
-defm FLAT_ATOMIC_INC        : FLAT_Real_Atomics_vi <0x4b, FLAT_ATOMIC_INC>;
-defm FLAT_ATOMIC_DEC        : FLAT_Real_Atomics_vi <0x4c, FLAT_ATOMIC_DEC>;
-defm FLAT_ATOMIC_SWAP_X2    : FLAT_Real_Atomics_vi <0x60, FLAT_ATOMIC_SWAP_X2>;
-defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_vi <0x61, FLAT_ATOMIC_CMPSWAP_X2>;
-defm FLAT_ATOMIC_ADD_X2     : FLAT_Real_Atomics_vi <0x62, FLAT_ATOMIC_ADD_X2>;
-defm FLAT_ATOMIC_SUB_X2     : FLAT_Real_Atomics_vi <0x63, FLAT_ATOMIC_SUB_X2>;
-defm FLAT_ATOMIC_SMIN_X2    : FLAT_Real_Atomics_vi <0x64, FLAT_ATOMIC_SMIN_X2>;
-defm FLAT_ATOMIC_UMIN_X2    : FLAT_Real_Atomics_vi <0x65, FLAT_ATOMIC_UMIN_X2>;
-defm FLAT_ATOMIC_SMAX_X2    : FLAT_Real_Atomics_vi <0x66, FLAT_ATOMIC_SMAX_X2>;
-defm FLAT_ATOMIC_UMAX_X2    : FLAT_Real_Atomics_vi <0x67, FLAT_ATOMIC_UMAX_X2>;
-defm FLAT_ATOMIC_AND_X2     : FLAT_Real_Atomics_vi <0x68, FLAT_ATOMIC_AND_X2>;
-defm FLAT_ATOMIC_OR_X2      : FLAT_Real_Atomics_vi <0x69, FLAT_ATOMIC_OR_X2>;
-defm FLAT_ATOMIC_XOR_X2     : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>;
-defm FLAT_ATOMIC_INC_X2     : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>;
-defm FLAT_ATOMIC_DEC_X2     : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>;
+defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40>;
+defm FLAT_ATOMIC_CMPSWAP    : FLAT_Real_Atomics_vi <0x41>;
+defm FLAT_ATOMIC_ADD        : FLAT_Real_Atomics_vi <0x42>;
+defm FLAT_ATOMIC_SUB        : FLAT_Real_Atomics_vi <0x43>;
+defm FLAT_ATOMIC_SMIN       : FLAT_Real_Atomics_vi <0x44>;
+defm FLAT_ATOMIC_UMIN       : FLAT_Real_Atomics_vi <0x45>;
+defm FLAT_ATOMIC_SMAX       : FLAT_Real_Atomics_vi <0x46>;
+defm FLAT_ATOMIC_UMAX       : FLAT_Real_Atomics_vi <0x47>;
+defm FLAT_ATOMIC_AND        : FLAT_Real_Atomics_vi <0x48>;
+defm FLAT_ATOMIC_OR         : FLAT_Real_Atomics_vi <0x49>;
+defm FLAT_ATOMIC_XOR        : FLAT_Real_Atomics_vi <0x4a>;
+defm FLAT_ATOMIC_INC        : FLAT_Real_Atomics_vi <0x4b>;
+defm FLAT_ATOMIC_DEC        : FLAT_Real_Atomics_vi <0x4c>;
+defm FLAT_ATOMIC_SWAP_X2    : FLAT_Real_Atomics_vi <0x60>;
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_vi <0x61>;
+defm FLAT_ATOMIC_ADD_X2     : FLAT_Real_Atomics_vi <0x62>;
+defm FLAT_ATOMIC_SUB_X2     : FLAT_Real_Atomics_vi <0x63>;
+defm FLAT_ATOMIC_SMIN_X2    : FLAT_Real_Atomics_vi <0x64>;
+defm FLAT_ATOMIC_UMIN_X2    : FLAT_Real_Atomics_vi <0x65>;
+defm FLAT_ATOMIC_SMAX_X2    : FLAT_Real_Atomics_vi <0x66>;
+defm FLAT_ATOMIC_UMAX_X2    : FLAT_Real_Atomics_vi <0x67>;
+defm FLAT_ATOMIC_AND_X2     : FLAT_Real_Atomics_vi <0x68>;
+defm FLAT_ATOMIC_OR_X2      : FLAT_Real_Atomics_vi <0x69>;
+defm FLAT_ATOMIC_XOR_X2     : FLAT_Real_Atomics_vi <0x6a>;
+defm FLAT_ATOMIC_INC_X2     : FLAT_Real_Atomics_vi <0x6b>;
+defm FLAT_ATOMIC_DEC_X2     : FLAT_Real_Atomics_vi <0x6c>;
 
 defm GLOBAL_LOAD_UBYTE : FLAT_Real_AllAddr_vi <0x10>;
 defm GLOBAL_LOAD_SBYTE : FLAT_Real_AllAddr_vi <0x11>;
@@ -2060,9 +2053,9 @@ let SubtargetPredicate = isGFX8GFX9NotGFX940 in {
 }
 
 let SubtargetPredicate = isGFX90AOnly in {
-  defm FLAT_ATOMIC_ADD_F64   : FLAT_Real_Atomics_vi<0x4f, FLAT_ATOMIC_ADD_F64, 0>;
-  defm FLAT_ATOMIC_MIN_F64   : FLAT_Real_Atomics_vi<0x50, FLAT_ATOMIC_MIN_F64, 0>;
-  defm FLAT_ATOMIC_MAX_F64   : FLAT_Real_Atomics_vi<0x51, FLAT_ATOMIC_MAX_F64, 0>;
+  defm FLAT_ATOMIC_ADD_F64   : FLAT_Real_Atomics_vi<0x4f, 0>;
+  defm FLAT_ATOMIC_MIN_F64   : FLAT_Real_Atomics_vi<0x50, 0>;
+  defm FLAT_ATOMIC_MAX_F64   : FLAT_Real_Atomics_vi<0x51, 0>;
   defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_vi<0x4f, 0>;
   defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_vi<0x50, 0>;
   defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>;
@@ -2073,7 +2066,8 @@ multiclass FLAT_Real_AllAddr_gfx940<bits<7> op> {
   def _SADDR_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
 }
 
-multiclass FLAT_Real_Atomics_gfx940 <bits<7> op, FLAT_Pseudo ps> {
+multiclass FLAT_Real_Atomics_gfx940 <bits<7> op> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
   def _gfx940     : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
   def _RTN_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
 }
@@ -2089,15 +2083,15 @@ let SubtargetPredicate = isGFX940Plus in {
   defm GLOBAL_ATOMIC_ADD_F32     : FLAT_Global_Real_Atomics_gfx940 <0x04d>;
   defm GLOBAL_ATOMIC_PK_ADD_F16  : FLAT_Global_Real_Atomics_gfx940 <0x04e>;
 
-  defm FLAT_ATOMIC_ADD_F64       : FLAT_Real_Atomics_gfx940<0x4f, FLAT_ATOMIC_ADD_F64>;
-  defm FLAT_ATOMIC_MIN_F64       : FLAT_Real_Atomics_gfx940<0x50, FLAT_ATOMIC_MIN_F64>;
-  defm FLAT_ATOMIC_MAX_F64       : FLAT_Real_Atomics_gfx940<0x51, FLAT_ATOMIC_MAX_F64>;
+  defm FLAT_ATOMIC_ADD_F64       : FLAT_Real_Atomics_gfx940<0x4f>;
+  defm FLAT_ATOMIC_MIN_F64       : FLAT_Real_Atomics_gfx940<0x50>;
+  defm FLAT_ATOMIC_MAX_F64       : FLAT_Real_Atomics_gfx940<0x51>;
   defm GLOBAL_ATOMIC_ADD_F64     : FLAT_Global_Real_Atomics_gfx940<0x4f>;
   defm GLOBAL_ATOMIC_MIN_F64     : FLAT_Global_Real_Atomics_gfx940<0x50>;
   defm GLOBAL_ATOMIC_MAX_F64     : FLAT_Global_Real_Atomics_gfx940<0x51>;
-  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi<0x4d, FLAT_ATOMIC_ADD_F32>;
-  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi<0x4e, FLAT_ATOMIC_PK_ADD_F16>;
-  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi<0x52, FLAT_ATOMIC_PK_ADD_BF16>;
+  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi<0x4d>;
+  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi<0x4e>;
+  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi<0x52>;
   defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
 } // End SubtargetPredicate = isGFX940Plus
 
@@ -2112,7 +2106,9 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
 
   let Inst{11-0}  = offset{11-0};
   let Inst{12}    = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue);
-  let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
+  let Inst{54-48} = !cond(ps.enabled_saddr : saddr,
+                          !and(ps.is_flat_scratch, !not(ps.has_vaddr)) : EXEC_HI.Index{6-0}, // ST mode
+                          true : SGPR_NULL_gfxpre11.Index{6-0});
   let Inst{55}    = 0;
 }
 
@@ -2139,10 +2135,7 @@ multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> {
 
 multiclass FLAT_Real_ST_gfx10<bits<7> op> {
   def _ST_gfx10 :
-    FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")> {
-      let Inst{54-48} = EXEC_HI.Index;
-      let OtherPredicates = [HasFlatScratchSTMode];
-    }
+    FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
 }
 
 multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> :
@@ -2350,6 +2343,7 @@ class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic>
   let Inst{14}    = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue);
   let Inst{15}    = cpol{CPolBit.SLC};
   let Inst{17-16} = seg;
+  let Inst{54-48} = !if(ps.enabled_saddr, saddr, SGPR_NULL_gfx11plus.Index);
   let Inst{55}    = ps.sve;
 }
 
@@ -2360,15 +2354,11 @@ multiclass FLAT_Aliases_gfx11<string ps, string opName, int renamed> {
 
 multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
   FLAT_Aliases_gfx11<ps, opName, renamed> {
-  def _gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps), opName> {
-    let Inst{54-48} = SGPR_NULL_gfx11plus.Index;
-  }
+  def _gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps), opName>;
 }
 
 multiclass FLAT_Real_RTN_gfx11<bits<7> op, string ps, string opName> {
-  def _RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> {
-    let Inst{54-48} = SGPR_NULL_gfx11plus.Index;
-  }
+  def _RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName>;
 }
 
 multiclass FLAT_Real_SADDR_gfx11<bits<7> op, string ps, string opName> {
@@ -2380,16 +2370,11 @@ multiclass FLAT_Real_SADDR_RTN_gfx11<bits<7> op, string ps, string opName> {
 }
 
 multiclass FLAT_Real_ST_gfx11<bits<7> op, string ps, string opName> {
-  def _ST_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> {
-    let Inst{54-48} = SGPR_NULL_gfx11plus.Index;
-    let OtherPredicates = [HasFlatScratchSTMode];
-  }
+  def _ST_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName>;
 }
 
 multiclass FLAT_Real_SVS_gfx11<bits<7> op, string ps, string opName> {
-  def _SVS_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName> {
-    let OtherPredicates = [HasFlatScratchSVSMode];
-  }
+  def _SVS_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName>;
 }
 
 multiclass FLAT_Real_AllAddr_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
@@ -2562,8 +2547,7 @@ class VFLAT_Real_gfx12 <bits<8> op, FLAT_Pseudo ps,
   let AssemblerPredicate = isGFX12Plus;
   let DecoderNamespace = "GFX12";
 
-  let Inst{25-24} = !if(ps.is_flat_scratch, 0b01,
-                        !if(ps.is_flat_global, 0b10, 0b00));
+  let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
 }
 
 multiclass VFLAT_Aliases_gfx12<string ps, string opName, int renamed, string alias> {
@@ -2576,15 +2560,11 @@ multiclass VFLAT_Aliases_gfx12<string ps, string opName, int renamed, string ali
 multiclass VFLAT_Real_Base_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
                                  int renamed = false, string alias = ""> :
   VFLAT_Aliases_gfx12<ps, opName, renamed, alias> {
-  def _gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps), opName> {
-    let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
-  }
+  def _gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps), opName>;
 }
 
 multiclass VFLAT_Real_RTN_gfx12<bits<8> op, string ps, string opName> {
-  def _RTN_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> {
-    let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
-  }
+  def _RTN_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName>;
 }
 
 multiclass VFLAT_Real_SADDR_gfx12<bits<8> op, string ps, string opName> {
@@ -2596,16 +2576,11 @@ multiclass VFLAT_Real_SADDR_RTN_gfx12<bits<8> op, string ps, string opName> {
 }
 
 multiclass VFLAT_Real_ST_gfx12<bits<8> op, string ps, string opName> {
-  def _ST_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> {
-    let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
-    let OtherPredicates = [HasFlatScratchSTMode];
-  }
+  def _ST_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName>;
 }
 
 multiclass VFLAT_Real_SVS_gfx12<bits<8> op, string ps, string opName> {
-  def _SVS_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName> {
-    let OtherPredicates = [HasFlatScratchSVSMode];
-  }
+  def _SVS_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName>;
 }
 
 multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
new file mode 100644
index 0000000000000..4578c33d92dce
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -0,0 +1,115 @@
+//===- AMDGPUMCExpr.cpp - AMDGPU specific MC expression classes -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+
+using namespace llvm;
+
+AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
+                                           ArrayRef<const MCExpr *> Args,
+                                           MCContext &Ctx)
+    : Kind(Kind), Ctx(Ctx) {
+  assert(Args.size() >= 1 && "Needs a minimum of one expression.");
+  assert(Kind != AGVK_None &&
+         "Cannot construct AMDGPUVariadicMCExpr of kind none.");
+
+  // Allocating the variadic arguments through the same allocation mechanism
+  // that the object itself is allocated with so they end up in the same memory.
+  //
+  // Will result in an asan failure if allocated on the heap through standard
+  // allocation (e.g., through SmallVector's grow).
+  RawArgs = static_cast<const MCExpr **>(
+      Ctx.allocate(sizeof(const MCExpr *) * Args.size()));
+  std::uninitialized_copy(Args.begin(), Args.end(), RawArgs);
+  this->Args = ArrayRef<const MCExpr *>(RawArgs, Args.size());
+}
+
+AMDGPUVariadicMCExpr::~AMDGPUVariadicMCExpr() { Ctx.deallocate(RawArgs); }
+
+const AMDGPUVariadicMCExpr *
+AMDGPUVariadicMCExpr::create(VariadicKind Kind, ArrayRef<const MCExpr *> Args,
+                             MCContext &Ctx) {
+  return new (Ctx) AMDGPUVariadicMCExpr(Kind, Args, Ctx);
+}
+
+const MCExpr *AMDGPUVariadicMCExpr::getSubExpr(size_t Index) const {
+  assert(Index < Args.size() &&
+         "Indexing out of bounds AMDGPUVariadicMCExpr sub-expr");
+  return Args[Index];
+}
+
+void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
+                                     const MCAsmInfo *MAI) const {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind.");
+  case AGVK_Or:
+    OS << "or(";
+    break;
+  case AGVK_Max:
+    OS << "max(";
+    break;
+  }
+  for (auto It = Args.begin(); It != Args.end(); ++It) {
+    (*It)->print(OS, MAI, /*InParens=*/false);
+    if ((It + 1) != Args.end())
+      OS << ", ";
+  }
+  OS << ')';
+}
+
+static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1,
+                  int64_t Arg2) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind.");
+  case AMDGPUVariadicMCExpr::AGVK_Max:
+    return std::max(Arg1, Arg2);
+  case AMDGPUVariadicMCExpr::AGVK_Or:
+    return Arg1 | Arg2;
+  }
+}
+
+bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
+    MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
+  std::optional<int64_t> Total;
+
+  for (const MCExpr *Arg : Args) {
+    MCValue ArgRes;
+    if (!Arg->evaluateAsRelocatable(ArgRes, Layout, Fixup) ||
+        !ArgRes.isAbsolute())
+      return false;
+
+    if (!Total.has_value())
+      Total = ArgRes.getConstant();
+    Total = op(Kind, *Total, ArgRes.getConstant());
+  }
+
+  Res = MCValue::get(*Total);
+  return true;
+}
+
+void AMDGPUVariadicMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  for (const MCExpr *Arg : Args)
+    Streamer.visitUsedExpr(*Arg);
+}
+
+MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
+  for (const MCExpr *Arg : Args) {
+    if (Arg->findAssociatedFragment())
+      return Arg->findAssociatedFragment();
+  }
+  return nullptr;
+}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
new file mode 100644
index 0000000000000..238e0dea791b2
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -0,0 +1,72 @@
+//===- AMDGPUMCExpr.h - AMDGPU specific MC expression classes ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCEXPR_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCEXPR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+/// AMDGPU target specific variadic MCExpr operations.
+///
+/// Takes in a minimum of 1 argument to be used with an operation. The supported
+/// operations are:
+///   - (bitwise) or
+///   - max
+///
+/// \note If the 'or'/'max' operations are provided only a single argument, the
+/// operation will act as a no-op and simply resolve as the provided argument.
+///
+class AMDGPUVariadicMCExpr : public MCTargetExpr {
+public:
+  enum VariadicKind { AGVK_None, AGVK_Or, AGVK_Max };
+
+private:
+  VariadicKind Kind;
+  MCContext &Ctx;
+  const MCExpr **RawArgs;
+  ArrayRef<const MCExpr *> Args;
+
+  AMDGPUVariadicMCExpr(VariadicKind Kind, ArrayRef<const MCExpr *> Args,
+                       MCContext &Ctx);
+  ~AMDGPUVariadicMCExpr();
+
+public:
+  static const AMDGPUVariadicMCExpr *
+  create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
+
+  static const AMDGPUVariadicMCExpr *createOr(ArrayRef<const MCExpr *> Args,
+                                              MCContext &Ctx) {
+    return create(VariadicKind::AGVK_Or, Args, Ctx);
+  }
+
+  static const AMDGPUVariadicMCExpr *createMax(ArrayRef<const MCExpr *> Args,
+                                               MCContext &Ctx) {
+    return create(VariadicKind::AGVK_Max, Args, Ctx);
+  }
+
+  VariadicKind getKind() const { return Kind; }
+  const MCExpr *getSubExpr(size_t Index) const;
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  MCFragment *findAssociatedFragment() const override;
+  void fixELFSymbolsInTLSFixups(MCAssembler &) const override{};
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCEXPR_H
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
index 5dc76071b0594..0842a58f794b3 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_component_library(LLVMAMDGPUDesc
   AMDGPUInstPrinter.cpp
   AMDGPUMCAsmInfo.cpp
   AMDGPUMCCodeEmitter.cpp
+  AMDGPUMCExpr.cpp
   AMDGPUMCTargetDesc.cpp
   AMDGPUTargetStreamer.cpp
   R600InstPrinter.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 0b516bfffb9b6..1f0207ddb0ebd 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -460,7 +460,8 @@ enum Id { // Message ID, width(4) [3:0].
   ID_RTN_GET_REALTIME = 131,
   ID_RTN_SAVE_WAVE = 132,
   ID_RTN_GET_TBA = 133,
-  ID_RTN_GET_SE_AID_ID = 134,
+  ID_RTN_GET_TBA_TO_PC = 134,
+  ID_RTN_GET_SE_AID_ID = 135,
 
   ID_MASK_PreGFX11_ = 0xF,
   ID_MASK_GFX11Plus_ = 0xFF
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index a6184c5e1e048..27621906e4c5f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -485,6 +485,16 @@ class WaitcntGenerator {
   virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
 
   virtual ~WaitcntGenerator() = default;
+
+  // Create a mask value from the initializer list of wait event types.
+  static constexpr unsigned
+  eventMask(std::initializer_list<WaitEventType> Events) {
+    unsigned Mask = 0;
+    for (auto &E : Events)
+      Mask |= 1 << E;
+
+    return Mask;
+  }
 };
 
 class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
@@ -506,14 +516,12 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
     assert(ST);
 
     static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
-        (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) |
-            (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS),
-        (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
-            (1 << SQ_MESSAGE),
-        (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
-            (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
-            (1 << EXP_LDS_ACCESS),
-        (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
+                   VMEM_BVH_READ_ACCESS}),
+        eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
+        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
+                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
         0,
         0,
         0};
@@ -543,15 +551,14 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
     assert(ST);
 
     static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
-        (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
-        (1 << LDS_ACCESS) | (1 << GDS_ACCESS),
-        (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
-            (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
-            (1 << EXP_LDS_ACCESS),
-        (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
-        (1 << VMEM_SAMPLER_READ_ACCESS),
-        (1 << VMEM_BVH_READ_ACCESS),
-        (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)};
+        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
+        eventMask({LDS_ACCESS, GDS_ACCESS}),
+        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
+                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+        eventMask({VMEM_SAMPLER_READ_ACCESS}),
+        eventMask({VMEM_BVH_READ_ACCESS}),
+        eventMask({SMEM_ACCESS, SQ_MESSAGE})};
 
     return WaitEventMaskForInstGFX12Plus;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c19c3c6017a7c..f4b21b7dfac39 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3635,12 +3635,13 @@ memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
   return true;
 }
 
-static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
-                                int WidthB, int OffsetB) {
+static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
+                                LocationSize WidthB, int OffsetB) {
   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
-  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
-  return LowOffset + LowWidth <= HighOffset;
+  LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+  return LowWidth.hasValue() &&
+         LowOffset + (int)LowWidth.getValue() <= HighOffset;
 }
 
 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
@@ -3662,8 +3663,8 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
     // FIXME: Handle ds_read2 / ds_write2.
     return false;
   }
-  unsigned Width0 = MIa.memoperands().front()->getSize();
-  unsigned Width1 = MIb.memoperands().front()->getSize();
+  LocationSize Width0 = MIa.memoperands().front()->getSize();
+  LocationSize Width1 = MIb.memoperands().front()->getSize();
   return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a62bf779fe2e2..4c5978cdc6665 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -815,7 +815,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   }
 
   static bool isMFMAorWMMA(const MachineInstr &MI) {
-    return isMFMA(MI) || isWMMA(MI);
+    return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
   }
 
   static bool isSWMMAC(const MachineInstr &MI) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 835a5a2472315..1694436bad15c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -835,6 +835,19 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
   return fp16SrcZerosHighBits(N->getOpcode());
 }]>;
 
+def is_canonicalized : PatLeaf<(fAny srcvalue:$src), [{
+  const SITargetLowering &Lowering =
+      *static_cast<const SITargetLowering *>(getTargetLowering());
+  return Lowering.isCanonicalized(*CurDAG, SDValue(N, 0));
+}]> {
+  let GISelPredicateCode = [{
+    const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+        MF.getSubtarget().getTargetLowering());
+    const MachineOperand &Dst = MI.getOperand(0);
+    assert(Dst.isDef());
+    return TLI->isCanonicalized(Dst.getReg(), MF);
+   }];
+}
 
 //===----------------------------------------------------------------------===//
 // MUBUF/SMEM Patterns
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3ab788406ecb2..1c942dcefdace 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2946,30 +2946,12 @@ def : GCNPat<
 
 // If fcanonicalize's operand is implicitly canonicalized, we only need a copy.
 let AddedComplexity = 1000 in {
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> f16:$src),
-  (COPY f16:$src)
->;
-
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> v2f16:$src),
-  (COPY v2f16:$src)
->;
-
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> f32:$src),
-  (COPY f32:$src)
->;
-
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> v2f32:$src),
-  (COPY v2f32:$src)
->;
-
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> f64:$src),
-  (COPY f64:$src)
->;
+foreach vt = [f16, v2f16, f32, v2f32, f64] in {
+  def : GCNPat<
+    (fcanonicalize (vt is_canonicalized:$src)),
+    (COPY vt:$src)
+  >;
+}
 }
 
 // Prefer selecting to max when legal, but using mul is always valid.
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 9c85ff3c43e22..4ddee2f6d5bef 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -862,7 +862,7 @@ SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
 
-  unsigned Size = MMOa->getSize() + MMOb->getSize();
+  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
 
   // A base pointer for the combined operation is the same as the leading
   // operation's pointer.
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 5c64c6bcd1968..79a7d1cf66c4d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1753,12 +1753,12 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                           : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
     buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
-                        FrameReg, Offset * SB.EltSize, MMO, SB.RS);
+                        FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
   } else {
     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                           : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
     buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
-                        FrameReg, Offset * SB.EltSize, MMO, SB.RS);
+                        FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
     // This only ever adds one VGPR spill
     SB.MFI.addToSpilledVGPRs(1);
   }
@@ -2265,10 +2265,10 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
           *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
           *MI->memoperands_begin(), RS);
-      
-      if (IsWWMRegSpill) 
+
+      if (IsWWMRegSpill)
         TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
-  
+
       MI->eraseFromParent();
       return true;
     }
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index bccdf4b5ca637..1159c4e0fc2ed 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -744,19 +744,19 @@ def S_NOR_B64 : SOP2_64 <"s_nor_b64",
 
 // There are also separate patterns for types other than i32
 def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
-  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (not i32:$src1)))]
 >;
 
 def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64",
-  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (not i64:$src1)))]
 >;
 
 def S_ORN2_B32 : SOP2_32 <"s_orn2_b32",
-  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (not i32:$src1)))]
 >;
 
 def S_ORN2_B64 : SOP2_64 <"s_orn2_b64",
-  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (not i64:$src1)))]
 >;
 } // End Defs = [SCC]
 
@@ -1905,21 +1905,20 @@ def : GCNPat<
   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
 >;
 
-// FIXME: ValueType should have isVector field
 class ScalarNot2Pat<Instruction inst, SDPatternOperator op, ValueType vt,
-                    bit isVector = 1> : GCNPat<
-  (UniformBinFrag<op> vt:$src0, (UniformUnaryFrag<!if(isVector, vnot, not)> vt:$src1)),
+                    SDPatternOperator notnode = !if(vt.isVector, vnot, not)> : GCNPat<
+  (UniformBinFrag<op> vt:$src0, (notnode vt:$src1)),
   (inst getSOPSrcForVT<vt>.ret:$src0, getSOPSrcForVT<vt>.ret:$src1)
 >;
 
 // Match these for some more types
 // TODO: i1
-def : ScalarNot2Pat<S_ANDN2_B32, and, i16, 0>;
+def : ScalarNot2Pat<S_ANDN2_B32, and, i16>;
 def : ScalarNot2Pat<S_ANDN2_B32, and, v2i16>;
 def : ScalarNot2Pat<S_ANDN2_B64, and, v4i16>;
 def : ScalarNot2Pat<S_ANDN2_B64, and, v2i32>;
 
-def : ScalarNot2Pat<S_ORN2_B32, or, i16, 0>;
+def : ScalarNot2Pat<S_ORN2_B32, or, i16>;
 def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
 def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
 def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 23434d2de0fc6..d468b14d54d3f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -60,6 +60,7 @@ const CustomOperand<const MCSubtargetInfo &> Msg[] = {
   {{"MSG_RTN_GET_REALTIME"},    ID_RTN_GET_REALTIME,        isGFX11Plus},
   {{"MSG_RTN_SAVE_WAVE"},       ID_RTN_SAVE_WAVE,           isGFX11Plus},
   {{"MSG_RTN_GET_TBA"},         ID_RTN_GET_TBA,             isGFX11Plus},
+  {{"MSG_RTN_GET_TBA_TO_PC"},   ID_RTN_GET_TBA_TO_PC,       isGFX11Plus},
   {{"MSG_RTN_GET_SE_AID_ID"},   ID_RTN_GET_SE_AID_ID,       isGFX12Plus},
 };
 // clang-format on
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index aa47dccf2dd28..6d53f68ace70d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3055,6 +3055,11 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc) {
   return hasAny64BitVGPROperands(OpDesc);
 }
 
+unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
+  // Currently this is 128 for all subtargets
+  return 128;
+}
+
 } // namespace AMDGPU
 
 raw_ostream &operator<<(raw_ostream &OS,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f8521cba077c6..29ac402d95351 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1513,6 +1513,11 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 /// \returns true if the intrinsic is uniform
 bool isIntrinsicAlwaysUniform(unsigned IntrID);
 
+/// \returns lds block size in terms of dwords. \p
+/// This is used to calculate the lds size encoded for PAL metadata 3.0+ which
+/// must be defined in terms of bytes.
+unsigned getLdsDwGranularity(const MCSubtargetInfo &ST);
+
 } // end namespace AMDGPU
 
 raw_ostream &operator<<(raw_ostream &OS,
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 4510c7cf4f42e..66596dbda83c9 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -1742,6 +1742,7 @@ def ARMAsmWriter : AsmWriter {
 
 def ARMAsmParser : AsmParser {
   bit ReportMultipleNearMisses = 1;
+  let PreferSmallerInstructions = true;
 }
 
 def ARMAsmParserVariant : AsmParserVariant {
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index dd63ca17e5b9f..5d0468948dfb6 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -3710,7 +3710,7 @@ unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr &MI) const {
   for (MachineInstr::mmo_iterator I = MI.memoperands_begin(),
                                   E = MI.memoperands_end();
        I != E; ++I) {
-    Size += (*I)->getSize();
+    Size += (*I)->getSize().getValue();
   }
   // FIXME: The scheduler currently can't handle values larger than 16. But
   // the values can actually go up to 32 for floating-point load/store
diff --git a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
index 9b26aac6c0b71..34b6f0575f727 100644
--- a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -191,7 +191,7 @@ ARMBankConflictHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   auto BasePseudoVal0 = MO0->getPseudoValue();
   int64_t Offset0 = 0;
 
-  if (MO0->getSize() > 4)
+  if (!MO0->getSize().hasValue() || MO0->getSize().getValue() > 4)
     return NoHazard;
 
   bool SPvalid = false;
@@ -259,8 +259,8 @@ void ARMBankConflictHazardRecognizer::EmitInstruction(SUnit *SU) {
     return;
 
   auto MO = *MI.memoperands().begin();
-  uint64_t Size1 = MO->getSize();
-  if (Size1 > 4)
+  LocationSize Size1 = MO->getSize();
+  if (Size1.hasValue() && Size1.getValue() > 4)
     return;
   Accesses.push_back(&MI);
 }
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 404085820a666..d0678f378da1e 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -155,7 +155,11 @@ def iflags_op : Operand<i32> {
 
 // ARM Predicate operand. Default to 14 = always (AL). Second part is CC
 // register whose default is 0 (no register).
-def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+def CondCodeOperand : AsmOperandClass {
+  let Name = "CondCode";
+  let DefaultMethod = "defaultCondCodeOp";
+  let IsOptional = true;
+}
 def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
                                      (ops (i32 14), (i32 zero_reg))> {
   let PrintMethod = "printPredicateOperand";
@@ -174,7 +178,11 @@ def cmovpred : Operand<i32>, PredicateOp,
 }
 
 // Conditional code result for instructions whose 's' bit is set, e.g. subs.
-def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
+def CCOutOperand : AsmOperandClass {
+  let Name = "CCOut";
+  let DefaultMethod = "defaultCCOutOp";
+  let IsOptional = true;
+}
 def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
   let EncoderMethod = "getCCOutOpValue";
   let PrintMethod = "printSBitModifierOperand";
@@ -202,10 +210,14 @@ def inv_cond_XFORM : SDNodeXForm<imm, [{
 def VPTPredNOperand : AsmOperandClass {
   let Name = "VPTPredN";
   let PredicateMethod = "isVPTPred";
+  let DefaultMethod = "defaultVPTPredOp";
+  let IsOptional = true;
 }
 def VPTPredROperand : AsmOperandClass {
   let Name = "VPTPredR";
   let PredicateMethod = "isVPTPred";
+  let DefaultMethod = "defaultVPTPredOp";
+  let IsOptional = true;
 }
 
 // Operand classes for the cluster of MC operands describing a
@@ -468,7 +480,7 @@ class InstThumb<AddrMode am, int sz, IndexMode im,
 // These are aliases that require C++ handling to convert to the target
 // instruction, while InstAliases can be handled directly by tblgen.
 class AsmPseudoInst<string asm, dag iops, dag oops = (outs)>
-  : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+  : InstTemplate<AddrModeNone, 4, IndexModeNone, Pseudo, GenericDomain,
                  "", NoItinerary> {
   let OutOperandList = oops;
   let InOperandList = iops;
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index be0ca964d3f91..e7f4059935138 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1210,7 +1210,7 @@ def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255_expr:$imm8), IIC_iMOVi,
 // Because we have an explicit tMOVSr below, we need an alias to handle
 // the immediate "movs" form here. Blech.
 def : tInstAlias <"movs $Rdn, $imm8",
-                 (tMOVi8 tGPR:$Rdn, CPSR, imm0_255_expr:$imm8, 14, 0)>;
+                 (tMOVi8 tGPR:$Rdn, CPSR, imm0_255_expr:$imm8, 14, zero_reg)>;
 
 // A7-73: MOV(2) - mov setting flag.
 
@@ -1768,7 +1768,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
 
 // In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
 // encoding is available on ARMv6K, but we don't differentiate that finely.
-def : InstAlias<"nop", (tMOVr R8, R8, 14, 0), 0>, Requires<[IsThumb, IsThumb1Only]>;
+def : InstAlias<"nop", (tMOVr R8, R8, 14, zero_reg), 0>, Requires<[IsThumb, IsThumb1Only]>;
 
 
 // "neg" is and alias for "rsb rd, rn, #0"
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index acd46e8093aa7..f227d68deeb8b 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -5092,14 +5092,14 @@ def : InstAlias<"dmb${p}.w", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"dsb${p}.w\t$opt", (t2DSB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"dsb${p}.w", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
-def : InstAlias<"isb${p}.w\t$opt", (t2ISB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}.w\t$opt", (t2ISB instsyncb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"isb${p}.w", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 
-// Non-predicable aliases of a predicable DSB: the predicate is (14, 0) where
-// 14 = AL (always execute) and 0 = "instruction doesn't read the CPSR".
-def : InstAlias<"ssbb", (t2DSB 0x0, 14, 0), 1>, Requires<[HasDB, IsThumb2]>;
-def : InstAlias<"pssbb", (t2DSB 0x4, 14, 0), 1>, Requires<[HasDB, IsThumb2]>;
+// Non-predicable aliases of a predicable DSB: the predicate is (14, zero_reg) where
+// 14 = AL (always execute) and zero_reg = "instruction doesn't read the CPSR".
+def : InstAlias<"ssbb", (t2DSB 0x0, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>;
+def : InstAlias<"pssbb", (t2DSB 0x4, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>;
 
 // Armv8-R 'Data Full Barrier'
 def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index c320bf723c88b..9cfdb15a0f43d 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
@@ -47,6 +48,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
@@ -61,6 +63,7 @@
 #include <iterator>
 #include <limits>
 #include <memory>
+#include <optional>
 #include <string>
 #include <utility>
 #include <vector>
@@ -79,6 +82,7 @@ extern const ARMInstrTable ARMDescs;
 } // end namespace llvm
 
 namespace {
+class ARMOperand;
 
 enum class ImplicitItModeTy { Always, Never, ARMOnly, ThumbOnly };
 
@@ -446,13 +450,15 @@ class ARMAsmParser : public MCTargetAsmParser {
   }
 
   bool validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands,
-                           unsigned ListNo, bool IsARPop = false);
+                           unsigned MnemonicOpsEndInd, unsigned ListIndex,
+                           bool IsARPop = false);
   bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
-                           unsigned ListNo);
+                           unsigned MnemonicOpsEndInd, unsigned ListIndex);
 
   int tryParseRegister(bool AllowOutofBoundReg = false);
   bool tryParseRegisterWithWriteBack(OperandVector &);
   int tryParseShiftRegister(OperandVector &);
+  std::optional<ARM_AM::ShiftOpc> tryParseShiftToken();
   bool parseRegisterList(OperandVector &, bool EnforceOrder = true,
                          bool AllowRAAC = false,
                          bool AllowOutOfBoundReg = false);
@@ -505,6 +511,10 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool parseDirectiveSEHEpilogEnd(SMLoc L);
   bool parseDirectiveSEHCustom(SMLoc L);
 
+  std::unique_ptr<ARMOperand> defaultCondCodeOp();
+  std::unique_ptr<ARMOperand> defaultCCOutOp();
+  std::unique_ptr<ARMOperand> defaultVPTPredOp();
+
   bool isMnemonicVPTPredicable(StringRef Mnemonic, StringRef ExtraToken);
   StringRef splitMnemonic(StringRef Mnemonic, StringRef ExtraToken,
                           ARMCC::CondCodes &PredicationCode,
@@ -517,9 +527,13 @@ class ARMAsmParser : public MCTargetAsmParser {
                              bool &CanAcceptVPTPredicationCode);
   bool enableArchExtFeature(StringRef Name, SMLoc &ExtLoc);
 
-  void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting,
-                                     OperandVector &Operands);
-  bool CDEConvertDualRegOperand(StringRef Mnemonic, OperandVector &Operands);
+  void tryConvertingToTwoOperandForm(StringRef Mnemonic,
+                                     ARMCC::CondCodes PredicationCode,
+                                     bool CarrySetting, OperandVector &Operands,
+                                     unsigned MnemonicOpsEndInd);
+
+  bool CDEConvertDualRegOperand(StringRef Mnemonic, OperandVector &Operands,
+                                unsigned MnemonicOpsEndInd);
 
   bool isThumb() const {
     // FIXME: Can tablegen auto-generate this?
@@ -635,12 +649,13 @@ class ARMAsmParser : public MCTargetAsmParser {
   ParseStatus parseProcIFlagsOperand(OperandVector &);
   ParseStatus parseMSRMaskOperand(OperandVector &);
   ParseStatus parseBankedRegOperand(OperandVector &);
-  ParseStatus parsePKHImm(OperandVector &O, StringRef Op, int Low, int High);
+  ParseStatus parsePKHImm(OperandVector &O, ARM_AM::ShiftOpc, int Low,
+                          int High);
   ParseStatus parsePKHLSLImm(OperandVector &O) {
-    return parsePKHImm(O, "lsl", 0, 31);
+    return parsePKHImm(O, ARM_AM::lsl, 0, 31);
   }
   ParseStatus parsePKHASRImm(OperandVector &O) {
-    return parsePKHImm(O, "asr", 1, 32);
+    return parsePKHImm(O, ARM_AM::asr, 1, 32);
   }
   ParseStatus parseSetEndImm(OperandVector &);
   ParseStatus parseShifterImm(OperandVector &);
@@ -659,15 +674,20 @@ class ARMAsmParser : public MCTargetAsmParser {
   void cvtThumbBranches(MCInst &Inst, const OperandVector &);
   void cvtMVEVMOVQtoDReg(MCInst &Inst, const OperandVector &);
 
-  bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
-  bool processInstruction(MCInst &Inst, const OperandVector &Ops, MCStreamer &Out);
-  bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands);
-  bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
-  bool shouldOmitVectorPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
+  bool validateInstruction(MCInst &Inst, const OperandVector &Ops,
+                           unsigned MnemonicOpsEndInd);
+  bool processInstruction(MCInst &Inst, const OperandVector &Ops,
+                          unsigned MnemonicOpsEndInd, MCStreamer &Out);
+  bool shouldOmitVectorPredicateOperand(StringRef Mnemonic,
+                                        OperandVector &Operands,
+                                        unsigned MnemonicOpsEndInd);
   bool isITBlockTerminator(MCInst &Inst) const;
-  void fixupGNULDRDAlias(StringRef Mnemonic, OperandVector &Operands);
-  bool validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands,
-                        bool Load, bool ARMMode, bool Writeback);
+
+  void fixupGNULDRDAlias(StringRef Mnemonic, OperandVector &Operands,
+                         unsigned MnemonicOpsEndInd);
+  bool validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands, bool Load,
+                        bool ARMMode, bool Writeback,
+                        unsigned MnemonicOpsEndInd);
 
 public:
   enum ARMMatchResultTy {
@@ -716,6 +736,9 @@ class ARMAsmParser : public MCTargetAsmParser {
   unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
                                       unsigned Kind) override;
   unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+  unsigned
+  checkEarlyTargetMatchPredicate(MCInst &Inst,
+                                 const OperandVector &Operands) override;
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
@@ -1347,6 +1370,14 @@ class ARMOperand : public MCParsedAsmOperand {
   bool isRegListWithAPSR() const {
     return Kind == k_RegisterListWithAPSR || Kind == k_RegisterList;
   }
+  bool isDReg() const {
+    return isReg() &&
+           ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg.RegNum);
+  }
+  bool isQReg() const {
+    return isReg() &&
+           ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg.RegNum);
+  }
   bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
   bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
   bool isFPSRegListWithVPR() const { return Kind == k_FPSRegisterListWithVPR; }
@@ -2012,6 +2043,8 @@ class ARMOperand : public MCParsedAsmOperand {
   bool isProcIFlags() const { return Kind == k_ProcIFlags; }
 
   // NEON operands.
+  bool isVectorList() const { return Kind == k_VectorList; }
+
   bool isSingleSpacedVectorList() const {
     return Kind == k_VectorList && !VectorList.isDoubleSpaced;
   }
@@ -2452,6 +2485,20 @@ class ARMOperand : public MCParsedAsmOperand {
            CC == ARMCC::GT || CC == ARMCC::LE || CC == ARMCC::GE;
   }
 
+  void setVecListDPair(unsigned int DPair) {
+    Kind = k_VectorList;
+    VectorList.RegNum = DPair;
+    VectorList.Count = 2;
+    VectorList.isDoubleSpaced = false;
+  }
+
+  void setVecListOneD(unsigned int DReg) {
+    Kind = k_VectorList;
+    VectorList.RegNum = DReg;
+    VectorList.Count = 1;
+    VectorList.isDoubleSpaced = false;
+  }
+
   void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediates when possible.  Null MCExpr = 0.
     if (!Expr)
@@ -4054,6 +4101,63 @@ static MCRegister MatchRegisterName(StringRef Name);
 
 /// }
 
+static bool isDataTypeToken(StringRef Tok) {
+  static const DenseSet<StringRef> DataTypes{
+      ".8",  ".16",  ".32",  ".64",  ".i8", ".i16", ".i32", ".i64",
+      ".u8", ".u16", ".u32", ".u64", ".s8", ".s16", ".s32", ".s64",
+      ".p8", ".p16", ".f32", ".f64", ".f",  ".d"};
+  return DataTypes.contains(Tok);
+}
+
+static unsigned getMnemonicOpsEndInd(const OperandVector &Operands) {
+  unsigned MnemonicOpsEndInd = 1;
+  // Special case for CPS which has a Mnemonic side token for possibly storing
+  // ie/id variant
+  if (Operands[0]->isToken() &&
+      static_cast<ARMOperand &>(*Operands[0]).getToken() == "cps") {
+    if (Operands.size() > 1 && Operands[1]->isImm() &&
+        static_cast<ARMOperand &>(*Operands[1]).getImm()->getKind() ==
+            llvm::MCExpr::Constant &&
+        (dyn_cast<MCConstantExpr>(
+             static_cast<ARMOperand &>(*Operands[1]).getImm())
+                 ->getValue() == ARM_PROC::IE ||
+         dyn_cast<MCConstantExpr>(
+             static_cast<ARMOperand &>(*Operands[1]).getImm())
+                 ->getValue() == ARM_PROC::ID))
+      ++MnemonicOpsEndInd;
+  }
+
+  // In some circumstances the condition code moves to the right
+  bool RHSCondCode = false;
+  while (MnemonicOpsEndInd < Operands.size()) {
+    auto Op = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
+    // Special case for it instructions which have a condition code on the RHS
+    if (Op.isITMask()) {
+      RHSCondCode = true;
+      MnemonicOpsEndInd++;
+    } else if (Op.isToken() &&
+               (
+                   // There are several special cases not covered by
+                   // isDataTypeToken
+                   Op.getToken() == ".w" || Op.getToken() == ".bf16" ||
+                   Op.getToken() == ".p64" || Op.getToken() == ".f16" ||
+                   isDataTypeToken(Op.getToken()))) {
+      // In the mnemonic operators the cond code must always precede the data
+      // type. So we can now safely assume any subsequent cond code is on the
+      // RHS. As is the case for VCMP and VPT.
+      RHSCondCode = true;
+      MnemonicOpsEndInd++;
+    }
+    // Skip all mnemonic operator types
+    else if (Op.isCCOut() || (Op.isCondCode() && !RHSCondCode) ||
+             Op.isVPTPred() || (Op.isToken() && Op.getToken() == ".w"))
+      MnemonicOpsEndInd++;
+    else
+      break;
+  }
+  return MnemonicOpsEndInd;
+}
+
 bool ARMAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc,
                                  SMLoc &EndLoc) {
   const AsmToken &Tok = getParser().getTok();
@@ -4127,30 +4231,36 @@ int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) {
   return RegNum;
 }
 
-// Try to parse a shifter  (e.g., "lsl <amt>"). On success, return 0.
-// If a recoverable error occurs, return 1. If an irrecoverable error
-// occurs, return -1. An irrecoverable error is one where tokens have been
-// consumed in the process of trying to parse the shifter (i.e., when it is
-// indeed a shifter operand, but malformed).
-int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) {
+std::optional<ARM_AM::ShiftOpc> ARMAsmParser::tryParseShiftToken() {
   MCAsmParser &Parser = getParser();
-  SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
-    return -1;
+    return std::nullopt;
 
   std::string lowerCase = Tok.getString().lower();
-  ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase)
+  return StringSwitch<std::optional<ARM_AM::ShiftOpc>>(lowerCase)
       .Case("asl", ARM_AM::lsl)
       .Case("lsl", ARM_AM::lsl)
       .Case("lsr", ARM_AM::lsr)
       .Case("asr", ARM_AM::asr)
       .Case("ror", ARM_AM::ror)
       .Case("rrx", ARM_AM::rrx)
-      .Default(ARM_AM::no_shift);
+      .Default(std::nullopt);
+}
 
-  if (ShiftTy == ARM_AM::no_shift)
+// Try to parse a shifter  (e.g., "lsl <amt>"). On success, return 0.
+// If a recoverable error occurs, return 1. If an irrecoverable error
+// occurs, return -1. An irrecoverable error is one where tokens have been
+// consumed in the process of trying to parse the shifter (i.e., when it is
+// indeed a shifter operand, but malformed).
+int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+
+  auto ShiftTyOpt = tryParseShiftToken();
+  if (ShiftTyOpt == std::nullopt)
     return 1;
+  auto ShiftTy = ShiftTyOpt.value();
 
   Parser.Lex(); // Eat the operator.
 
@@ -4680,6 +4790,8 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) {
   // As an extension (to match gas), support a plain D register or Q register
   // (without encosing curly braces) as a single or double entry list,
   // respectively.
+  // If there is no lane supplied, just parse as a register and
+  // use the custom matcher to convert to list if necessary
   if (!hasMVE() && Parser.getTok().is(AsmToken::Identifier)) {
     SMLoc E = Parser.getTok().getEndLoc();
     int Reg = tryParseRegister();
@@ -4691,7 +4803,7 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) {
         return Res;
       switch (LaneKind) {
       case NoLanes:
-        Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, false, S, E));
+        Operands.push_back(ARMOperand::CreateReg(Reg, S, E));
         break;
       case AllLanes:
         Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 1, false,
@@ -4712,9 +4824,7 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) {
         return Res;
       switch (LaneKind) {
       case NoLanes:
-        Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
-                                   &ARMMCRegisterClasses[ARM::DPairRegClassID]);
-        Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, false, S, E));
+        Operands.push_back(ARMOperand::CreateReg(Reg, S, E));
         break;
       case AllLanes:
         Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
@@ -4730,7 +4840,8 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) {
       }
       return ParseStatus::Success;
     }
-    return Error(S, "vector register expected");
+    Operands.push_back(ARMOperand::CreateReg(Reg, S, E));
+    return ParseStatus::Success;
   }
 
   if (Parser.getTok().isNot(AsmToken::LCurly))
@@ -5060,6 +5171,10 @@ ParseStatus ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) {
 
 /// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
 ParseStatus ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
+  // Don't parse two MSR registers in a row
+  if (static_cast<ARMOperand &>(*Operands.back()).isMSRMask() ||
+      static_cast<ARMOperand &>(*Operands.back()).isBankedReg())
+    return ParseStatus::NoMatch;
   MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
@@ -5157,6 +5272,10 @@ ParseStatus ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
 /// parseBankedRegOperand - Try to parse a banked register (e.g. "lr_irq") for
 /// use in the MRS/MSR instructions added to support virtualization.
 ParseStatus ARMAsmParser::parseBankedRegOperand(OperandVector &Operands) {
+  // Don't parse two Banked registers in a row
+  if (static_cast<ARMOperand &>(*Operands.back()).isBankedReg() ||
+      static_cast<ARMOperand &>(*Operands.back()).isMSRMask())
+    return ParseStatus::NoMatch;
   MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
@@ -5174,23 +5293,30 @@ ParseStatus ARMAsmParser::parseBankedRegOperand(OperandVector &Operands) {
   return ParseStatus::Success;
 }
 
-ParseStatus ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op,
-                                      int Low, int High) {
+// FIXME: Unify the different methods for handling shift operators
+// and use TableGen matching mechanisms to do the validation rather than
+// separate parsing paths.
+ParseStatus ARMAsmParser::parsePKHImm(OperandVector &Operands,
+                                      ARM_AM::ShiftOpc Op, int Low, int High) {
   MCAsmParser &Parser = getParser();
-  const AsmToken &Tok = Parser.getTok();
-  if (Tok.isNot(AsmToken::Identifier))
-    return Error(Parser.getTok().getLoc(), Op + " operand expected.");
-  StringRef ShiftName = Tok.getString();
-  std::string LowerOp = Op.lower();
-  std::string UpperOp = Op.upper();
-  if (ShiftName != LowerOp && ShiftName != UpperOp)
-    return Error(Parser.getTok().getLoc(), Op + " operand expected.");
+  auto ShiftCodeOpt = tryParseShiftToken();
+
+  if (!ShiftCodeOpt.has_value())
+    return ParseStatus::NoMatch;
+  auto ShiftCode = ShiftCodeOpt.value();
+
+  // The wrong shift code has been provided. Can error here as has matched the
+  // correct operand in this case.
+  if (ShiftCode != Op)
+    return Error(Parser.getTok().getLoc(),
+                 ARM_AM::getShiftOpcStr(Op) + " operand expected.");
+
   Parser.Lex(); // Eat shift type token.
 
   // There must be a '#' and a shift amount.
   if (Parser.getTok().isNot(AsmToken::Hash) &&
       Parser.getTok().isNot(AsmToken::Dollar))
-    return Error(Parser.getTok().getLoc(), "'#' expected");
+    return ParseStatus::NoMatch;
   Parser.Lex(); // Eat hash token.
 
   const MCExpr *ShiftAmount;
@@ -5240,7 +5366,7 @@ ParseStatus ARMAsmParser::parseShifterImm(OperandVector &Operands) {
   const AsmToken &Tok = Parser.getTok();
   SMLoc S = Tok.getLoc();
   if (Tok.isNot(AsmToken::Identifier))
-    return Error(S, "shift operator 'asr' or 'lsl' expected");
+    return ParseStatus::NoMatch;
   StringRef ShiftName = Tok.getString();
   bool isASR;
   if (ShiftName == "lsl" || ShiftName == "LSL")
@@ -5248,7 +5374,7 @@ ParseStatus ARMAsmParser::parseShifterImm(OperandVector &Operands) {
   else if (ShiftName == "asr" || ShiftName == "ASR")
     isASR = true;
   else
-    return Error(S, "shift operator 'asr' or 'lsl' expected");
+    return ParseStatus::NoMatch;
   Parser.Lex(); // Eat the operator.
 
   // A '#' and a shift amount.
@@ -5441,7 +5567,7 @@ ParseStatus ARMAsmParser::parseBitfield(OperandVector &Operands) {
   // The bitfield descriptor is really two operands, the LSB and the width.
   if (Parser.getTok().isNot(AsmToken::Hash) &&
       Parser.getTok().isNot(AsmToken::Dollar))
-    return Error(Parser.getTok().getLoc(), "'#' expected");
+    return ParseStatus::NoMatch;
   Parser.Lex(); // Eat hash token.
 
   const MCExpr *LSBExpr;
@@ -5600,37 +5726,86 @@ ParseStatus ARMAsmParser::parseAM3Offset(OperandVector &Operands) {
   return ParseStatus::Success;
 }
 
+// Finds the index of the first CondCode operator, if there is none returns 0
+unsigned findCondCodeInd(const OperandVector &Operands,
+                         unsigned MnemonicOpsEndInd) {
+  for (unsigned I = 1; I < MnemonicOpsEndInd; ++I) {
+    auto Op = static_cast<ARMOperand &>(*Operands[I]);
+    if (Op.isCondCode())
+      return I;
+  }
+  return 0;
+}
+
+unsigned findCCOutInd(const OperandVector &Operands,
+                      unsigned MnemonicOpsEndInd) {
+  for (unsigned I = 1; I < MnemonicOpsEndInd; ++I) {
+    auto Op = static_cast<ARMOperand &>(*Operands[I]);
+    if (Op.isCCOut())
+      return I;
+  }
+  return 0;
+}
+
 /// Convert parsed operands to MCInst.  Needed here because this instruction
 /// only has two register operands, but multiplication is commutative so
 /// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN".
 void ARMAsmParser::cvtThumbMultiply(MCInst &Inst,
                                     const OperandVector &Operands) {
-  ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1);
-  ((ARMOperand &)*Operands[1]).addCCOutOperands(Inst, 1);
-  // If we have a three-operand form, make sure to set Rn to be the operand
-  // that isn't the same as Rd.
-  unsigned RegOp = 4;
-  if (Operands.size() == 6 &&
-      ((ARMOperand &)*Operands[4]).getReg() ==
-          ((ARMOperand &)*Operands[3]).getReg())
-    RegOp = 5;
-  ((ARMOperand &)*Operands[RegOp]).addRegOperands(Inst, 1);
-  Inst.addOperand(Inst.getOperand(0));
-  ((ARMOperand &)*Operands[2]).addCondCodeOperands(Inst, 2);
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+  unsigned CondI = findCondCodeInd(Operands, MnemonicOpsEndInd);
+  unsigned CondOutI = findCCOutInd(Operands, MnemonicOpsEndInd);
+
+  // 2 operand form
+  unsigned RegRd = MnemonicOpsEndInd;
+  unsigned RegRn = MnemonicOpsEndInd + 1;
+  unsigned RegRm = MnemonicOpsEndInd;
+
+  if (Operands.size() == MnemonicOpsEndInd + 3) {
+    // If we have a three-operand form, make sure to set Rn to be the operand
+    // that isn't the same as Rd.
+    if (((ARMOperand &)*Operands[RegRd]).getReg() ==
+        ((ARMOperand &)*Operands[MnemonicOpsEndInd + 1]).getReg()) {
+      RegRn = MnemonicOpsEndInd + 2;
+      RegRm = MnemonicOpsEndInd + 1;
+    } else {
+      RegRn = MnemonicOpsEndInd + 1;
+      RegRm = MnemonicOpsEndInd + 2;
+    }
+  }
+
+  // Rd
+  ((ARMOperand &)*Operands[RegRd]).addRegOperands(Inst, 1);
+  // CCOut
+  if (CondOutI != 0) {
+    ((ARMOperand &)*Operands[CondOutI]).addCCOutOperands(Inst, 1);
+  } else {
+    ARMOperand Op = *ARMOperand::CreateCCOut(0, Operands[0]->getEndLoc());
+    Op.addCCOutOperands(Inst, 1);
+  }
+  // Rn
+  ((ARMOperand &)*Operands[RegRn]).addRegOperands(Inst, 1);
+  // Rm
+  ((ARMOperand &)*Operands[RegRm]).addRegOperands(Inst, 1);
+
+  // Cond code
+  if (CondI != 0) {
+    ((ARMOperand &)*Operands[CondI]).addCondCodeOperands(Inst, 2);
+  } else {
+    ARMOperand Op =
+        *ARMOperand::CreateCondCode(llvm::ARMCC::AL, Operands[0]->getEndLoc());
+    Op.addCondCodeOperands(Inst, 2);
+  }
 }
 
 void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
                                     const OperandVector &Operands) {
-  int CondOp = -1, ImmOp = -1;
-  switch(Inst.getOpcode()) {
-    case ARM::tB:
-    case ARM::tBcc:  CondOp = 1; ImmOp = 2; break;
-
-    case ARM::t2B:
-    case ARM::t2Bcc: CondOp = 1; ImmOp = 3; break;
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+  unsigned CondI = findCondCodeInd(Operands, MnemonicOpsEndInd);
+  unsigned Cond =
+      (CondI == 0 ? ARMCC::AL
+                  : static_cast<ARMOperand &>(*Operands[CondI]).getCondCode());
 
-    default: llvm_unreachable("Unexpected instruction in cvtThumbBranches");
-  }
   // first decide whether or not the branch should be conditional
   // by looking at it's location relative to an IT block
   if(inITBlock()) {
@@ -5641,9 +5816,6 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
       case ARM::t2Bcc: Inst.setOpcode(ARM::t2B); break;
     }
   } else {
-    // outside IT blocks we can only have unconditional branches with AL
-    // condition code or conditional branches with non-AL condition code
-    unsigned Cond = static_cast<ARMOperand &>(*Operands[CondOp]).getCondCode();
     switch(Inst.getOpcode()) {
       case ARM::tB:
       case ARM::tBcc:
@@ -5660,36 +5832,56 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
   switch(Inst.getOpcode()) {
     // classify tB as either t2B or t1B based on range of immediate operand
     case ARM::tB: {
-      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
       if (!op.isSignedOffset<11, 1>() && isThumb() && hasV8MBaseline())
         Inst.setOpcode(ARM::t2B);
       break;
     }
     // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
     case ARM::tBcc: {
-      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
       if (!op.isSignedOffset<8, 1>() && isThumb() && hasV8MBaseline())
         Inst.setOpcode(ARM::t2Bcc);
       break;
     }
   }
-  ((ARMOperand &)*Operands[ImmOp]).addImmOperands(Inst, 1);
-  ((ARMOperand &)*Operands[CondOp]).addCondCodeOperands(Inst, 2);
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd]).addImmOperands(Inst, 1);
+  if (CondI != 0) {
+    ((ARMOperand &)*Operands[CondI]).addCondCodeOperands(Inst, 2);
+  } else {
+    ARMOperand Op =
+        *ARMOperand::CreateCondCode(llvm::ARMCC::AL, Operands[0]->getEndLoc());
+    Op.addCondCodeOperands(Inst, 2);
+  }
 }
 
 void ARMAsmParser::cvtMVEVMOVQtoDReg(
   MCInst &Inst, const OperandVector &Operands) {
 
-  // mnemonic, condition code, Rt, Rt2, Qd, idx, Qd again, idx2
-  assert(Operands.size() == 8);
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+  unsigned CondI = findCondCodeInd(Operands, MnemonicOpsEndInd);
 
-  ((ARMOperand &)*Operands[2]).addRegOperands(Inst, 1); // Rt
-  ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1); // Rt2
-  ((ARMOperand &)*Operands[4]).addRegOperands(Inst, 1); // Qd
-  ((ARMOperand &)*Operands[5]).addMVEPairVectorIndexOperands(Inst, 1); // idx
+  // mnemonic, condition code, Rt, Rt2, Qd, idx, Qd again, idx2
+  assert(Operands.size() == MnemonicOpsEndInd + 6);
+
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd]).addRegOperands(Inst, 1); // Rt
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd + 1])
+      .addRegOperands(Inst, 1); // Rt2
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd + 2])
+      .addRegOperands(Inst, 1); // Qd
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd + 3])
+      .addMVEPairVectorIndexOperands(Inst, 1); // idx
   // skip second copy of Qd in Operands[6]
-  ((ARMOperand &)*Operands[7]).addMVEPairVectorIndexOperands(Inst, 1); // idx2
-  ((ARMOperand &)*Operands[1]).addCondCodeOperands(Inst, 2); // condition code
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd + 5])
+      .addMVEPairVectorIndexOperands(Inst, 1); // idx2
+  if (CondI != 0) {
+    ((ARMOperand &)*Operands[CondI])
+        .addCondCodeOperands(Inst, 2); // condition code
+  } else {
+    ARMOperand Op =
+        *ARMOperand::CreateCondCode(ARMCC::AL, Operands[0]->getEndLoc());
+    Op.addCondCodeOperands(Inst, 2);
+  }
 }
 
 /// Parse an ARM memory expression, return false if successful else return true
@@ -5948,6 +6140,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
 
 /// parseFPImm - A floating point immediate expression operand.
 ParseStatus ARMAsmParser::parseFPImm(OperandVector &Operands) {
+  LLVM_DEBUG(dbgs() << "PARSE FPImm, Ops: " << Operands.size());
+
   MCAsmParser &Parser = getParser();
   // Anything that can accept a floating point constant as an operand
   // needs to go through here, as the regular parseExpression is
@@ -5974,10 +6168,19 @@ ParseStatus ARMAsmParser::parseFPImm(OperandVector &Operands) {
   // integer constant. Make sure we don't try to parse an FPImm
   // for these:
   // vmov.i{8|16|32|64} <dreg|qreg>, #imm
-  ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[2]);
-  bool isVmovf = TyOp.isToken() &&
-                 (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64" ||
-                  TyOp.getToken() == ".f16");
+
+  bool isVmovf = false;
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+  for (unsigned I = 1; I < MnemonicOpsEndInd; ++I) {
+    ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[I]);
+    if (TyOp.isToken() &&
+        (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64" ||
+         TyOp.getToken() == ".f16")) {
+      isVmovf = true;
+      break;
+    }
+  }
+
   ARMOperand &Mnemonic = static_cast<ARMOperand &>(*Operands[0]);
   bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" ||
                                          Mnemonic.getToken() == "fconsts");
@@ -6493,18 +6696,30 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic,
     CanAcceptPredicationCode = true;
 }
 
+bool operandsContainWide(OperandVector &Operands, unsigned MnemonicOpsEndInd) {
+  for (unsigned I = 0; I < MnemonicOpsEndInd; ++I) {
+    auto &Op = static_cast<ARMOperand &>(*Operands[I]);
+    if (Op.isToken() && Op.getToken() == ".w")
+      return true;
+  }
+  return false;
+}
+
 // Some Thumb instructions have two operand forms that are not
 // available as three operand, convert to two operand form if possible.
 //
 // FIXME: We would really like to be able to tablegen'erate this.
-void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
-                                                 bool CarrySetting,
-                                                 OperandVector &Operands) {
-  if (Operands.size() != 6)
+void ARMAsmParser::tryConvertingToTwoOperandForm(
+    StringRef Mnemonic, ARMCC::CondCodes PredicationCode, bool CarrySetting,
+    OperandVector &Operands, unsigned MnemonicOpsEndInd) {
+
+  if (operandsContainWide(Operands, MnemonicOpsEndInd))
+    return;
+  if (Operands.size() != MnemonicOpsEndInd + 3)
     return;
 
-  const auto &Op3 = static_cast<ARMOperand &>(*Operands[3]);
-        auto &Op4 = static_cast<ARMOperand &>(*Operands[4]);
+  const auto &Op3 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
+  auto &Op4 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]);
   if (!Op3.isReg() || !Op4.isReg())
     return;
 
@@ -6515,7 +6730,7 @@ void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
   // it in processInstruction(), but the 3 operand form of ADD (t2ADDrr)
   // won't accept SP or PC so we do the transformation here taking care
   // with immediate range in the 'add sp, sp #imm' case.
-  auto &Op5 = static_cast<ARMOperand &>(*Operands[5]);
+  auto &Op5 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 2]);
   if (isThumbTwo()) {
     if (Mnemonic != "add")
       return;
@@ -6575,7 +6790,7 @@ void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
   if (Transform) {
     if (Swap)
       std::swap(Op4, Op5);
-    Operands.erase(Operands.begin() + 3);
+    Operands.erase(Operands.begin() + MnemonicOpsEndInd);
   }
 }
 
@@ -6600,183 +6815,9 @@ static bool isThumbI8Relocation(MCParsedAsmOperand &MCOp) {
   return false;
 }
 
-bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
-                                          OperandVector &Operands) {
-  // FIXME: This is all horribly hacky. We really need a better way to deal
-  // with optional operands like this in the matcher table.
-
-  // The 'mov' mnemonic is special. One variant has a cc_out operand, while
-  // another does not. Specifically, the MOVW instruction does not. So we
-  // special case it here and remove the defaulted (non-setting) cc_out
-  // operand if that's the instruction we're trying to match.
-  //
-  // We do this as post-processing of the explicit operands rather than just
-  // conditionally adding the cc_out in the first place because we need
-  // to check the type of the parsed immediate operand.
-  if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
-      !static_cast<ARMOperand &>(*Operands[4]).isModImm() &&
-      static_cast<ARMOperand &>(*Operands[4]).isImm0_65535Expr() &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
-    return true;
-
-  if (Mnemonic == "movs" && Operands.size() > 3 && isThumb() &&
-      isThumbI8Relocation(*Operands[3]))
-    return true;
-
-  // Register-register 'add' for thumb does not have a cc_out operand
-  // when there are only two register operands.
-  if (isThumb() && Mnemonic == "add" && Operands.size() == 5 &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
-    return true;
-  // Register-register 'add' for thumb does not have a cc_out operand
-  // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do
-  // have to check the immediate range here since Thumb2 has a variant
-  // that can handle a different range and has a cc_out operand.
-  if (((isThumb() && Mnemonic == "add") ||
-       (isThumbTwo() && Mnemonic == "sub")) &&
-      Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::SP &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      ((Mnemonic == "add" && static_cast<ARMOperand &>(*Operands[5]).isReg()) ||
-       static_cast<ARMOperand &>(*Operands[5]).isImm0_1020s4()))
-    return true;
-  // For Thumb2, add/sub immediate does not have a cc_out operand for the
-  // imm0_4095 variant. That's the least-preferred variant when
-  // selecting via the generic "add" mnemonic, so to know that we
-  // should remove the cc_out operand, we have to explicitly check that
-  // it's not one of the other variants. Ugh.
-  if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
-      Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
-    // Nest conditions rather than one big 'if' statement for readability.
-    //
-    // If both registers are low, we're in an IT block, and the immediate is
-    // in range, we should use encoding T1 instead, which has a cc_out.
-    if (inITBlock() &&
-        isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) &&
-        isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) &&
-        static_cast<ARMOperand &>(*Operands[5]).isImm0_7())
-      return false;
-    // Check against T3. If the second register is the PC, this is an
-    // alternate form of ADR, which uses encoding T4, so check for that too.
-    if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC &&
-        (static_cast<ARMOperand &>(*Operands[5]).isT2SOImm() ||
-         static_cast<ARMOperand &>(*Operands[5]).isT2SOImmNeg()))
-      return false;
-
-    // Otherwise, we use encoding T4, which does not have a cc_out
-    // operand.
-    return true;
-  }
-
-  // The thumb2 multiply instruction doesn't have a CCOut register, so
-  // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to
-  // use the 16-bit encoding or not.
-  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[5]).isReg() &&
-      // If the registers aren't low regs, the destination reg isn't the
-      // same as one of the source regs, or the cc_out operand is zero
-      // outside of an IT block, we have to use the 32-bit encoding, so
-      // remove the cc_out operand.
-      (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
-       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
-       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[5]).getReg()) ||
-       !inITBlock() || (static_cast<ARMOperand &>(*Operands[3]).getReg() !=
-                            static_cast<ARMOperand &>(*Operands[5]).getReg() &&
-                        static_cast<ARMOperand &>(*Operands[3]).getReg() !=
-                            static_cast<ARMOperand &>(*Operands[4]).getReg())))
-    return true;
-
-  // Also check the 'mul' syntax variant that doesn't specify an explicit
-  // destination register.
-  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      // If the registers aren't low regs  or the cc_out operand is zero
-      // outside of an IT block, we have to use the 32-bit encoding, so
-      // remove the cc_out operand.
-      (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
-       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
-       !inITBlock()))
-    return true;
-
-  // Register-register 'add/sub' for thumb does not have a cc_out operand
-  // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also
-  // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't
-  // right, this will result in better diagnostics (which operand is off)
-  // anyway.
-  if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") &&
-      (Operands.size() == 5 || Operands.size() == 6) &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::SP &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      (static_cast<ARMOperand &>(*Operands[4]).isImm() ||
-       (Operands.size() == 6 &&
-        static_cast<ARMOperand &>(*Operands[5]).isImm()))) {
-    // Thumb2 (add|sub){s}{p}.w GPRnopc, sp, #{T2SOImm} has cc_out
-    return (!(isThumbTwo() &&
-              (static_cast<ARMOperand &>(*Operands[4]).isT2SOImm() ||
-               static_cast<ARMOperand &>(*Operands[4]).isT2SOImmNeg())));
-  }
-  // Fixme: Should join all the thumb+thumb2 (add|sub) in a single if case
-  // Thumb2 ADD r0, #4095 -> ADDW r0, r0, #4095 (T4)
-  // Thumb2 SUB r0, #4095 -> SUBW r0, r0, #4095
-  if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
-      (Operands.size() == 5) &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[3]).getReg() != ARM::SP &&
-      static_cast<ARMOperand &>(*Operands[3]).getReg() != ARM::PC &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      static_cast<ARMOperand &>(*Operands[4]).isImm()) {
-    const ARMOperand &IMM = static_cast<ARMOperand &>(*Operands[4]);
-    if (IMM.isT2SOImm() || IMM.isT2SOImmNeg())
-      return false; // add.w / sub.w
-    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(IMM.getImm())) {
-      const int64_t Value = CE->getValue();
-      // Thumb1 imm8 sub / add
-      if ((Value < ((1 << 7) - 1) << 2) && inITBlock() && (!(Value & 3)) &&
-          isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()))
-        return false;
-      return true; // Thumb2 T4 addw / subw
-    }
-  }
-  return false;
-}
-
-bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic,
-                                              OperandVector &Operands) {
-  // VRINT{Z, X} have a predicate operand in VFP, but not in NEON
-  unsigned RegIdx = 3;
-  if ((((Mnemonic == "vrintz" || Mnemonic == "vrintx") && !hasMVE()) ||
-      Mnemonic == "vrintr") &&
-      (static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32" ||
-       static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f16")) {
-    if (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
-        (static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32" ||
-         static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f16"))
-      RegIdx = 4;
-
-    if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() &&
-        (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(
-             static_cast<ARMOperand &>(*Operands[RegIdx]).getReg()) ||
-         ARMMCRegisterClasses[ARM::QPRRegClassID].contains(
-             static_cast<ARMOperand &>(*Operands[RegIdx]).getReg())))
-      return true;
-  }
-  return false;
-}
-
-bool ARMAsmParser::shouldOmitVectorPredicateOperand(StringRef Mnemonic,
-                                                    OperandVector &Operands) {
-  if (!hasMVE() || Operands.size() < 3)
+bool ARMAsmParser::shouldOmitVectorPredicateOperand(
+    StringRef Mnemonic, OperandVector &Operands, unsigned MnemonicOpsEndInd) {
+  if (!hasMVE() || Operands.size() <= MnemonicOpsEndInd)
     return true;
 
   if (Mnemonic.starts_with("vld2") || Mnemonic.starts_with("vld4") ||
@@ -6806,24 +6847,13 @@ bool ARMAsmParser::shouldOmitVectorPredicateOperand(StringRef Mnemonic,
       // MQPR, to more accurately report errors when using Q registers
       // outside of the allowed range.
       if (static_cast<ARMOperand &>(*Operand).isVectorIndex() ||
-          (Operand->isReg() &&
-           (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(
-             Operand->getReg()))))
+          static_cast<ARMOperand &>(*Operand).isQReg())
         return false;
     }
     return true;
   }
 }
 
-static bool isDataTypeToken(StringRef Tok) {
-  return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
-    Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
-    Tok == ".u8" || Tok == ".u16" || Tok == ".u32" || Tok == ".u64" ||
-    Tok == ".s8" || Tok == ".s16" || Tok == ".s32" || Tok == ".s64" ||
-    Tok == ".p8" || Tok == ".p16" || Tok == ".f32" || Tok == ".f64" ||
-    Tok == ".f" || Tok == ".d";
-}
-
 // FIXME: This bit should probably be handled via an explicit match class
 // in the .td files that matches the suffix instead of having it be
 // a literal string token the way it is now.
@@ -6844,14 +6874,15 @@ static void applyMnemonicAliases(StringRef &Mnemonic,
 // bail out, and let the assembly parser report an error on the instruction as
 // it is written.
 void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,
-                                     OperandVector &Operands) {
+                                     OperandVector &Operands,
+                                     unsigned MnemonicOpsEndInd) {
   if (Mnemonic != "ldrd" && Mnemonic != "strd")
     return;
-  if (Operands.size() < 4)
+  if (Operands.size() < MnemonicOpsEndInd + 2)
     return;
 
-  ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
-  ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+  ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
+  ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]);
 
   if (!Op2.isReg())
     return;
@@ -6876,7 +6907,7 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,
     return;
 
   Operands.insert(
-      Operands.begin() + 3,
+      Operands.begin() + MnemonicOpsEndInd + 1,
       ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc()));
 }
 
@@ -6886,19 +6917,17 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,
 // operand. If the conversion fails an error is diagnosed, and the function
 // returns true.
 bool ARMAsmParser::CDEConvertDualRegOperand(StringRef Mnemonic,
-                                            OperandVector &Operands) {
+                                            OperandVector &Operands,
+                                            unsigned MnemonicOpsEndInd) {
   assert(MS.isCDEDualRegInstr(Mnemonic));
-  bool isPredicable =
-      Mnemonic == "cx1da" || Mnemonic == "cx2da" || Mnemonic == "cx3da";
-  size_t NumPredOps = isPredicable ? 1 : 0;
 
-  if (Operands.size() <= 3 + NumPredOps)
+  if (Operands.size() < 3 + MnemonicOpsEndInd)
     return false;
 
   StringRef Op2Diag(
       "operand must be an even-numbered register in the range [r0, r10]");
 
-  const MCParsedAsmOperand &Op2 = *Operands[2 + NumPredOps];
+  const MCParsedAsmOperand &Op2 = *Operands[MnemonicOpsEndInd + 1];
   if (!Op2.isReg())
     return Error(Op2.getStartLoc(), Op2Diag);
 
@@ -6933,16 +6962,43 @@ bool ARMAsmParser::CDEConvertDualRegOperand(StringRef Mnemonic,
     break;
   }
 
-  const MCParsedAsmOperand &Op3 = *Operands[3 + NumPredOps];
+  const MCParsedAsmOperand &Op3 = *Operands[MnemonicOpsEndInd + 2];
   if (!Op3.isReg() || Op3.getReg() != RNext)
     return Error(Op3.getStartLoc(), "operand must be a consecutive register");
 
-  Operands.erase(Operands.begin() + 3 + NumPredOps);
-  Operands[2 + NumPredOps] =
+  Operands.erase(Operands.begin() + MnemonicOpsEndInd + 2);
+  Operands[MnemonicOpsEndInd + 1] =
       ARMOperand::CreateReg(RPair, Op2.getStartLoc(), Op2.getEndLoc());
   return false;
 }
 
+void removeCondCode(OperandVector &Operands, unsigned &MnemonicOpsEndInd) {
+  for (unsigned I = 0; I < MnemonicOpsEndInd; ++I)
+    if (static_cast<ARMOperand &>(*Operands[I]).isCondCode()) {
+      Operands.erase(Operands.begin() + I);
+      --MnemonicOpsEndInd;
+      break;
+    }
+}
+
+void removeCCOut(OperandVector &Operands, unsigned &MnemonicOpsEndInd) {
+  for (unsigned I = 0; I < MnemonicOpsEndInd; ++I)
+    if (static_cast<ARMOperand &>(*Operands[I]).isCCOut()) {
+      Operands.erase(Operands.begin() + I);
+      --MnemonicOpsEndInd;
+      break;
+    }
+}
+
+void removeVPTCondCode(OperandVector &Operands, unsigned &MnemonicOpsEndInd) {
+  for (unsigned I = 0; I < MnemonicOpsEndInd; ++I)
+    if (static_cast<ARMOperand &>(*Operands[I]).isVPTPred()) {
+      Operands.erase(Operands.begin() + I);
+      --MnemonicOpsEndInd;
+      break;
+    }
+}
+
 /// Parse an arm instruction mnemonic followed by its operands.
 bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                                     SMLoc NameLoc, OperandVector &Operands) {
@@ -7055,14 +7111,14 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   }
 
   // Add the carry setting operand, if necessary.
-  if (CanAcceptCarrySet) {
+  if (CanAcceptCarrySet && CarrySetting) {
     SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
     Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
                                                Loc));
   }
 
   // Add the predication code operand, if necessary.
-  if (CanAcceptPredicationCode) {
+  if (CanAcceptPredicationCode && PredicationCode != llvm::ARMCC::AL) {
     SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
                                       CarrySetting);
     Operands.push_back(ARMOperand::CreateCondCode(
@@ -7070,14 +7126,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   }
 
   // Add the VPT predication code operand, if necessary.
-  // FIXME: We don't add them for the instructions filtered below as these can
-  // have custom operands which need special parsing.  This parsing requires
-  // the operand to be in the same place in the OperandVector as their
-  // definition in tblgen.  Since these instructions may also have the
-  // scalar predication operand we do not add the vector one and leave until
-  // now to fix it up.
-  if (CanAcceptVPTPredicationCode && Mnemonic != "vmov" &&
-      !Mnemonic.starts_with("vcmp") &&
+  // Dont add in certain cases of VCVT as this needs to be disambiguated
+  // after operand parsing.
+  if (CanAcceptVPTPredicationCode && VPTPredicationCode != llvm::ARMVCC::None &&
       !(Mnemonic.starts_with("vcvt") && Mnemonic != "vcvta" &&
         Mnemonic != "vcvtn" && Mnemonic != "vcvtp" && Mnemonic != "vcvtm")) {
     SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
@@ -7123,6 +7174,11 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     }
   }
 
+  // This marks the end of the LHS Mnemonic operators.
+  // This is used for indexing into the non-menmonic operators as some of the
+  // mnemonic operators are optional and therfore indexes can differ.
+  unsigned MnemonicOpsEndInd = Operands.size();
+
   // Read the remaining operands.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     // Read the first operand.
@@ -7141,7 +7197,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
     return true;
 
-  tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+  tryConvertingToTwoOperandForm(Mnemonic, PredicationCode, CarrySetting,
+                                Operands, MnemonicOpsEndInd);
 
   if (hasCDE() && MS.isCDEInstr(Mnemonic)) {
     // Dual-register instructions use even-odd register pairs as their
@@ -7152,33 +7209,16 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     // returns false, the function either succeeded or an error (e.g. missing
     // operand) will be diagnosed elsewhere.
     if (MS.isCDEDualRegInstr(Mnemonic)) {
-      bool GotError = CDEConvertDualRegOperand(Mnemonic, Operands);
+      bool GotError =
+          CDEConvertDualRegOperand(Mnemonic, Operands, MnemonicOpsEndInd);
       if (GotError)
         return GotError;
     }
   }
 
-  // Some instructions, mostly Thumb, have forms for the same mnemonic that
-  // do and don't have a cc_out optional-def operand. With some spot-checks
-  // of the operand list, we can figure out which variant we're trying to
-  // parse and adjust accordingly before actually matching. We shouldn't ever
-  // try to remove a cc_out operand that was explicitly set on the
-  // mnemonic, of course (CarrySetting == true). Reason number #317 the
-  // table driven matcher doesn't fit well with the ARM instruction set.
-  if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands))
-    Operands.erase(Operands.begin() + 1);
-
-  // Some instructions have the same mnemonic, but don't always
-  // have a predicate. Distinguish them here and delete the
-  // appropriate predicate if needed.  This could be either the scalar
-  // predication code or the vector predication code.
-  if (PredicationCode == ARMCC::AL &&
-      shouldOmitPredicateOperand(Mnemonic, Operands))
-    Operands.erase(Operands.begin() + 1);
-
-
   if (hasMVE()) {
-    if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands) &&
+    if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                          MnemonicOpsEndInd) &&
         Mnemonic == "vmov" && PredicationCode == ARMCC::LT) {
       // Very nasty hack to deal with the vector predicated variant of vmovlt
       // the scalar predicated vmov with condition 'lt'.  We can not tell them
@@ -7193,7 +7233,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
       Operands.insert(Operands.begin(),
                       ARMOperand::CreateToken(StringRef("vmovlt"), MLoc));
     } else if (Mnemonic == "vcvt" && PredicationCode == ARMCC::NE &&
-               !shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+               !shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                                 MnemonicOpsEndInd)) {
       // Another nasty hack to deal with the ambiguity between vcvt with scalar
       // predication 'ne' and vcvtn with vector predication 'e'.  As above we
       // can only distinguish between the two after we have parsed their
@@ -7208,36 +7249,33 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
       Operands.insert(Operands.begin(),
                       ARMOperand::CreateToken(StringRef("vcvtn"), MLoc));
     } else if (Mnemonic == "vmul" && PredicationCode == ARMCC::LT &&
-               !shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+               !shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                                 MnemonicOpsEndInd)) {
       // Another hack, this time to distinguish between scalar predicated vmul
       // with 'lt' predication code and the vector instruction vmullt with
       // vector predication code "none"
-      Operands.erase(Operands.begin() + 1);
+      removeCondCode(Operands, MnemonicOpsEndInd);
       Operands.erase(Operands.begin());
       SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer());
       Operands.insert(Operands.begin(),
                       ARMOperand::CreateToken(StringRef("vmullt"), MLoc));
-    }
-    // For vmov and vcmp, as mentioned earlier, we did not add the vector
-    // predication code, since these may contain operands that require
-    // special parsing.  So now we have to see if they require vector
-    // predication and replace the scalar one with the vector predication
-    // operand if that is the case.
-    else if (Mnemonic == "vmov" || Mnemonic.starts_with("vcmp") ||
-             (Mnemonic.starts_with("vcvt") && !Mnemonic.starts_with("vcvta") &&
-              !Mnemonic.starts_with("vcvtn") &&
-              !Mnemonic.starts_with("vcvtp") &&
-              !Mnemonic.starts_with("vcvtm"))) {
-      if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+    } else if (Mnemonic.starts_with("vcvt") && !Mnemonic.starts_with("vcvta") &&
+               !Mnemonic.starts_with("vcvtn") &&
+               !Mnemonic.starts_with("vcvtp") &&
+               !Mnemonic.starts_with("vcvtm")) {
+      if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                            MnemonicOpsEndInd)) {
         // We could not split the vector predicate off vcvt because it might
         // have been the scalar vcvtt instruction.  Now we know its a vector
         // instruction, we still need to check whether its the vector
         // predicated vcvt with 'Then' predication or the vector vcvtt.  We can
         // distinguish the two based on the suffixes, if it is any of
         // ".f16.f32", ".f32.f16", ".f16.f64" or ".f64.f16" then it is the vcvtt.
-        if (Mnemonic.starts_with("vcvtt") && Operands.size() >= 4) {
-          auto Sz1 = static_cast<ARMOperand &>(*Operands[2]);
-          auto Sz2 = static_cast<ARMOperand &>(*Operands[3]);
+        if (Mnemonic.starts_with("vcvtt") && MnemonicOpsEndInd > 2) {
+          auto Sz1 =
+              static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd - 2]);
+          auto Sz2 =
+              static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd - 1]);
           if (!(Sz1.isToken() && Sz1.getToken().starts_with(".f") &&
                 Sz2.isToken() && Sz2.getToken().starts_with(".f"))) {
             Operands.erase(Operands.begin());
@@ -7249,24 +7287,21 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                             ARMOperand::CreateToken(Mnemonic, MLoc));
           }
         }
-        Operands.erase(Operands.begin() + 1);
         SMLoc PLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
                                           Mnemonic.size() + CarrySetting);
+        // Add VPTPred
         Operands.insert(Operands.begin() + 1,
                         ARMOperand::CreateVPTPred(
                             ARMVCC::VPTCodes(VPTPredicationCode), PLoc));
+        ++MnemonicOpsEndInd;
       }
     } else if (CanAcceptVPTPredicationCode) {
       // For all other instructions, make sure only one of the two
       // predication operands is left behind, depending on whether we should
       // use the vector predication.
-      if (shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
-        if (CanAcceptPredicationCode)
-          Operands.erase(Operands.begin() + 2);
-        else
-          Operands.erase(Operands.begin() + 1);
-      } else if (CanAcceptPredicationCode && PredicationCode == ARMCC::AL) {
-        Operands.erase(Operands.begin() + 1);
+      if (shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                           MnemonicOpsEndInd)) {
+        removeVPTCondCode(Operands, MnemonicOpsEndInd);
       }
     }
   }
@@ -7291,69 +7326,73 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     }
   }
 
-    // ARM mode 'blx' need special handling, as the register operand version
-    // is predicable, but the label operand version is not. So, we can't rely
-    // on the Mnemonic based checking to correctly figure out when to put
-    // a k_CondCode operand in the list. If we're trying to match the label
-    // version, remove the k_CondCode operand here.
-    if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
-        static_cast<ARMOperand &>(*Operands[2]).isImm())
-      Operands.erase(Operands.begin() + 1);
-
-    // Adjust operands of ldrexd/strexd to MCK_GPRPair.
-    // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
-    // a single GPRPair reg operand is used in the .td file to replace the two
-    // GPRs. However, when parsing from asm, the two GRPs cannot be
-    // automatically
-    // expressed as a GPRPair, so we have to manually merge them.
-    // FIXME: We would really like to be able to tablegen'erate this.
-    if (!isThumb() && Operands.size() > 4 &&
-        (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
-         Mnemonic == "stlexd")) {
-      bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
-      unsigned Idx = isLoad ? 2 : 3;
-      ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
-      ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
-
-      const MCRegisterClass &MRC = MRI->getRegClass(ARM::GPRRegClassID);
-      // Adjust only if Op1 and Op2 are GPRs.
-      if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
-          MRC.contains(Op2.getReg())) {
-        unsigned Reg1 = Op1.getReg();
-        unsigned Reg2 = Op2.getReg();
-        unsigned Rt = MRI->getEncodingValue(Reg1);
-        unsigned Rt2 = MRI->getEncodingValue(Reg2);
-
-        // Rt2 must be Rt + 1 and Rt must be even.
-        if (Rt + 1 != Rt2 || (Rt & 1)) {
-          return Error(Op2.getStartLoc(),
-                       isLoad ? "destination operands must be sequential"
-                              : "source operands must be sequential");
-        }
-        unsigned NewReg = MRI->getMatchingSuperReg(
-            Reg1, ARM::gsub_0, &(MRI->getRegClass(ARM::GPRPairRegClassID)));
-        Operands[Idx] =
-            ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
-        Operands.erase(Operands.begin() + Idx + 1);
+  // ARM mode 'blx' need special handling, as the register operand version
+  // is predicable, but the label operand version is not. So, we can't rely
+  // on the Mnemonic based checking to correctly figure out when to put
+  // a k_CondCode operand in the list. If we're trying to match the label
+  // version, remove the k_CondCode operand here.
+  if (!isThumb() && Mnemonic == "blx" &&
+      Operands.size() == MnemonicOpsEndInd + 1 &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]).isImm())
+    removeCondCode(Operands, MnemonicOpsEndInd);
+
+  // Adjust operands of ldrexd/strexd to MCK_GPRPair.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when parsing from asm, the two GRPs cannot be
+  // automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (!isThumb() && Operands.size() > MnemonicOpsEndInd + 1 &&
+      (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+       Mnemonic == "stlexd")) {
+    bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
+    unsigned Idx = isLoad ? MnemonicOpsEndInd : MnemonicOpsEndInd + 1;
+    ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
+
+    const MCRegisterClass &MRC = MRI->getRegClass(ARM::GPRRegClassID);
+    // Adjust only if Op1 and Op2 are GPRs.
+    if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
+        MRC.contains(Op2.getReg())) {
+      unsigned Reg1 = Op1.getReg();
+      unsigned Reg2 = Op2.getReg();
+      unsigned Rt = MRI->getEncodingValue(Reg1);
+      unsigned Rt2 = MRI->getEncodingValue(Reg2);
+
+      // Rt2 must be Rt + 1 and Rt must be even.
+      if (Rt + 1 != Rt2 || (Rt & 1)) {
+        return Error(Op2.getStartLoc(),
+                     isLoad ? "destination operands must be sequential"
+                            : "source operands must be sequential");
       }
+      unsigned NewReg = MRI->getMatchingSuperReg(
+          Reg1, ARM::gsub_0, &(MRI->getRegClass(ARM::GPRPairRegClassID)));
+      Operands[Idx] =
+          ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
+      Operands.erase(Operands.begin() + Idx + 1);
+    }
   }
 
   // GNU Assembler extension (compatibility).
-  fixupGNULDRDAlias(Mnemonic, Operands);
+  fixupGNULDRDAlias(Mnemonic, Operands, MnemonicOpsEndInd);
 
   // FIXME: As said above, this is all a pretty gross hack.  This instruction
   // does not fit with other "subs" and tblgen.
   // Adjust operands of B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction
   // so the Mnemonic is the original name "subs" and delete the predicate
   // operand so it will match the table entry.
-  if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::PC &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::LR &&
-      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+  if (isThumbTwo() && Mnemonic == "sub" &&
+      Operands.size() == MnemonicOpsEndInd + 3 &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]).getReg() ==
+          ARM::PC &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]).getReg() ==
+          ARM::LR &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 2]).isImm()) {
     Operands.front() = ARMOperand::CreateToken(Name, NameLoc);
-    Operands.erase(Operands.begin() + 1);
+    removeCCOut(Operands, MnemonicOpsEndInd);
   }
   return false;
 }
@@ -7398,49 +7437,61 @@ static bool instIsBreakpoint(const MCInst &Inst) {
            Inst.getOpcode() == ARM::HLT;
 }
 
+unsigned getRegListInd(const OperandVector &Operands,
+                       unsigned MnemonicOpsEndInd) {
+  for (unsigned I = MnemonicOpsEndInd; I < Operands.size(); ++I) {
+    const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[I]);
+    if (Op.isRegList()) {
+      return I;
+    }
+  }
+  return 0;
+}
+
 bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst,
                                        const OperandVector &Operands,
-                                       unsigned ListNo, bool IsARPop) {
-  const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
-  bool HasWritebackToken = Op.isToken() && Op.getToken() == "!";
-
-  bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP);
-  bool ListContainsLR = listContainsReg(Inst, ListNo, ARM::LR);
-  bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC);
+                                       unsigned MnemonicOpsEndInd,
+                                       unsigned ListIndex, bool IsARPop) {
+  bool ListContainsSP = listContainsReg(Inst, ListIndex, ARM::SP);
+  bool ListContainsLR = listContainsReg(Inst, ListIndex, ARM::LR);
+  bool ListContainsPC = listContainsReg(Inst, ListIndex, ARM::PC);
 
   if (!IsARPop && ListContainsSP)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "SP may not be in the register list");
-  else if (ListContainsPC && ListContainsLR)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "PC and LR may not be in the register list simultaneously");
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "SP may not be in the register list");
+  if (ListContainsPC && ListContainsLR)
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "PC and LR may not be in the register list simultaneously");
   return false;
 }
 
 bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst,
                                        const OperandVector &Operands,
-                                       unsigned ListNo) {
-  const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
-  bool HasWritebackToken = Op.isToken() && Op.getToken() == "!";
-
-  bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP);
-  bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC);
+                                       unsigned MnemonicOpsEndInd,
+                                       unsigned ListIndex) {
+  bool ListContainsSP = listContainsReg(Inst, ListIndex, ARM::SP);
+  bool ListContainsPC = listContainsReg(Inst, ListIndex, ARM::PC);
 
   if (ListContainsSP && ListContainsPC)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "SP and PC may not be in the register list");
-  else if (ListContainsSP)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "SP may not be in the register list");
-  else if (ListContainsPC)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "PC may not be in the register list");
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "SP and PC may not be in the register list");
+  if (ListContainsSP)
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "SP may not be in the register list");
+  if (ListContainsPC)
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "PC may not be in the register list");
   return false;
 }
 
-bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
-                                    const OperandVector &Operands,
-                                    bool Load, bool ARMMode, bool Writeback) {
+bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands,
+                                    bool Load, bool ARMMode, bool Writeback,
+                                    unsigned MnemonicOpsEndInd) {
   unsigned RtIndex = Load || !Writeback ? 0 : 1;
   unsigned Rt = MRI->getEncodingValue(Inst.getOperand(RtIndex).getReg());
   unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(RtIndex + 1).getReg());
@@ -7448,21 +7499,21 @@ bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
   if (ARMMode) {
     // Rt can't be R14.
     if (Rt == 14)
-      return Error(Operands[3]->getStartLoc(),
-                  "Rt can't be R14");
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Rt can't be R14");
 
     // Rt must be even-numbered.
     if ((Rt & 1) == 1)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "Rt must be even-numbered");
 
     // Rt2 must be Rt + 1.
     if (Rt2 != Rt + 1) {
       if (Load)
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "destination operands must be sequential");
       else
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "source operands must be sequential");
     }
 
@@ -7472,7 +7523,7 @@ bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
 
   if (!ARMMode && Load) {
     if (Rt2 == Rt)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "destination operands can't be identical");
   }
 
@@ -7481,11 +7532,11 @@ bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
 
     if (Rn == Rt || Rn == Rt2) {
       if (Load)
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "base register needs to be different from destination "
                      "registers");
       else
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "source register and base register can't be identical");
     }
 
@@ -7523,7 +7574,8 @@ static bool isARMMCExpr(MCParsedAsmOperand &MCOp) {
 
 // FIXME: We would really like to be able to tablegen'erate this.
 bool ARMAsmParser::validateInstruction(MCInst &Inst,
-                                       const OperandVector &Operands) {
+                                       const OperandVector &Operands,
+                                       unsigned MnemonicOpsEndInd) {
   const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
   SMLoc Loc = Operands[0]->getStartLoc();
 
@@ -7538,7 +7590,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm());
     if (Cond != currentITCond()) {
       // Find the condition code Operand to get its SMLoc information.
-      SMLoc CondLoc;
+      SMLoc CondLoc = Operands[0]->getEndLoc();
       for (unsigned I = 1; I < Operands.size(); ++I)
         if (static_cast<ARMOperand &>(*Operands[I]).isCondCode())
           CondLoc = Operands[I]->getStartLoc();
@@ -7608,9 +7660,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::VLSTM_T2: {
     // Since in some cases both T1 and T2 are valid, tablegen can not always
     // pick the correct instruction.
-    if (Operands.size() == 4) { // a register list has been provided
+    if (Operands.size() ==
+        MnemonicOpsEndInd + 2) { // a register list has been provided
       ARMOperand &Op = static_cast<ARMOperand &>(
-          *Operands[3]); // the register list, a dpr_reglist
+          *Operands[MnemonicOpsEndInd + 1]); // the register list, a dpr_reglist
       assert(Op.isDPRRegList());
       auto &RegList = Op.getRegList();
       // T2 requires v8.1-M.Main (cannot be handled by tablegen)
@@ -7644,50 +7697,50 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::LDRD:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/true,
-                         /*Writeback*/false))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ true, /*ARMMode*/ true,
+                         /*Writeback*/ false, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::LDRD_PRE:
   case ARM::LDRD_POST:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/true,
-                         /*Writeback*/true))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ true, /*ARMMode*/ true,
+                         /*Writeback*/ true, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::t2LDRDi8:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/false,
-                         /*Writeback*/false))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ true, /*ARMMode*/ false,
+                         /*Writeback*/ false, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::t2LDRD_PRE:
   case ARM::t2LDRD_POST:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/false,
-                         /*Writeback*/true))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ true, /*ARMMode*/ false,
+                         /*Writeback*/ true, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::t2BXJ: {
     const unsigned RmReg = Inst.getOperand(0).getReg();
     // Rm = SP is no longer unpredictable in v8-A
     if (RmReg == ARM::SP && !hasV8Ops())
-      return Error(Operands[2]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "r13 (SP) is an unpredictable operand to BXJ");
     return false;
   }
   case ARM::STRD:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/true,
-                         /*Writeback*/false))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ false, /*ARMMode*/ true,
+                         /*Writeback*/ false, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::STRD_PRE:
   case ARM::STRD_POST:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/true,
-                         /*Writeback*/true))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ false, /*ARMMode*/ true,
+                         /*Writeback*/ true, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::t2STRD_PRE:
   case ARM::t2STRD_POST:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/false,
-                         /*Writeback*/true))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ false, /*ARMMode*/ false,
+                         /*Writeback*/ true, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::STR_PRE_IMM:
@@ -7711,7 +7764,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
 
     if (Rt == Rn)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "source register and base register can't be identical");
     return false;
   }
@@ -7724,19 +7777,19 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(1).getReg());
 
     if (Rt == Rn)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "destination register and base register can't be identical");
     if (Inst.getOpcode() == ARM::t2LDR_POST_imm ||
         Inst.getOpcode() == ARM::t2STR_POST_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     }
     if (Inst.getOpcode() == ARM::t2STR_PRE_imm ||
         Inst.getOpcode() == ARM::t2STR_POST_imm) {
       if (Inst.getOperand(0).getReg() == ARM::PC) {
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "operand must be a register in range [r0, r14]");
       }
     }
@@ -7755,17 +7808,17 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOpcode() == ARM::t2STRB_PRE_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     } else if (Inst.getOpcode() == ARM::t2LDRB_OFFSET_imm ||
                Inst.getOpcode() == ARM::t2STRB_OFFSET_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 0 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [0, 255] with a negative sign");
     }
     if (Inst.getOperand(0).getReg() == ARM::PC) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "if operand is PC, should call the LDRB (literal)");
     }
     return false;
@@ -7783,17 +7836,17 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOpcode() == ARM::t2STRH_PRE_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     } else if (Inst.getOpcode() == ARM::t2LDRH_OFFSET_imm ||
                Inst.getOpcode() == ARM::t2STRH_OFFSET_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 0 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [0, 255] with a negative sign");
     }
     if (Inst.getOperand(0).getReg() == ARM::PC) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "if operand is PC, should call the LDRH (literal)");
     }
     return false;
@@ -7806,16 +7859,16 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOpcode() == ARM::t2LDRSB_PRE_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     } else if (Inst.getOpcode() == ARM::t2LDRSB_OFFSET_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 0 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [0, 255] with a negative sign");
     }
     if (Inst.getOperand(0).getReg() == ARM::PC) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "if operand is PC, should call the LDRH (literal)");
     }
     return false;
@@ -7828,16 +7881,16 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOpcode() == ARM::t2LDRSH_PRE_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     } else if (Inst.getOpcode() == ARM::t2LDRSH_OFFSET_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 0 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [0, 255] with a negative sign");
     }
     if (Inst.getOperand(0).getReg() == ARM::PC) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "if operand is PC, should call the LDRH (literal)");
     }
     return false;
@@ -7872,7 +7925,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
 
     if (Rt == Rn)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "destination register and base register can't be identical");
     return false;
   }
@@ -7916,10 +7969,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Qm = MRI->getEncodingValue(Inst.getOperand(QmIdx).getReg());
 
     if (Qd == Qm) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    Twine("destination vector register and vector ") +
-                   (QmIsPointer ? "pointer" : "offset") +
-                   " register can't be identical");
+                       (QmIsPointer ? "pointer" : "offset") +
+                       " register can't be identical");
     }
     return false;
   }
@@ -7932,7 +7985,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     unsigned LSB = Inst.getOperand(2).getImm();
     unsigned Widthm1 = Inst.getOperand(3).getImm();
     if (Widthm1 >= 32 - LSB)
-      return Error(Operands[5]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "bitfield width must be in range [1,32-lsb]");
     return false;
   }
@@ -7946,24 +7999,29 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     // in the register list.
     unsigned Rn = Inst.getOperand(0).getReg();
     bool HasWritebackToken =
-        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
-         static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+        (static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+             .isToken() &&
+         static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+                 .getToken() == "!");
+
     bool ListContainsBase;
     if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo())
-      return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
-                   "registers must be in range r0-r7");
+      return Error(
+          Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+          "registers must be in range r0-r7");
     // If we should have writeback, then there should be a '!' token.
     if (!ListContainsBase && !HasWritebackToken && !isThumbTwo())
-      return Error(Operands[2]->getStartLoc(),
-                   "writeback operator '!' expected");
+      return Error(
+          Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+          "writeback operator '!' expected");
     // If we should not have writeback, there must not be a '!'. This is
     // true even for the 32-bit wide encodings.
     if (ListContainsBase && HasWritebackToken)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
 
-    if (validatetLDMRegList(Inst, Operands, 3))
+    if (validatetLDMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
       return true;
     break;
   }
@@ -7981,12 +8039,12 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   case ARM::t2LDMIA:
   case ARM::t2LDMDB:
-    if (validatetLDMRegList(Inst, Operands, 3))
+    if (validatetLDMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
       return true;
     break;
   case ARM::t2STMIA:
   case ARM::t2STMDB:
-    if (validatetSTMRegList(Inst, Operands, 3))
+    if (validatetSTMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
       return true;
     break;
   case ARM::t2LDMIA_UPD:
@@ -7998,10 +8056,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
                    "writeback register not allowed in register list");
 
     if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
-      if (validatetLDMRegList(Inst, Operands, 3))
+      if (validatetLDMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
         return true;
     } else {
-      if (validatetSTMRegList(Inst, Operands, 3))
+      if (validatetSTMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
         return true;
     }
     break;
@@ -8011,7 +8069,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::sysLDMDB_UPD:
   case ARM::sysLDMIB_UPD:
     if (!listContainsReg(Inst, 3, ARM::PC))
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "writeback register only allowed on system LDM "
                    "if PC in register-list");
     break;
@@ -8019,26 +8077,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::sysSTMDA_UPD:
   case ARM::sysSTMDB_UPD:
   case ARM::sysSTMIB_UPD:
-    return Error(Operands[2]->getStartLoc(),
+    return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                  "system STM cannot have writeback register");
-  case ARM::tMUL:
-    // The second source operand must be the same register as the destination
-    // operand.
-    //
-    // In this case, we must directly check the parsed operands because the
-    // cvtThumbMultiply() function is written in such a way that it guarantees
-    // this first statement is always true for the new Inst.  Essentially, the
-    // destination is unconditionally copied into the second source operand
-    // without checking to see if it matches what we actually parsed.
-    if (Operands.size() == 6 && (((ARMOperand &)*Operands[3]).getReg() !=
-                                 ((ARMOperand &)*Operands[5]).getReg()) &&
-        (((ARMOperand &)*Operands[3]).getReg() !=
-         ((ARMOperand &)*Operands[4]).getReg())) {
-      return Error(Operands[3]->getStartLoc(),
-                   "destination register must match source register");
-    }
-    break;
-
   // Like for ldm/stm, push and pop have hi-reg handling version in Thumb2,
   // so only issue a diagnostic for thumb1. The instructions will be
   // switched to the t2 encodings in processInstruction() if necessary.
@@ -8046,9 +8086,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     bool ListContainsBase;
     if (checkLowRegisterList(Inst, 2, 0, ARM::PC, ListContainsBase) &&
         !isThumbTwo())
-      return Error(Operands[2]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "registers must be in range r0-r7 or pc");
-    if (validatetLDMRegList(Inst, Operands, 2, !isMClass()))
+    if (validatetLDMRegList(Inst, Operands, MnemonicOpsEndInd, 2, !isMClass()))
       return true;
     break;
   }
@@ -8056,9 +8096,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     bool ListContainsBase;
     if (checkLowRegisterList(Inst, 2, 0, ARM::LR, ListContainsBase) &&
         !isThumbTwo())
-      return Error(Operands[2]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "registers must be in range r0-r7 or lr");
-    if (validatetSTMRegList(Inst, Operands, 2))
+    if (validatetSTMRegList(Inst, Operands, MnemonicOpsEndInd, 2))
       return true;
     break;
   }
@@ -8067,17 +8107,17 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     InvalidLowList = checkLowRegisterList(Inst, 4, Inst.getOperand(0).getReg(),
                                           0, ListContainsBase);
     if (InvalidLowList && !isThumbTwo())
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "registers must be in range r0-r7");
 
     // This would be converted to a 32-bit stm, but that's not valid if the
     // writeback register is in the list.
     if (InvalidLowList && ListContainsBase)
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
 
-    if (validatetSTMRegList(Inst, Operands, 4))
+    if (validatetSTMRegList(Inst, Operands, MnemonicOpsEndInd, 4))
       return true;
     break;
   }
@@ -8086,7 +8126,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     // same, we need thumb2 (for the wide encoding), or we have an error.
     if (!isThumbTwo() &&
         Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "source register must be the same as destination");
     }
     break;
@@ -8097,17 +8137,20 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::t2SUBrs:
     if (Inst.getOperand(0).getReg() == ARM::SP &&
         Inst.getOperand(1).getReg() != ARM::SP)
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "source register must be sp if destination is sp");
     break;
 
   // Final range checking for Thumb unconditional branch instructions.
   case ARM::tB:
-    if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>())
-      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    if (!(static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]))
+             .isSignedOffset<11, 1>())
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "branch target out of range");
     break;
   case ARM::t2B: {
-    int op = (Operands[2]->isImm()) ? 2 : 3;
+    int op = (Operands[MnemonicOpsEndInd]->isImm()) ? MnemonicOpsEndInd
+                                                    : MnemonicOpsEndInd + 1;
     ARMOperand &Operand = static_cast<ARMOperand &>(*Operands[op]);
     // Delay the checks of symbolic expressions until they are resolved.
     if (!isa<MCBinaryExpr>(Operand.getImm()) &&
@@ -8117,19 +8160,24 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   }
   // Final range checking for Thumb conditional branch instructions.
   case ARM::tBcc:
-    if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<8, 1>())
-      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd])
+             .isSignedOffset<8, 1>())
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "branch target out of range");
     break;
   case ARM::t2Bcc: {
-    int Op = (Operands[2]->isImm()) ? 2 : 3;
+    int Op = (Operands[MnemonicOpsEndInd]->isImm()) ? MnemonicOpsEndInd
+                                                    : MnemonicOpsEndInd + 1;
     if (!static_cast<ARMOperand &>(*Operands[Op]).isSignedOffset<20, 1>())
       return Error(Operands[Op]->getStartLoc(), "branch target out of range");
     break;
   }
   case ARM::tCBZ:
   case ARM::tCBNZ: {
-    if (!static_cast<ARMOperand &>(*Operands[2]).isUnsignedOffset<6, 1>())
-      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+             .isUnsignedOffset<6, 1>())
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
+                   "branch target out of range");
     break;
   }
   case ARM::MOVi16:
@@ -8143,7 +8191,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     // want the behavior of silently truncating, which can be unexpected and
     // lead to bugs that are difficult to find since this is an easy mistake
     // to make.
-    int i = (Operands[3]->isImm()) ? 3 : 4;
+    int i = (Operands[MnemonicOpsEndInd]->isImm()) ? MnemonicOpsEndInd
+                                                   : MnemonicOpsEndInd + 1;
     ARMOperand &Op = static_cast<ARMOperand &>(*Operands[i]);
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
     if (CE) break;
@@ -8158,7 +8207,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::tADDi8: {
-    MCParsedAsmOperand &Op = *Operands[4];
+    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd + 1];
     if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
       return Error(Op.getStartLoc(),
                    "Immediate expression for Thumb adds requires :lower0_7:,"
@@ -8166,7 +8215,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::tMOVi8: {
-    MCParsedAsmOperand &Op = *Operands[2];
+    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd];
     if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
       return Error(Op.getStartLoc(),
                    "Immediate expression for Thumb movs requires :lower0_7:,"
@@ -8193,30 +8242,36 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::t2BFr:
   case ARM::t2BFLi:
   case ARM::t2BFLr: {
-    if (!static_cast<ARMOperand &>(*Operands[2]).isUnsignedOffset<4, 1>() ||
-        (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0))
-      return Error(Operands[2]->getStartLoc(),
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd])
+             .isUnsignedOffset<4, 1>() ||
+        (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0)) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "branch location out of range or not a multiple of 2");
+    }
 
     if (Opcode == ARM::t2BFi) {
-      if (!static_cast<ARMOperand &>(*Operands[3]).isSignedOffset<16, 1>())
-        return Error(Operands[3]->getStartLoc(),
+      if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+               .isSignedOffset<16, 1>())
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "branch target out of range or not a multiple of 2");
     } else if (Opcode == ARM::t2BFLi) {
-      if (!static_cast<ARMOperand &>(*Operands[3]).isSignedOffset<18, 1>())
-        return Error(Operands[3]->getStartLoc(),
+      if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+               .isSignedOffset<18, 1>())
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "branch target out of range or not a multiple of 2");
     }
     break;
   }
   case ARM::t2BFic: {
-    if (!static_cast<ARMOperand &>(*Operands[1]).isUnsignedOffset<4, 1>() ||
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd])
+             .isUnsignedOffset<4, 1>() ||
         (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0))
       return Error(Operands[1]->getStartLoc(),
                    "branch location out of range or not a multiple of 2");
 
-    if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<16, 1>())
-      return Error(Operands[2]->getStartLoc(),
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+             .isSignedOffset<16, 1>())
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "branch target out of range or not a multiple of 2");
 
     assert(Inst.getOperand(0).isImm() == Inst.getOperand(2).isImm() &&
@@ -8237,7 +8292,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
       if (Inst.getOperand(i).isReg() &&
           !ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(
               Inst.getOperand(i).getReg())) {
-        return Error(Operands[2]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "invalid register in register list. Valid registers are "
                      "r0-r12, lr/r14 and APSR.");
       }
@@ -8269,7 +8324,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(2).getReg());
     const unsigned Sm1 = MRI->getEncodingValue(Inst.getOperand(3).getReg());
     if (Sm1 != Sm + 1)
-      return Error(Operands[5]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "source operands must be sequential");
     break;
   }
@@ -8278,16 +8333,17 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(0).getReg());
     const unsigned Sm1 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
     if (Sm1 != Sm + 1)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "destination operands must be sequential");
     break;
   }
   case ARM::VLDMDIA:
   case ARM::VSTMDIA: {
-    ARMOperand &Op = static_cast<ARMOperand&>(*Operands[3]);
+    ARMOperand &Op =
+        static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]);
     auto &RegList = Op.getRegList();
     if (RegList.size() < 1 || RegList.size() > 16)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "list of registers must be at least 1 and at most 16");
     break;
   }
@@ -8298,13 +8354,15 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::MVE_VMULLTs32:
   case ARM::MVE_VMULLBu32:
   case ARM::MVE_VMULLTu32: {
-    if (Operands[3]->getReg() == Operands[4]->getReg()) {
-      return Error (Operands[3]->getStartLoc(),
-                    "Qd register and Qn register can't be identical");
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 1]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Qd register and Qn register can't be identical");
     }
-    if (Operands[3]->getReg() == Operands[5]->getReg()) {
-      return Error (Operands[3]->getStartLoc(),
-                    "Qd register and Qm register can't be identical");
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 2]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Qd register and Qm register can't be identical");
     }
     break;
   }
@@ -8313,41 +8371,56 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::MVE_VREV64_32:
   case ARM::MVE_VQDMULL_qr_s32bh:
   case ARM::MVE_VQDMULL_qr_s32th: {
-    if (Operands[3]->getReg() == Operands[4]->getReg()) {
-      return Error (Operands[3]->getStartLoc(),
-                    "Qd register and Qn register can't be identical");
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 1]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Qd register and Qn register can't be identical");
     }
     break;
   }
   case ARM::MVE_VCADDi32:
   case ARM::MVE_VCADDf32:
   case ARM::MVE_VHCADDs32: {
-    if (Operands[3]->getReg() == Operands[5]->getReg()) {
-      return Error (Operands[3]->getStartLoc(),
-                    "Qd register and Qm register can't be identical");
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 2]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Qd register and Qm register can't be identical");
     }
     break;
   }
   case ARM::MVE_VMOV_rr_q: {
-    if (Operands[4]->getReg() != Operands[6]->getReg())
-      return Error (Operands[4]->getStartLoc(), "Q-registers must be the same");
-    if (static_cast<ARMOperand &>(*Operands[5]).getVectorIndex() !=
-        static_cast<ARMOperand &>(*Operands[7]).getVectorIndex() + 2)
-      return Error (Operands[5]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1");
+    if (Operands[MnemonicOpsEndInd + 2]->getReg() !=
+        Operands[MnemonicOpsEndInd + 4]->getReg())
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
+                   "Q-registers must be the same");
+    if (static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 3])
+            .getVectorIndex() !=
+        static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 5])
+                .getVectorIndex() +
+            2)
+      return Error(Operands[MnemonicOpsEndInd + 3]->getStartLoc(),
+                   "Q-register indexes must be 2 and 0 or 3 and 1");
     break;
   }
   case ARM::MVE_VMOV_q_rr: {
-    if (Operands[2]->getReg() != Operands[4]->getReg())
-      return Error (Operands[2]->getStartLoc(), "Q-registers must be the same");
-    if (static_cast<ARMOperand &>(*Operands[3]).getVectorIndex() !=
-        static_cast<ARMOperand &>(*Operands[5]).getVectorIndex() + 2)
-      return Error (Operands[3]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1");
+    if (Operands[MnemonicOpsEndInd]->getReg() !=
+        Operands[MnemonicOpsEndInd + 2]->getReg())
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Q-registers must be the same");
+    if (static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+            .getVectorIndex() !=
+        static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 3])
+                .getVectorIndex() +
+            2)
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
+                   "Q-register indexes must be 2 and 0 or 3 and 1");
     break;
   }
   case ARM::MVE_SQRSHR:
   case ARM::MVE_UQRSHL: {
-    if (Operands[2]->getReg() == Operands[3]->getReg()) {
-      return Error(Operands[2]->getStartLoc(),
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 1]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "Rda register and Rm register can't be identical");
     }
     break;
@@ -8751,6 +8824,7 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
 
 bool ARMAsmParser::processInstruction(MCInst &Inst,
                                       const OperandVector &Operands,
+                                      unsigned MnemonicOpsEndInd,
                                       MCStreamer &Out) {
   // Check if we have the wide qualifier, because if it's present we
   // must avoid selecting a 16-bit thumb instruction.
@@ -8768,9 +8842,10 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   case ARM::VLSTM: {
     // In some cases both T1 and T2 are valid, causing tablegen pick T1 instead
     // of T2
-    if (Operands.size() == 4) { // a register list has been provided
+    if (Operands.size() ==
+        MnemonicOpsEndInd + 2) { // a register list has been provided
       ARMOperand &Op = static_cast<ARMOperand &>(
-          *Operands[3]); // the register list, a dpr_reglist
+          *Operands[MnemonicOpsEndInd + 1]); // the register list, a dpr_reglist
       assert(Op.isDPRRegList());
       auto &RegList = Op.getRegList();
       // When the register list is {d0-d31} the instruction has to be the T2
@@ -9097,9 +9172,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     else if (Inst.getOpcode() == ARM::t2LDRConstPool)
       TmpInst.setOpcode(ARM::t2LDRpci);
     const ARMOperand &PoolOperand =
-      (HasWideQualifier ?
-       static_cast<ARMOperand &>(*Operands[4]) :
-       static_cast<ARMOperand &>(*Operands[3]));
+        static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]);
     const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm();
     // If SubExprVal is a constant we may be able to use a MOV
     if (isa<MCConstantExpr>(SubExprVal) &&
@@ -10526,7 +10599,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     // to encoding T2 if <Rd> is specified and encoding T2 is preferred
     // to encoding T1 if <Rd> is omitted."
     if (Inst.getOperand(3).isImm() &&
-        (unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+        (unsigned)Inst.getOperand(3).getImm() < 8 &&
+        Operands.size() == MnemonicOpsEndInd + 3) {
       Inst.setOpcode(ARM::tADDi3);
       return true;
     }
@@ -10536,7 +10610,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
     // to encoding T2 if <Rd> is specified and encoding T2 is preferred
     // to encoding T1 if <Rd> is omitted."
-    if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+    if ((unsigned)Inst.getOperand(3).getImm() < 8 &&
+        Operands.size() == MnemonicOpsEndInd + 3) {
       Inst.setOpcode(ARM::tSUBi3);
       return true;
     }
@@ -10656,8 +10731,10 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     // an error in validateInstruction().
     unsigned Rn = Inst.getOperand(0).getReg();
     bool hasWritebackToken =
-        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
-         static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+        (static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+             .isToken() &&
+         static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+                 .getToken() == "!");
     bool listContainsBase;
     if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) ||
         (!listContainsBase && !hasWritebackToken) ||
@@ -10956,6 +11033,26 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   return false;
 }
 
+unsigned
+ARMAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst,
+                                             const OperandVector &Operands) {
+  unsigned Opc = Inst.getOpcode();
+  switch (Opc) {
+  // Prevent the mov r8 r8 encoding for nop being selected when the v6/thumb 2
+  // encoding is available.
+  case ARM::tMOVr: {
+    if (Operands[0]->isToken() &&
+        static_cast<ARMOperand &>(*Operands[0]).getToken() == "nop" &&
+        ((isThumb() && !isThumbOne()) || hasV6MOps())) {
+      return Match_MnemonicFail;
+    }
+  }
+    LLVM_FALLTHROUGH;
+  default:
+    return Match_Success;
+  }
+}
+
 unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
   // 16-bit thumb arithmetic instructions either require or preclude the 'S'
   // suffix depending on whether they're in an IT block or not.
@@ -10966,22 +11063,23 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
            "optionally flag setting instruction missing optional def operand");
     assert(MCID.NumOperands == Inst.getNumOperands() &&
            "operand count mismatch!");
-    // Find the optional-def operand (cc_out).
-    unsigned OpNo;
-    for (OpNo = 0;
-         OpNo < MCID.NumOperands && !MCID.operands()[OpNo].isOptionalDef();
-         ++OpNo)
-      ;
+    bool IsCPSR = false;
+    // Check if the instruction has CPSR set.
+    for (unsigned OpNo = 0; OpNo < MCID.NumOperands; ++OpNo) {
+      if (MCID.operands()[OpNo].isOptionalDef() &&
+          Inst.getOperand(OpNo).isReg() &&
+          Inst.getOperand(OpNo).getReg() == ARM::CPSR)
+        IsCPSR = true;
+    }
+
     // If we're parsing Thumb1, reject it completely.
-    if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR)
+    if (isThumbOne() && !IsCPSR)
       return Match_RequiresFlagSetting;
     // If we're parsing Thumb2, which form is legal depends on whether we're
     // in an IT block.
-    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR &&
-        !inITBlock())
+    if (isThumbTwo() && !IsCPSR && !inITBlock())
       return Match_RequiresITBlock;
-    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
-        inITBlock())
+    if (isThumbTwo() && IsCPSR && inITBlock())
       return Match_RequiresNotITBlock;
     // LSL with zero immediate is not allowed in an IT block
     if (Opc == ARM::tLSLri && Inst.getOperand(3).getImm() == 0 && inITBlock())
@@ -11042,6 +11140,14 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
     if (!hasV8Ops() && (Inst.getOperand(0).getReg() == ARM::SP))
       return Match_RequiresV8;
     break;
+  case ARM::tMUL:
+    // The second source operand must be the same register as the destination
+    // operand.
+    // FIXME: Ideally this would be handled by ARMGenAsmMatcher and
+    // emitAsmTiedOperandConstraints.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(3).getReg())
+      return Match_InvalidTiedOperand;
+    break;
   default:
     break;
   }
@@ -11200,6 +11306,9 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   MatchResult = MatchInstruction(Operands, Inst, NearMisses, MatchingInlineAsm,
                                  PendConditionalInstruction, Out);
 
+  // Find the number of operators that are part of the Mnumonic (LHS).
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+
   switch (MatchResult) {
   case Match_Success:
     LLVM_DEBUG(dbgs() << "Parsed as: ";
@@ -11208,7 +11317,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
 
     // Context sensitive operand constraints aren't handled by the matcher,
     // so check them here.
-    if (validateInstruction(Inst, Operands)) {
+    if (validateInstruction(Inst, Operands, MnemonicOpsEndInd)) {
       // Still progress the IT block, otherwise one wrong condition causes
       // nasty cascading errors.
       forwardITPosition();
@@ -11221,7 +11330,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       // encoding is selected. Loop on it while changes happen so the
       // individual transformations can chain off each other. E.g.,
       // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
-      while (processInstruction(Inst, Operands, Out))
+      while (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out))
         LLVM_DEBUG(dbgs() << "Changed to: ";
                    Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
                    dbgs() << "\n");
@@ -12562,6 +12671,8 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
   SmallSet<FeatureBitset, 4> FeatureMissesSeen;
   bool ReportedTooFewOperands = false;
 
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+
   // Process the near-misses in reverse order, so that we see more general ones
   // first, and so can avoid emitting more specific ones.
   for (NearMissInfo &I : reverse(NearMissesIn)) {
@@ -12670,6 +12781,16 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
       case Match_RequiresFlagSetting:
         Message.Message = "no flag-preserving variant of this instruction available";
         break;
+      case Match_InvalidTiedOperand: {
+        ARMOperand &Op = static_cast<ARMOperand &>(*Operands[0]);
+        if (Op.isToken() && Op.getToken() == "mul") {
+          Message.Message = "destination register must match a source register";
+          Message.Loc = Operands[MnemonicOpsEndInd]->getStartLoc();
+        } else {
+          llvm_unreachable("Match_InvalidTiedOperand only used for tMUL.");
+        }
+        break;
+      }
       case Match_InvalidOperand:
         Message.Message = "invalid operand for instruction";
         break;
@@ -12869,11 +12990,29 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
     if (hasV8Ops() && Op.isReg() && Op.getReg() == ARM::SP)
       return Match_Success;
     return Match_rGPR;
-  case MCK_GPRPair:
-    if (Op.isReg() &&
-        MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg()))
+  // Note: This mutates the operand which could cause issues for future
+  // matches if this one fails later.
+  // It would be better to do this in addVecList but as this doesn't have access
+  // to MRI this isn't possible.
+  // If trying to match a VecListDPair with a Q register, convert Q to list.
+  case MCK_VecListDPair:
+    if (Op.isQReg() && !hasMVE()) {
+      auto DPair = getDRegFromQReg(Op.getReg());
+      DPair = MRI->getMatchingSuperReg(
+          DPair, ARM::dsub_0, &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+      Op.setVecListDPair(DPair);
       return Match_Success;
-    break;
+    }
+    return Match_InvalidOperand;
+  // Note: This mutates the operand (see above).
+  // If trying to match a VecListDPair with a D register, convert D singleton
+  // list.
+  case MCK_VecListOneD:
+    if (Op.isDReg() && !hasMVE()) {
+      Op.setVecListOneD(Op.getReg());
+      return Match_Success;
+    }
+    return Match_InvalidOperand;
   }
   return Match_InvalidOperand;
 }
@@ -12921,3 +13060,15 @@ bool ARMAsmParser::isMnemonicVPTPredicable(StringRef Mnemonic,
       std::begin(predicable_prefixes), std::end(predicable_prefixes),
       [&Mnemonic](const char *prefix) { return Mnemonic.starts_with(prefix); });
 }
+
+std::unique_ptr<ARMOperand> ARMAsmParser::defaultCondCodeOp() {
+  return ARMOperand::CreateCondCode(ARMCC::AL, SMLoc());
+}
+
+std::unique_ptr<ARMOperand> ARMAsmParser::defaultCCOutOp() {
+  return ARMOperand::CreateCCOut(0, SMLoc());
+}
+
+std::unique_ptr<ARMOperand> ARMAsmParser::defaultVPTPredOp() {
+  return ARMOperand::CreateVPTPred(ARMVCC::None, SMLoc());
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index 163360c08ffba..28e5840fdde5b 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -41,7 +41,7 @@ namespace ARM_AM {
 
   inline const char *getAddrOpcStr(AddrOpc Op) { return Op == sub ? "-" : ""; }
 
-  inline const char *getShiftOpcStr(ShiftOpc Op) {
+  inline const StringRef getShiftOpcStr(ShiftOpc Op) {
     switch (Op) {
     default: llvm_unreachable("Unknown shift opc!");
     case ARM_AM::asr: return "asr";
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 404c280ab9799..924a4580aa5fd 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -780,8 +780,9 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
   // Make sure the offsets are scaled correctly
   Instruction *ScaledOffsets = BinaryOperator::Create(
       Instruction::Shl, OffsetsIncoming,
-      Builder.CreateVectorSplat(Ty->getNumElements(), Builder.getInt32(TypeScale)),
-      "ScaledIndex", I);
+      Builder.CreateVectorSplat(Ty->getNumElements(),
+                                Builder.getInt32(TypeScale)),
+      "ScaledIndex", I->getIterator());
   // Add the base to the offsets
   OffsetsIncoming = BinaryOperator::Create(
       Instruction::Add, ScaledOffsets,
@@ -790,7 +791,7 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
           Builder.CreatePtrToInt(
               BasePtr,
               cast<VectorType>(ScaledOffsets->getType())->getElementType())),
-      "StartIndex", I);
+      "StartIndex", I->getIterator());
 
   if (I->getIntrinsicID() == Intrinsic::masked_gather)
     return tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate);
@@ -838,7 +839,8 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
   Instruction *ScaledOffsets = BinaryOperator::Create(
       Instruction::Shl, Phi->getIncomingValue(1 - IncrementIndex),
       Builder.CreateVectorSplat(NumElems, Builder.getInt32(TypeScale)),
-      "ScaledIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+      "ScaledIndex",
+      Phi->getIncomingBlock(1 - IncrementIndex)->back().getIterator());
   // Add the base to the offsets
   OffsetsIncoming = BinaryOperator::Create(
       Instruction::Add, ScaledOffsets,
@@ -847,13 +849,14 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
           Builder.CreatePtrToInt(
               BasePtr,
               cast<VectorType>(ScaledOffsets->getType())->getElementType())),
-      "StartIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+      "StartIndex",
+      Phi->getIncomingBlock(1 - IncrementIndex)->back().getIterator());
   // The gather is pre-incrementing
   OffsetsIncoming = BinaryOperator::Create(
       Instruction::Sub, OffsetsIncoming,
       Builder.CreateVectorSplat(NumElems, Builder.getInt32(Immediate)),
       "PreIncrementStartIndex",
-      &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+      Phi->getIncomingBlock(1 - IncrementIndex)->back().getIterator());
   Phi->setIncomingValue(1 - IncrementIndex, OffsetsIncoming);
 
   Builder.SetInsertPoint(I);
@@ -886,8 +889,8 @@ void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi,
                                           Value *OffsSecondOperand,
                                           unsigned StartIndex) {
   LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising add instruction\n");
-  Instruction *InsertionPoint =
-        &cast<Instruction>(Phi->getIncomingBlock(StartIndex)->back());
+  BasicBlock::iterator InsertionPoint =
+      Phi->getIncomingBlock(StartIndex)->back().getIterator();
   // Initialize the phi with a vector that contains a sum of the constants
   Instruction *NewIndex = BinaryOperator::Create(
       Instruction::Add, Phi->getIncomingValue(StartIndex), OffsSecondOperand,
@@ -911,8 +914,8 @@ void MVEGatherScatterLowering::pushOutMulShl(unsigned Opcode, PHINode *&Phi,
 
   // Create a new scalar add outside of the loop and transform it to a splat
   // by which loop variable can be incremented
-  Instruction *InsertionPoint = &cast<Instruction>(
-        Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back());
+  BasicBlock::iterator InsertionPoint =
+      Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back().getIterator();
 
   // Create a new index
   Value *StartIndex =
@@ -923,11 +926,14 @@ void MVEGatherScatterLowering::pushOutMulShl(unsigned Opcode, PHINode *&Phi,
   Instruction *Product =
       BinaryOperator::Create((Instruction::BinaryOps)Opcode, IncrementPerRound,
                              OffsSecondOperand, "Product", InsertionPoint);
+
+  BasicBlock::iterator NewIncrInsertPt =
+      Phi->getIncomingBlock(LoopIncrement)->back().getIterator();
+  NewIncrInsertPt = std::prev(NewIncrInsertPt);
+
   // Increment NewIndex by Product instead of the multiplication
   Instruction *NewIncrement = BinaryOperator::Create(
-      Instruction::Add, Phi, Product, "IncrementPushedOutMul",
-      cast<Instruction>(Phi->getIncomingBlock(LoopIncrement)->back())
-          .getPrevNode());
+      Instruction::Add, Phi, Product, "IncrementPushedOutMul", NewIncrInsertPt);
 
   Phi->addIncoming(StartIndex,
                    Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1));
@@ -1054,7 +1060,7 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
       // our phi, we need to copy it
       IncInstruction = BinaryOperator::Create(
           Instruction::BinaryOps(IncInstruction->getOpcode()), Phi,
-          IncrementPerRound, "LoopIncrement", IncInstruction);
+          IncrementPerRound, "LoopIncrement", IncInstruction->getIterator());
       Phi->setIncomingValue(IncrementingBlock, IncInstruction);
     }
     NewPhi = Phi;
@@ -1066,7 +1072,7 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
                         Phi->getIncomingBlock(IncrementingBlock == 1 ? 0 : 1));
     IncInstruction = BinaryOperator::Create(
         Instruction::BinaryOps(IncInstruction->getOpcode()), NewPhi,
-        IncrementPerRound, "LoopIncrement", IncInstruction);
+        IncrementPerRound, "LoopIncrement", IncInstruction->getIterator());
     NewPhi->addIncoming(IncInstruction,
                         Phi->getIncomingBlock(IncrementingBlock));
     IncrementingBlock = 1;
@@ -1229,7 +1235,7 @@ bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
         BaseTy = FixedVectorType::get(BaseTy, VecTy);
       GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
           Builder.getInt8Ty(), Builder.CreateBitCast(Base, BaseTy), Offsets,
-          "gep.merged", GEP);
+          "gep.merged", GEP->getIterator());
       LLVM_DEBUG(dbgs() << "Folded GEP: " << *GEP
                         << "\n      new :  " << *NewAddress << "\n");
       GEP->replaceAllUsesWith(
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 68e14f7dc9203..fe0d3b6c8189b 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -1387,7 +1387,7 @@ let mayLoad = 1, hasSideEffects = 0,
 
 // Load indirect with displacement operations.
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
-  let Constraints = "@earlyclobber $reg" in def LDDRdPtrQ
+  def LDDRdPtrQ
       : FSTDLDD<0,
                 (outs GPR8
                  : $reg),
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 3145bc3d19f5d..1688355f427cc 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -232,6 +232,7 @@ struct BPFOperand : public MCParsedAsmOperand {
         .Case("callx", true)
         .Case("goto", true)
         .Case("gotol", true)
+        .Case("may_goto", true)
         .Case("*", true)
         .Case("exit", true)
         .Case("lock", true)
diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index f2d1206d02316..4be6220b358ba 100644
--- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -426,8 +426,9 @@ static void replaceWithGEP(CallInst *Call, uint32_t DimensionIndex,
     IdxList.push_back(Zero);
   IdxList.push_back(Call->getArgOperand(GEPIndex));
 
-  auto *GEP = GetElementPtrInst::CreateInBounds(
-      getBaseElementType(Call), Call->getArgOperand(0), IdxList, "", Call);
+  auto *GEP = GetElementPtrInst::CreateInBounds(getBaseElementType(Call),
+                                                Call->getArgOperand(0), IdxList,
+                                                "", Call->getIterator());
   Call->replaceAllUsesWith(GEP);
   Call->eraseFromParent();
 }
@@ -1091,9 +1092,11 @@ bool BPFAbstractMemberAccess::transformGEPChain(CallInst *Call,
     // Load the global variable which represents the returned field info.
     LoadInst *LDInst;
     if (IsInt32Ret)
-      LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV, "", Call);
+      LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV, "",
+                            Call->getIterator());
     else
-      LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "", Call);
+      LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "",
+                            Call->getIterator());
 
     Instruction *PassThroughInst =
         BPFCoreSharedInfo::insertPassThrough(M, BB, LDInst, Call);
@@ -1113,7 +1116,8 @@ bool BPFAbstractMemberAccess::transformGEPChain(CallInst *Call,
   // The original Call inst is removed.
 
   // Load the global variable.
-  auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "", Call);
+  auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "",
+                              Call->getIterator());
 
   // Generate a BitCast
   auto *BCInst =
diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
index edd59aaa6d01d..2b78ed7134c92 100644
--- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
+++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
@@ -15,7 +15,7 @@
 //   - remove llvm.bpf.getelementptr.and.load builtins.
 //   - remove llvm.bpf.getelementptr.and.store builtins.
 //   - for loads and stores with base addresses from non-zero address space
-//     cast base address to zero address space (support for BPF arenas).
+//     cast base address to zero address space (support for BPF address spaces).
 //
 //===----------------------------------------------------------------------===//
 
@@ -482,7 +482,7 @@ static void aspaceWrapOperand(DenseMap<Value *, Value *> &Cache, Instruction *I,
   }
 }
 
-// Support for BPF arenas:
+// Support for BPF address spaces:
 // - for each function in the module M, update pointer operand of
 //   each memory access instruction (load/store/cmpxchg/atomicrmw)
 //   by casting it from non-zero address space to zero address space, e.g:
@@ -490,7 +490,7 @@ static void aspaceWrapOperand(DenseMap<Value *, Value *> &Cache, Instruction *I,
 //   (load (ptr addrspace (N) %p) ...)
 //     -> (load (addrspacecast ptr addrspace (N) %p to ptr))
 //
-// - assign section with name .arena.N for globals defined in
+// - assign section with name .addr_space.N for globals defined in
 //   non-zero address space N
 bool BPFCheckAndAdjustIR::insertASpaceCasts(Module &M) {
   bool Changed = false;
@@ -517,13 +517,13 @@ bool BPFCheckAndAdjustIR::insertASpaceCasts(Module &M) {
     Changed |= !CastsCache.empty();
   }
   // Merge all globals within same address space into single
-  // .arena.<addr space no> section
+  // .addr_space.<addr space no> section
   for (GlobalVariable &G : M.globals()) {
     if (G.getAddressSpace() == 0 || G.hasSection())
       continue;
     SmallString<16> SecName;
     raw_svector_ostream OS(SecName);
-    OS << ".arena." << G.getAddressSpace();
+    OS << ".addr_space." << G.getAddressSpace();
     G.setSection(SecName);
     // Prevent having separate section for constants
     G.setConstant(false);
diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index d8139958e9fcf..7b8bcb2c5866d 100644
--- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -242,7 +242,9 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
   bool to_replace = false;
   SDLoc DL(Node);
   const LoadSDNode *LD = cast<LoadSDNode>(Node);
-  uint64_t size = LD->getMemOperand()->getSize();
+  if (!LD->getMemOperand()->getSize().hasValue())
+    return;
+  uint64_t size = LD->getMemOperand()->getSize().getValue();
 
   if (!size || size > 8 || (size & (size - 1)) || !LD->isSimple())
     return;
diff --git a/llvm/lib/Target/BPF/BPFInstrFormats.td b/llvm/lib/Target/BPF/BPFInstrFormats.td
index 6ed83d877ac0f..feffdbc69465e 100644
--- a/llvm/lib/Target/BPF/BPFInstrFormats.td
+++ b/llvm/lib/Target/BPF/BPFInstrFormats.td
@@ -73,6 +73,7 @@ def BPF_JLT  : BPFJumpOp<0xa>;
 def BPF_JLE  : BPFJumpOp<0xb>;
 def BPF_JSLT : BPFJumpOp<0xc>;
 def BPF_JSLE : BPFJumpOp<0xd>;
+def BPF_JCOND : BPFJumpOp<0xe>;
 
 class BPFWidthModifer<bits<2> val> {
   bits<2> Value = val;
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index 7198e9499bc32..66c57952a7f10 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -215,6 +215,18 @@ class JMP_RI<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
   let BPFClass = BPF_JMP;
 }
 
+class JMP_JCOND<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
+    : TYPE_ALU_JMP<Opc.Value, BPF_K.Value,
+                   (outs),
+                   (ins brtarget:$BrDst),
+                   !strconcat(OpcodeStr, " $BrDst"),
+                   Pattern> {
+  bits<16> BrDst;
+
+  let Inst{47-32} = BrDst;
+  let BPFClass = BPF_JMP;
+}
+
 class JMP_RR_32<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
     : TYPE_ALU_JMP<Opc.Value, BPF_X.Value,
                    (outs),
@@ -267,6 +279,7 @@ defm JULE : J<BPF_JLE, "<=", BPF_CC_LEU, BPF_CC_LEU_32>;
 defm JSLT : J<BPF_JSLT, "s<", BPF_CC_LT, BPF_CC_LT_32>;
 defm JSLE : J<BPF_JSLE, "s<=", BPF_CC_LE, BPF_CC_LE_32>;
 defm JSET : J<BPF_JSET, "&", NoCond, NoCond>;
+def JCOND : JMP_JCOND<BPF_JCOND, "may_goto", []>;
 }
 
 // ALU instructions
diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
index dae1aeea35218..afc154988050c 100644
--- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
+++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
@@ -116,8 +116,8 @@ static bool BPFPreserveDITypeImpl(Function &F) {
     GV->setMetadata(LLVMContext::MD_preserve_access_index, MD);
 
     // Load the global variable which represents the type info.
-    auto *LDInst =
-        new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "", Call);
+    auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "",
+                                Call->getIterator());
     Instruction *PassThroughInst =
         BPFCoreSharedInfo::insertPassThrough(M, BB, LDInst, Call);
     Call->replaceAllUsesWith(PassThroughInst);
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 7dad40803d477..44932383fb43e 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -81,7 +81,10 @@ class BPFMCInstrAnalysis : public MCInstrAnalysis {
     // The target is the 3rd operand of cond inst and the 1st of uncond inst.
     int32_t Imm;
     if (isConditionalBranch(Inst)) {
-      Imm = (short)Inst.getOperand(2).getImm();
+      if (Inst.getOpcode() == BPF::JCOND)
+        Imm = (short)Inst.getOperand(0).getImm();
+      else
+        Imm = (short)Inst.getOperand(2).getImm();
     } else if (isUnconditionalBranch(Inst)) {
       if (Inst.getOpcode() == BPF::JMP)
         Imm = (short)Inst.getOperand(0).getImm();
diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt
index bf93280779bf8..4c70b3f9230ed 100644
--- a/llvm/lib/Target/DirectX/CMakeLists.txt
+++ b/llvm/lib/Target/DirectX/CMakeLists.txt
@@ -19,6 +19,7 @@ add_llvm_target(DirectXCodeGen
   DirectXSubtarget.cpp
   DirectXTargetMachine.cpp
   DXContainerGlobals.cpp
+  DXILIntrinsicExpansion.cpp
   DXILMetadata.cpp
   DXILOpBuilder.cpp
   DXILOpLowering.cpp
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 56063b487f684..65cf1dfdb4031 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -28,7 +28,7 @@ using namespace llvm::dxil;
 namespace {
 class DXContainerGlobals : public llvm::ModulePass {
 
-  GlobalVariable *getShaderFlags(Module &M);
+  GlobalVariable *getFeatureFlags(Module &M);
   GlobalVariable *computeShaderHash(Module &M);
 
 public:
@@ -53,21 +53,24 @@ class DXContainerGlobals : public llvm::ModulePass {
 
 bool DXContainerGlobals::runOnModule(Module &M) {
   llvm::SmallVector<GlobalValue *> Globals;
-  Globals.push_back(getShaderFlags(M));
+  Globals.push_back(getFeatureFlags(M));
   Globals.push_back(computeShaderHash(M));
 
   appendToCompilerUsed(M, Globals);
   return true;
 }
 
-GlobalVariable *DXContainerGlobals::getShaderFlags(Module &M) {
-  const uint64_t Flags =
-      (uint64_t)(getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags());
+GlobalVariable *DXContainerGlobals::getFeatureFlags(Module &M) {
+  const uint64_t FeatureFlags =
+      static_cast<uint64_t>(getAnalysis<ShaderFlagsAnalysisWrapper>()
+                                .getShaderFlags()
+                                .getFeatureFlags());
 
-  Constant *FlagsConstant = ConstantInt::get(M.getContext(), APInt(64, Flags));
-  auto *GV = new llvm::GlobalVariable(M, FlagsConstant->getType(), true,
+  Constant *FeatureFlagsConstant =
+      ConstantInt::get(M.getContext(), APInt(64, FeatureFlags));
+  auto *GV = new llvm::GlobalVariable(M, FeatureFlagsConstant->getType(), true,
                                       GlobalValue::PrivateLinkage,
-                                      FlagsConstant, "dx.sfi0");
+                                      FeatureFlagsConstant, "dx.sfi0");
   GV->setSection("SFI0");
   GV->setAlignment(Align(4));
   return GV;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 66b0ef24332c2..216fa5b10c8f4 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -266,12 +266,26 @@ def Frac : DXILOpMapping<22, unary, int_dx_frac,
                          "Returns a fraction from 0 to 1 that represents the "
                          "decimal part of the input.",
                          [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def RSqrt : DXILOpMapping<25, unary, int_dx_rsqrt,
+                         "Returns the reciprocal of the square root of the specified value."
+                         "rsqrt(x) = 1 / sqrt(x).",
+                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
 def Round : DXILOpMapping<26, unary, int_round,
                          "Returns the input rounded to the nearest integer"
                          "within a floating-point type.",
                          [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def FMax : DXILOpMapping<35, binary, int_maxnum,
+                         "Float maximum. FMax(a,b) = a > b ? a : b">;
+def FMin : DXILOpMapping<36, binary, int_minnum,
+                         "Float minimum. FMin(a,b) = a < b ? a : b">;
+def SMax : DXILOpMapping<37, binary, int_smax,
+                         "Signed integer maximum. SMax(a,b) = a > b ? a : b">;
+def SMin : DXILOpMapping<38, binary, int_smin,
+                         "Signed integer minimum. SMin(a,b) = a < b ? a : b">;
 def UMax : DXILOpMapping<39, binary, int_umax,
                          "Unsigned integer maximum. UMax(a,b) = a > b ? a : b">;
+def UMin : DXILOpMapping<40, binary, int_umin,
+                         "Unsigned integer minimum. UMin(a,b) = a < b ? a : b">;
 def FMad : DXILOpMapping<46, tertiary, int_fmuladd,
                          "Floating point arithmetic multiply/add operation. fmad(m,a,b) = m * a + b.">;
 def IMad : DXILOpMapping<48, tertiary, int_dx_imad,
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
new file mode 100644
index 0000000000000..0db42bc0a0fb6
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -0,0 +1,271 @@
+//===- DXILIntrinsicExpansion.cpp - Prepare LLVM Module for DXIL encoding--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This file contains DXIL intrinsic expansions for those that don't have
+//  opcodes in DirectX Intermediate Language (DXIL).
+//===----------------------------------------------------------------------===//
+
+#include "DXILIntrinsicExpansion.h"
+#include "DirectX.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsDirectX.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+
+#define DEBUG_TYPE "dxil-intrinsic-expansion"
+
+using namespace llvm;
+
+static bool isIntrinsicExpansion(Function &F) {
+  switch (F.getIntrinsicID()) {
+  case Intrinsic::exp:
+  case Intrinsic::dx_any:
+  case Intrinsic::dx_clamp:
+  case Intrinsic::dx_uclamp:
+  case Intrinsic::dx_lerp:
+  case Intrinsic::dx_rcp:
+  case Intrinsic::dx_sdot:
+  case Intrinsic::dx_udot:
+    return true;
+  }
+  return false;
+}
+
+static bool expandIntegerDot(CallInst *Orig, Intrinsic::ID DotIntrinsic) {
+  assert(DotIntrinsic == Intrinsic::dx_sdot ||
+         DotIntrinsic == Intrinsic::dx_udot);
+  Intrinsic::ID MadIntrinsic = DotIntrinsic == Intrinsic::dx_sdot
+                                   ? Intrinsic::dx_imad
+                                   : Intrinsic::dx_umad;
+  Value *A = Orig->getOperand(0);
+  Value *B = Orig->getOperand(1);
+  Type *ATy = A->getType();
+  Type *BTy = B->getType();
+  assert(ATy->isVectorTy() && BTy->isVectorTy());
+
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+
+  auto *AVec = dyn_cast<FixedVectorType>(A->getType());
+  Value *Elt0 = Builder.CreateExtractElement(A, (uint64_t)0);
+  Value *Elt1 = Builder.CreateExtractElement(B, (uint64_t)0);
+  Value *Result = Builder.CreateMul(Elt0, Elt1);
+  for (unsigned I = 1; I < AVec->getNumElements(); I++) {
+    Elt0 = Builder.CreateExtractElement(A, I);
+    Elt1 = Builder.CreateExtractElement(B, I);
+    Result = Builder.CreateIntrinsic(Result->getType(), MadIntrinsic,
+                                     ArrayRef<Value *>{Elt0, Elt1, Result},
+                                     nullptr, "dx.mad");
+  }
+  Orig->replaceAllUsesWith(Result);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandExpIntrinsic(CallInst *Orig) {
+  Value *X = Orig->getOperand(0);
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  Type *Ty = X->getType();
+  Type *EltTy = Ty->getScalarType();
+  Constant *Log2eConst =
+      Ty->isVectorTy() ? ConstantVector::getSplat(
+                             ElementCount::getFixed(
+                                 cast<FixedVectorType>(Ty)->getNumElements()),
+                             ConstantFP::get(EltTy, numbers::log2e))
+                       : ConstantFP::get(EltTy, numbers::log2e);
+  Value *NewX = Builder.CreateFMul(Log2eConst, X);
+  auto *Exp2Call =
+      Builder.CreateIntrinsic(Ty, Intrinsic::exp2, {NewX}, nullptr, "dx.exp2");
+  Exp2Call->setTailCall(Orig->isTailCall());
+  Exp2Call->setAttributes(Orig->getAttributes());
+  Orig->replaceAllUsesWith(Exp2Call);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandAnyIntrinsic(CallInst *Orig) {
+  Value *X = Orig->getOperand(0);
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  Type *Ty = X->getType();
+  Type *EltTy = Ty->getScalarType();
+
+  if (!Ty->isVectorTy()) {
+    Value *Cond = EltTy->isFloatingPointTy()
+                      ? Builder.CreateFCmpUNE(X, ConstantFP::get(EltTy, 0))
+                      : Builder.CreateICmpNE(X, ConstantInt::get(EltTy, 0));
+    Orig->replaceAllUsesWith(Cond);
+  } else {
+    auto *XVec = dyn_cast<FixedVectorType>(Ty);
+    Value *Cond =
+        EltTy->isFloatingPointTy()
+            ? Builder.CreateFCmpUNE(
+                  X, ConstantVector::getSplat(
+                         ElementCount::getFixed(XVec->getNumElements()),
+                         ConstantFP::get(EltTy, 0)))
+            : Builder.CreateICmpNE(
+                  X, ConstantVector::getSplat(
+                         ElementCount::getFixed(XVec->getNumElements()),
+                         ConstantInt::get(EltTy, 0)));
+    Value *Result = Builder.CreateExtractElement(Cond, (uint64_t)0);
+    for (unsigned I = 1; I < XVec->getNumElements(); I++) {
+      Value *Elt = Builder.CreateExtractElement(Cond, I);
+      Result = Builder.CreateOr(Result, Elt);
+    }
+    Orig->replaceAllUsesWith(Result);
+  }
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandLerpIntrinsic(CallInst *Orig) {
+  Value *X = Orig->getOperand(0);
+  Value *Y = Orig->getOperand(1);
+  Value *S = Orig->getOperand(2);
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  auto *V = Builder.CreateFSub(Y, X);
+  V = Builder.CreateFMul(S, V);
+  auto *Result = Builder.CreateFAdd(X, V, "dx.lerp");
+  Orig->replaceAllUsesWith(Result);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandRcpIntrinsic(CallInst *Orig) {
+  Value *X = Orig->getOperand(0);
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  Type *Ty = X->getType();
+  Type *EltTy = Ty->getScalarType();
+  Constant *One =
+      Ty->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    dyn_cast<FixedVectorType>(Ty)->getNumElements()),
+                ConstantFP::get(EltTy, 1.0))
+          : ConstantFP::get(EltTy, 1.0);
+  auto *Result = Builder.CreateFDiv(One, X, "dx.rcp");
+  Orig->replaceAllUsesWith(Result);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static Intrinsic::ID getMaxForClamp(Type *ElemTy,
+                                    Intrinsic::ID ClampIntrinsic) {
+  if (ClampIntrinsic == Intrinsic::dx_uclamp)
+    return Intrinsic::umax;
+  assert(ClampIntrinsic == Intrinsic::dx_clamp);
+  if (ElemTy->isVectorTy())
+    ElemTy = ElemTy->getScalarType();
+  if (ElemTy->isIntegerTy())
+    return Intrinsic::smax;
+  assert(ElemTy->isFloatingPointTy());
+  return Intrinsic::maxnum;
+}
+
+static Intrinsic::ID getMinForClamp(Type *ElemTy,
+                                    Intrinsic::ID ClampIntrinsic) {
+  if (ClampIntrinsic == Intrinsic::dx_uclamp)
+    return Intrinsic::umin;
+  assert(ClampIntrinsic == Intrinsic::dx_clamp);
+  if (ElemTy->isVectorTy())
+    ElemTy = ElemTy->getScalarType();
+  if (ElemTy->isIntegerTy())
+    return Intrinsic::smin;
+  assert(ElemTy->isFloatingPointTy());
+  return Intrinsic::minnum;
+}
+
+static bool expandClampIntrinsic(CallInst *Orig, Intrinsic::ID ClampIntrinsic) {
+  Value *X = Orig->getOperand(0);
+  Value *Min = Orig->getOperand(1);
+  Value *Max = Orig->getOperand(2);
+  Type *Ty = X->getType();
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  auto *MaxCall = Builder.CreateIntrinsic(
+      Ty, getMaxForClamp(Ty, ClampIntrinsic), {X, Min}, nullptr, "dx.max");
+  auto *MinCall =
+      Builder.CreateIntrinsic(Ty, getMinForClamp(Ty, ClampIntrinsic),
+                              {MaxCall, Max}, nullptr, "dx.min");
+
+  Orig->replaceAllUsesWith(MinCall);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandIntrinsic(Function &F, CallInst *Orig) {
+  switch (F.getIntrinsicID()) {
+  case Intrinsic::exp:
+    return expandExpIntrinsic(Orig);
+  case Intrinsic::dx_any:
+    return expandAnyIntrinsic(Orig);
+  case Intrinsic::dx_uclamp:
+  case Intrinsic::dx_clamp:
+    return expandClampIntrinsic(Orig, F.getIntrinsicID());
+  case Intrinsic::dx_lerp:
+    return expandLerpIntrinsic(Orig);
+  case Intrinsic::dx_rcp:
+    return expandRcpIntrinsic(Orig);
+  case Intrinsic::dx_sdot:
+  case Intrinsic::dx_udot:
+    return expandIntegerDot(Orig, F.getIntrinsicID());
+  }
+  return false;
+}
+
+static bool expansionIntrinsics(Module &M) {
+  for (auto &F : make_early_inc_range(M.functions())) {
+    if (!isIntrinsicExpansion(F))
+      continue;
+    bool IntrinsicExpanded = false;
+    for (User *U : make_early_inc_range(F.users())) {
+      auto *IntrinsicCall = dyn_cast<CallInst>(U);
+      if (!IntrinsicCall)
+        continue;
+      IntrinsicExpanded = expandIntrinsic(F, IntrinsicCall);
+    }
+    if (F.user_empty() && IntrinsicExpanded)
+      F.eraseFromParent();
+  }
+  return true;
+}
+
+PreservedAnalyses DXILIntrinsicExpansion::run(Module &M,
+                                              ModuleAnalysisManager &) {
+  if (expansionIntrinsics(M))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+bool DXILIntrinsicExpansionLegacy::runOnModule(Module &M) {
+  return expansionIntrinsics(M);
+}
+
+char DXILIntrinsicExpansionLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(DXILIntrinsicExpansionLegacy, DEBUG_TYPE,
+                      "DXIL Intrinsic Expansion", false, false)
+INITIALIZE_PASS_END(DXILIntrinsicExpansionLegacy, DEBUG_TYPE,
+                    "DXIL Intrinsic Expansion", false, false)
+
+ModulePass *llvm::createDXILIntrinsicExpansionLegacyPass() {
+  return new DXILIntrinsicExpansionLegacy();
+}
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.h b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.h
new file mode 100644
index 0000000000000..c86681af7a371
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.h
@@ -0,0 +1,33 @@
+//===- DXILIntrinsicExpansion.h - Prepare LLVM Module for DXIL encoding----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TARGET_DIRECTX_DXILINTRINSICEXPANSION_H
+#define LLVM_TARGET_DIRECTX_DXILINTRINSICEXPANSION_H
+
+#include "DXILResource.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+/// A pass that transforms DXIL Intrinsics that don't have DXIL opCodes
+class DXILIntrinsicExpansion : public PassInfoMixin<DXILIntrinsicExpansion> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
+};
+
+class DXILIntrinsicExpansionLegacy : public ModulePass {
+
+public:
+  bool runOnModule(Module &M) override;
+  DXILIntrinsicExpansionLegacy() : ModulePass(ID) {}
+
+  static char ID; // Pass identification.
+};
+} // namespace llvm
+
+#endif // LLVM_TARGET_DIRECTX_DXILINTRINSICEXPANSION_H
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 6b649b76beecd..e5c2042e7d16a 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "DXILConstants.h"
+#include "DXILIntrinsicExpansion.h"
 #include "DXILOpBuilder.h"
 #include "DirectX.h"
 #include "llvm/ADT/SmallVector.h"
@@ -94,9 +95,12 @@ class DXILOpLoweringLegacy : public ModulePass {
   DXILOpLoweringLegacy() : ModulePass(ID) {}
 
   static char ID; // Pass identification.
+  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
+    // Specify the passes that your pass depends on
+    AU.addRequired<DXILIntrinsicExpansionLegacy>();
+  }
 };
 char DXILOpLoweringLegacy::ID = 0;
-
 } // end anonymous namespace
 
 INITIALIZE_PASS_BEGIN(DXILOpLoweringLegacy, DEBUG_TYPE, "DXIL Op Lowering",
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index 66a9dc46bcbfb..9fa137b4c025e 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -51,9 +51,14 @@ void ComputedShaderFlags::print(raw_ostream &OS) const {
   if (FlagVal == 0)
     return;
   OS << "; Note: shader requires additional functionality:\n";
-#define SHADER_FEATURE_FLAG(bit, FlagName, Str)                                \
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleNum, FlagName, Str)          \
   if (FlagName)                                                                \
-    OS << ";       " Str "\n";
+    (OS << ";").indent(7) << Str << "\n";
+#include "llvm/BinaryFormat/DXContainerConstants.def"
+  OS << "; Note: extra DXIL module flags:\n";
+#define DXIL_MODULE_FLAG(DxilModuleBit, FlagName, Str)                         \
+  if (FlagName)                                                                \
+    (OS << ";").indent(7) << Str << "\n";
 #include "llvm/BinaryFormat/DXContainerConstants.def"
   OS << ";\n";
 }
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h
index 574a7b090f528..1df7d27de13d3 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.h
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h
@@ -14,7 +14,6 @@
 #ifndef LLVM_TARGET_DIRECTX_DXILSHADERFLAGS_H
 #define LLVM_TARGET_DIRECTX_DXILSHADERFLAGS_H
 
-#include "llvm/BinaryFormat/DXContainer.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Compiler.h"
@@ -29,22 +28,37 @@ class GlobalVariable;
 namespace dxil {
 
 struct ComputedShaderFlags {
-#define SHADER_FEATURE_FLAG(bit, FlagName, Str) bool FlagName : 1;
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleBit, FlagName, Str)          \
+  bool FlagName : 1;
+#define DXIL_MODULE_FLAG(DxilModuleBit, FlagName, Str) bool FlagName : 1;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 
-#define SHADER_FEATURE_FLAG(bit, FlagName, Str) FlagName = false;
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleBit, FlagName, Str)          \
+  FlagName = false;
+#define DXIL_MODULE_FLAG(DxilModuleBit, FlagName, Str) FlagName = false;
   ComputedShaderFlags() {
 #include "llvm/BinaryFormat/DXContainerConstants.def"
   }
 
+  constexpr uint64_t getMask(int Bit) const {
+    return Bit != -1 ? 1ull << Bit : 0;
+  }
   operator uint64_t() const {
     uint64_t FlagValue = 0;
-#define SHADER_FEATURE_FLAG(bit, FlagName, Str)                                \
-  FlagValue |=                                                                 \
-      FlagName ? static_cast<uint64_t>(dxbc::FeatureFlags::FlagName) : 0ull;
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleBit, FlagName, Str)          \
+  FlagValue |= FlagName ? getMask(DxilModuleBit) : 0ull;
+#define DXIL_MODULE_FLAG(DxilModuleBit, FlagName, Str)                         \
+  FlagValue |= FlagName ? getMask(DxilModuleBit) : 0ull;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
     return FlagValue;
   }
+  uint64_t getFeatureFlags() const {
+    uint64_t FeatureFlags = 0;
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleBit, FlagName, Str)          \
+  FeatureFlags |= FlagName ? getMask(FeatureBit) : 0ull;
+#include "llvm/BinaryFormat/DXContainerConstants.def"
+    return FeatureFlags;
+  }
 
   static ComputedShaderFlags computeFlags(Module &M);
   void print(raw_ostream &OS = dbgs()) const;
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index 9c959c66be8bd..80d94bf0c9d48 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -53,8 +53,8 @@ bool DXILTranslateMetadata::runOnModule(Module &M) {
       getAnalysis<DXILResourceWrapper>().getDXILResource();
   Res.write(M);
 
-  const uint64_t Flags =
-      (uint64_t)(getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags());
+  const uint64_t Flags = static_cast<uint64_t>(
+      getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags());
   dxil::createEntryMD(M, Flags);
 
   return false;
diff --git a/llvm/lib/Target/DirectX/DirectX.h b/llvm/lib/Target/DirectX/DirectX.h
index eaecc3ac280c4..11b5412c21d78 100644
--- a/llvm/lib/Target/DirectX/DirectX.h
+++ b/llvm/lib/Target/DirectX/DirectX.h
@@ -28,6 +28,12 @@ void initializeDXILPrepareModulePass(PassRegistry &);
 /// Pass to convert modules into DXIL-compatable modules
 ModulePass *createDXILPrepareModulePass();
 
+/// Initializer for DXIL Intrinsic Expansion
+void initializeDXILIntrinsicExpansionLegacyPass(PassRegistry &);
+
+/// Pass to expand intrinsic operations that lack DXIL opCodes
+ModulePass *createDXILIntrinsicExpansionLegacyPass();
+
 /// Initializer for DXILOpLowering
 void initializeDXILOpLoweringLegacyPass(PassRegistry &);
 
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index 06938f8c74f15..03c825b3977db 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -39,6 +39,7 @@ using namespace llvm;
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
   RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget());
   auto *PR = PassRegistry::getPassRegistry();
+  initializeDXILIntrinsicExpansionLegacyPass(*PR);
   initializeDXILPrepareModulePass(*PR);
   initializeEmbedDXILPassPass(*PR);
   initializeWriteDXILPassPass(*PR);
@@ -76,6 +77,7 @@ class DirectXPassConfig : public TargetPassConfig {
 
   FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; }
   void addCodeGenPrepare() override {
+    addPass(createDXILIntrinsicExpansionLegacyPass());
     addPass(createDXILOpLoweringLegacyPass());
     addPass(createDXILPrepareModulePass());
     addPass(createDXILTranslateMetadataPass());
diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index fd7d25fa16d1d..864591d4eb955 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -43,6 +43,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/HexagonAttributes.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/SourceMgr.h"
@@ -79,7 +80,7 @@ static cl::opt<bool> ErrorNoncontigiousRegister(
     "merror-noncontigious-register",
     cl::desc("Error for register names that aren't contigious"),
     cl::init(false));
-
+static cl::opt<bool> AddBuildAttributes("hexagon-add-build-attributes");
 namespace {
 
 struct HexagonOperand;
@@ -120,6 +121,9 @@ class HexagonAsmParser : public MCTargetAsmParser {
                                SMLoc &EndLoc) override;
   bool ParseDirectiveSubsection(SMLoc L);
   bool ParseDirectiveComm(bool IsLocal, SMLoc L);
+
+  bool parseDirectiveAttribute(SMLoc L);
+
   bool RegisterMatchesArch(unsigned MatchNum) const;
 
   bool matchBundleOptions();
@@ -164,6 +168,9 @@ class HexagonAsmParser : public MCTargetAsmParser {
     Parser.addAliasForDirective(".word", ".4byte");
 
     MCAsmParserExtension::Initialize(_Parser);
+
+    if (AddBuildAttributes)
+      getTargetStreamer().emitTargetAttributes(*STI);
   }
 
   bool splitIdentifier(OperandVector &Operands);
@@ -652,6 +659,56 @@ bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return finishBundle(IDLoc, Out);
   return false;
 }
+/// parseDirectiveAttribute
+///  ::= .attribute int, int
+///  ::= .attribute Tag_name, int
+bool HexagonAsmParser::parseDirectiveAttribute(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  int64_t Tag;
+  SMLoc TagLoc = Parser.getTok().getLoc();
+  if (Parser.getTok().is(AsmToken::Identifier)) {
+    StringRef Name = Parser.getTok().getIdentifier();
+    std::optional<unsigned> Ret = ELFAttrs::attrTypeFromString(
+        Name, HexagonAttrs::getHexagonAttributeTags());
+    if (!Ret)
+      return Error(TagLoc, "attribute name not recognized: " + Name);
+    Tag = *Ret;
+    Parser.Lex();
+  } else {
+    const MCExpr *AttrExpr;
+
+    TagLoc = Parser.getTok().getLoc();
+    if (Parser.parseExpression(AttrExpr))
+      return true;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(AttrExpr);
+    if (check(!CE, TagLoc, "expected numeric constant"))
+      return true;
+
+    Tag = CE->getValue();
+  }
+
+  if (Parser.parseComma())
+    return true;
+
+  // We currently only have integer values.
+  int64_t IntegerValue = 0;
+  SMLoc ValueExprLoc = Parser.getTok().getLoc();
+  const MCExpr *ValueExpr;
+  if (Parser.parseExpression(ValueExpr))
+    return true;
+
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ValueExpr);
+  if (!CE)
+    return Error(ValueExprLoc, "expected numeric constant");
+  IntegerValue = CE->getValue();
+
+  if (Parser.parseEOL())
+    return true;
+
+  getTargetStreamer().emitAttribute(Tag, IntegerValue);
+  return false;
+}
 
 /// ParseDirective parses the Hexagon specific directives
 bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) {
@@ -664,6 +721,8 @@ bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) {
     return ParseDirectiveComm(false, DirectiveID.getLoc());
   if (IDVal.lower() == ".subsection")
     return ParseDirectiveSubsection(DirectiveID.getLoc());
+  if (IDVal == ".attribute")
+    return parseDirectiveAttribute(DirectiveID.getLoc());
 
   return true;
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 4ee67cb05d495..d2f64ac9e90b0 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -17,6 +17,7 @@
 #include "HexagonInstrInfo.h"
 #include "HexagonRegisterInfo.h"
 #include "HexagonSubtarget.h"
+#include "HexagonTargetStreamer.h"
 #include "MCTargetDesc/HexagonInstPrinter.h"
 #include "MCTargetDesc/HexagonMCExpr.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
@@ -46,6 +47,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -775,6 +777,24 @@ void HexagonAsmPrinter::emitInstruction(const MachineInstr *MI) {
   OutStreamer->emitInstruction(MCB, getSubtargetInfo());
 }
 
+void HexagonAsmPrinter::emitStartOfAsmFile(Module &M) {
+  if (TM.getTargetTriple().isOSBinFormatELF())
+    emitAttributes();
+}
+
+void HexagonAsmPrinter::emitEndOfAsmFile(Module &M) {
+  HexagonTargetStreamer &HTS =
+      static_cast<HexagonTargetStreamer &>(*OutStreamer->getTargetStreamer());
+  if (TM.getTargetTriple().isOSBinFormatELF())
+    HTS.finishAttributeSection();
+}
+
+void HexagonAsmPrinter::emitAttributes() {
+  HexagonTargetStreamer &HTS =
+      static_cast<HexagonTargetStreamer &>(*OutStreamer->getTargetStreamer());
+  HTS.emitTargetAttributes(*TM.getMCSubtargetInfo());
+}
+
 void HexagonAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
   static const int8_t NoopsInSledCount = 4;
   // We want to emit the following pattern:
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
index 5cb1edf54d1a7..b555c88596503 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -29,6 +29,8 @@ class TargetMachine;
   class HexagonAsmPrinter : public AsmPrinter {
     const HexagonSubtarget *Subtarget = nullptr;
 
+    void emitAttributes();
+
   public:
     explicit HexagonAsmPrinter(TargetMachine &TM,
                                std::unique_ptr<MCStreamer> Streamer)
@@ -68,6 +70,8 @@ class TargetMachine;
                          const char *ExtraCode, raw_ostream &OS) override;
     bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                                const char *ExtraCode, raw_ostream &OS) override;
+    void emitStartOfAsmFile(Module &M) override;
+    void emitEndOfAsmFile(Module &M) override;
   };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 4ead8c68cb901..b0de0405ce961 100644
--- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -1109,7 +1109,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
           break;
       }
     }
-    NewInst = GetElementPtrInst::Create(InpTy, Input, IdxList, "cgep", &*At);
+    NewInst = GetElementPtrInst::Create(InpTy, Input, IdxList, "cgep", At);
     NewInst->setIsInBounds(RN->Flags & GepNode::InBounds);
     LLVM_DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
     if (Idx < Num) {
diff --git a/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
index a647e699a8f07..9d8e5c53b8227 100644
--- a/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -292,8 +292,8 @@ bool HexagonStoreWidening::storesAreAdjacent(const MachineInstr *S1,
   int Off1 = S1->getOperand(1).getImm();
   int Off2 = S2->getOperand(1).getImm();
 
-  return (Off1 >= 0) ? Off1+S1MO.getSize() == unsigned(Off2)
-                     : int(Off1+S1MO.getSize()) == Off2;
+  return (Off1 >= 0) ? Off1 + S1MO.getSize().getValue() == unsigned(Off2)
+                     : int(Off1 + S1MO.getSize().getValue()) == Off2;
 }
 
 /// Given a sequence of adjacent stores, and a maximum size of a single wide
@@ -315,7 +315,7 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
   assert(!FirstMI->memoperands_empty() && "Expecting some memory operands");
   const MachineMemOperand &FirstMMO = getStoreTarget(FirstMI);
   unsigned Alignment = FirstMMO.getAlign().value();
-  unsigned SizeAccum = FirstMMO.getSize();
+  unsigned SizeAccum = FirstMMO.getSize().getValue();
   unsigned FirstOffset = getStoreOffset(FirstMI);
 
   // The initial value of SizeAccum should always be a power of 2.
@@ -357,7 +357,7 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
     if (!storesAreAdjacent(S1, S2))
       break;
 
-    unsigned S2Size = getStoreTarget(S2).getSize();
+    unsigned S2Size = getStoreTarget(S2).getSize().getValue();
     if (SizeAccum + S2Size > std::min(MaxSize, Alignment))
       break;
 
@@ -405,7 +405,7 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
     MachineOperand &SO = MI->getOperand(2);  // Source.
     assert(SO.isImm() && "Expecting an immediate operand");
 
-    unsigned NBits = MMO.getSize()*8;
+    unsigned NBits = MMO.getSize().getValue() * 8;
     unsigned Mask = (0xFFFFFFFFU >> (32-NBits));
     unsigned Val = (SO.getImm() & Mask) << Shift;
     Acc |= Val;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h b/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
index ff84363106629..a2f93d476bfe1 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
@@ -24,6 +24,15 @@ class HexagonTargetStreamer : public MCTargetStreamer {
   virtual void emitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
                                            unsigned ByteAlign,
                                            unsigned AccessGranularity){};
+  void finish() override {}
+
+  virtual void finishAttributeSection() {}
+
+  virtual void emitAttribute(unsigned Attribute, unsigned Value) {}
+
+  void emitTargetAttributes(const MCSubtargetInfo &STI);
+
+  virtual void reset() {}
 };
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index cf4b66f8bf861..458b8717256f2 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -138,20 +138,6 @@ ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth,
   return ElementCount::getFixed((8 * ST.getVectorLength()) / ElemWidth);
 }
 
-InstructionCost HexagonTTIImpl::getScalarizationOverhead(
-    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
-    TTI::TargetCostKind CostKind) {
-  return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
-                                         CostKind);
-}
-
-InstructionCost
-HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                 ArrayRef<Type *> Tys,
-                                                 TTI::TargetCostKind CostKind) {
-  return BaseT::getOperandsScalarizationOverhead(Args, Tys, CostKind);
-}
-
 InstructionCost HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
                                                  ArrayRef<Type *> Tys,
                                                  TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index ec0fd454c8084..fdb34f308e641 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -103,14 +103,6 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
     return true;
   }
 
-  InstructionCost getScalarizationOverhead(VectorType *Ty,
-                                           const APInt &DemandedElts,
-                                           bool Insert, bool Extract,
-                                           TTI::TargetCostKind CostKind);
-  InstructionCost
-  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   ArrayRef<Type *> Tys,
-                                   TTI::TargetCostKind CostKind);
   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
                                    ArrayRef<Type *> Tys,
                                    TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index d1678fa0241ca..22c82bf5ac1b4 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "HexagonTargetStreamer.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "MCTargetDesc/HexagonMCShuffler.h"
 #include "llvm/ADT/StringRef.h"
@@ -27,11 +29,13 @@
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/HexagonAttributes.h"
 #include "llvm/Support/MathExtras.h"
 #include <cassert>
 #include <cstdint>
@@ -147,6 +151,63 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol,
   HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize);
 }
 
+static unsigned featureToArchVersion(unsigned Feature) {
+  switch (Feature) {
+  case Hexagon::ArchV5:
+    return 5;
+  case Hexagon::ArchV55:
+    return 55;
+  case Hexagon::ArchV60:
+  case Hexagon::ExtensionHVXV60:
+    return 60;
+  case Hexagon::ArchV62:
+  case Hexagon::ExtensionHVXV62:
+    return 62;
+  case Hexagon::ArchV65:
+  case Hexagon::ExtensionHVXV65:
+    return 65;
+  case Hexagon::ArchV66:
+  case Hexagon::ExtensionHVXV66:
+    return 66;
+  case Hexagon::ArchV67:
+  case Hexagon::ExtensionHVXV67:
+    return 67;
+  case Hexagon::ArchV68:
+  case Hexagon::ExtensionHVXV68:
+    return 68;
+  case Hexagon::ArchV69:
+  case Hexagon::ExtensionHVXV69:
+    return 69;
+  case Hexagon::ArchV71:
+  case Hexagon::ExtensionHVXV71:
+    return 71;
+  case Hexagon::ArchV73:
+  case Hexagon::ExtensionHVXV73:
+    return 73;
+  }
+  llvm_unreachable("Expected valid arch feature");
+  return 0;
+}
+
+void HexagonTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
+  auto Features = STI.getFeatureBits();
+  unsigned Arch = featureToArchVersion(Hexagon_MC::getArchVersion(Features));
+  std::optional<unsigned> HVXArch = Hexagon_MC::getHVXVersion(Features);
+  emitAttribute(HexagonAttrs::ARCH, Arch);
+  if (HVXArch)
+    emitAttribute(HexagonAttrs::HVXARCH, featureToArchVersion(*HVXArch));
+  if (Features.test(Hexagon::ExtensionHVXIEEEFP))
+    emitAttribute(HexagonAttrs::HVXIEEEFP, 1);
+  if (Features.test(Hexagon::ExtensionHVXQFloat))
+    emitAttribute(HexagonAttrs::HVXQFLOAT, 1);
+  if (Features.test(Hexagon::ExtensionZReg))
+    emitAttribute(HexagonAttrs::ZREG, 1);
+  if (Features.test(Hexagon::ExtensionAudio))
+    emitAttribute(HexagonAttrs::AUDIO, 1);
+  if (Features.test(Hexagon::FeatureCabac))
+    emitAttribute(HexagonAttrs::CABAC, 1);
+}
+
 namespace llvm {
 MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
                                      std::unique_ptr<MCAsmBackend> MAB,
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 0740ac58a3381..dc8328a6705da 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -35,6 +35,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/HexagonAttributes.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
 #include <cstdint>
@@ -224,12 +225,13 @@ bool isSlot0Only(unsigned units) {
 namespace {
 
 class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
+  formatted_raw_ostream &OS;
+  bool IsVerboseAsm;
+
 public:
-  HexagonTargetAsmStreamer(MCStreamer &S,
-                           formatted_raw_ostream &OS,
-                           bool isVerboseAsm,
-                           MCInstPrinter &IP)
-      : HexagonTargetStreamer(S) {}
+  HexagonTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+                           bool IsVerboseAsm, MCInstPrinter &IP)
+      : HexagonTargetStreamer(S), OS(OS), IsVerboseAsm(IsVerboseAsm) {}
 
   void prettyPrintAsm(MCInstPrinter &InstPrinter, uint64_t Address,
                       const MCInst &Inst, const MCSubtargetInfo &STI,
@@ -266,6 +268,21 @@ class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
     else
       OS << "\t}" << PacketBundle.second;
   }
+
+  void finish() override { finishAttributeSection(); }
+
+  void finishAttributeSection() override {}
+
+  void emitAttribute(unsigned Attribute, unsigned Value) override {
+    OS << "\t.attribute\t" << Attribute << ", " << Twine(Value);
+    if (IsVerboseAsm) {
+      StringRef Name = ELFAttrs::attrTypeAsString(
+          Attribute, HexagonAttrs::getHexagonAttributeTags());
+      if (!Name.empty())
+        OS << "\t// " << Name;
+    }
+    OS << "\n";
+  }
 };
 
 class HexagonTargetELFStreamer : public HexagonTargetStreamer {
@@ -279,7 +296,6 @@ class HexagonTargetELFStreamer : public HexagonTargetStreamer {
     MCA.setELFHeaderEFlags(Hexagon_MC::GetELFFlags(STI));
   }
 
-
   void emitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
                               unsigned ByteAlignment,
                               unsigned AccessSize) override {
@@ -297,6 +313,27 @@ class HexagonTargetELFStreamer : public HexagonTargetStreamer {
     HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol(
         Symbol, Size, Align(ByteAlignment), AccessSize);
   }
+
+  void finish() override { finishAttributeSection(); }
+
+  void reset() override { AttributeSection = nullptr; }
+
+private:
+  MCSection *AttributeSection = nullptr;
+
+  void finishAttributeSection() override {
+    MCELFStreamer &S = getStreamer();
+    if (S.Contents.empty())
+      return;
+
+    S.emitAttributesSection("hexagon", ".hexagon.attributes",
+                            ELF::SHT_HEXAGON_ATTRIBUTES, AttributeSection);
+  }
+
+  void emitAttribute(uint32_t Attribute, uint32_t Value) override {
+    getStreamer().setAttributeItem(Attribute, Value,
+                                   /*OverwriteExisting=*/true);
+  }
 };
 
 } // end anonymous namespace
@@ -591,6 +628,29 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI, StringRef FS) {
   }
 }
 
+std::optional<unsigned>
+Hexagon_MC::getHVXVersion(const FeatureBitset &Features) {
+  for (auto Arch : {Hexagon::ExtensionHVXV73, Hexagon::ExtensionHVXV71,
+                    Hexagon::ExtensionHVXV69, Hexagon::ExtensionHVXV68,
+                    Hexagon::ExtensionHVXV67, Hexagon::ExtensionHVXV66,
+                    Hexagon::ExtensionHVXV65, Hexagon::ExtensionHVXV62,
+                    Hexagon::ExtensionHVXV60})
+    if (Features.test(Arch))
+      return Arch;
+  return {};
+}
+
+unsigned Hexagon_MC::getArchVersion(const FeatureBitset &Features) {
+  for (auto Arch :
+       {Hexagon::ArchV73, Hexagon::ArchV71, Hexagon::ArchV69, Hexagon::ArchV68,
+        Hexagon::ArchV67, Hexagon::ArchV66, Hexagon::ArchV65, Hexagon::ArchV62,
+        Hexagon::ArchV60, Hexagon::ArchV55, Hexagon::ArchV5})
+    if (Features.test(Arch))
+      return Arch;
+  llvm_unreachable("Expected arch v5-v73");
+  return 0;
+}
+
 unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
   return StringSwitch<unsigned>(STI.getCPU())
       .Case("generic", llvm::ELF::EF_HEXAGON_MACH_V5)
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index ffb81bca208df..6110c7a93982c 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -81,7 +81,11 @@ namespace Hexagon_MC {
   unsigned GetELFFlags(const MCSubtargetInfo &STI);
 
   llvm::ArrayRef<MCPhysReg> GetVectRegRev();
-}
+
+  std::optional<unsigned> getHVXVersion(const FeatureBitset &Features);
+
+  unsigned getArchVersion(const FeatureBitset &Features);
+  } // namespace Hexagon_MC
 
 MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
                                           MCContext &MCT);
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 46f63a4103f9f..cf163e4e12001 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -453,6 +453,14 @@ class LoongArchOperand : public MCParsedAsmOperand {
   }
 
   bool isImm32() const { return isSImm<32>() || isUImm<32>(); }
+  bool isImm64() const {
+    if (!isImm())
+      return false;
+    int64_t Imm;
+    LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+    return IsConstantImm && VK == LoongArchMCExpr::VK_LoongArch_None;
+  }
 
   /// Gets location of the first token of this operand.
   SMLoc getStartLoc() const override { return StartLoc; }
@@ -1514,6 +1522,10 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     SMLoc ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc();
     return Error(ErrorLoc, "operand must be a 32 bit immediate");
   }
+  case Match_InvalidImm64: {
+    SMLoc ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc();
+    return Error(ErrorLoc, "operand must be a 64 bit immediate");
+  }
   case Match_InvalidBareSymbol: {
     SMLoc ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc();
     return Error(ErrorLoc, "operand must be a bare symbol name");
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index fcfff08d1a9f5..80429bc45be14 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -219,6 +219,9 @@ def grlenimm : Operand<GRLenVT>;
 def imm32 : Operand<GRLenVT> {
   let ParserMatchClass = ImmAsmOperand<"", 32, "">;
 }
+def imm64 : Operand<i64> {
+  let ParserMatchClass = ImmAsmOperand<"", 64, "">;
+}
 
 def uimm1 : Operand<GRLenVT>, ImmLeaf<GRLenVT, [{return isUInt<1>(Imm);}]>{
   let ParserMatchClass = UImmAsmOperand<1>;
@@ -2179,7 +2182,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
     isAsmParserOnly = 1 in {
 def PseudoLI_W : Pseudo<(outs GPR:$rd), (ins imm32:$imm), [],
                         "li.w", "$rd, $imm">;
-def PseudoLI_D : Pseudo<(outs GPR:$rd), (ins grlenimm:$imm), [],
+def PseudoLI_D : Pseudo<(outs GPR:$rd), (ins imm64:$imm), [],
                         "li.d", "$rd, $imm">, Requires<[IsLA64]>;
 }
 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index adfcea7361583..bdf299ae8ac11 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1255,28 +1255,6 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
     emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI);
   }
 
-#if 0
-  // We haven't support -mabicalls -mno-shared yet.
-  if (-mno-shared) {
-    MCSymbol *GPSym = MCA.getContext().getOrCreateSymbol("__gnu_local_gp");
-    const MipsMCExpr *HiExpr = MipsMCExpr::create(
-        MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
-        MCA.getContext());
-    const MipsMCExpr *LoExpr = MipsMCExpr::create(
-        MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
-        MCA.getContext());
-
-    // lui $gp, %hi(__gnu_local_gp)
-    emitRX(Mips::LUi, GPReg, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
-
-    // addiu  $gp, $gp, %lo(__gnu_local_gp)
-    emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(),
-            &STI);
-
-    return;
-  }
-#endif
-
   const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
       MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
       MCA.getContext());
diff --git a/llvm/lib/Target/Mips/Mips.td b/llvm/lib/Target/Mips/Mips.td
index dbff25dfa0f36..923e0c9cdde75 100644
--- a/llvm/lib/Target/Mips/Mips.td
+++ b/llvm/lib/Target/Mips/Mips.td
@@ -208,6 +208,10 @@ def FeatureUseIndirectJumpsHazard : SubtargetFeature<"use-indirect-jump-hazard",
                                                     "true", "Use indirect jump"
                         " guards to prevent certain speculation based attacks">;
 
+def FeatureStrictAlign
+    : SubtargetFeature<"strict-align", "StrictAlign", "true",
+                       "Disable unaligned load store for r6">;
+
 //===----------------------------------------------------------------------===//
 // Register File, Calling Conv, Instruction Descriptions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/llvm/lib/Target/Mips/Mips16HardFloat.cpp
index 5c96044b1f0b8..f7b623785b70c 100644
--- a/llvm/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -414,7 +414,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
             C, Attribute::getWithMemoryEffects(C, MemoryEffects::none()));
         A = A.addFnAttribute(C, Attribute::NoInline);
         FunctionCallee F = (M->getOrInsertFunction(Name, A, MyVoid, T));
-        CallInst::Create(F, Params, "", &I);
+        CallInst::Create(F, Params, "", I.getIterator());
       } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
         FunctionType *FT = CI->getFunctionType();
         Function *F_ =  CI->getCalledFunction();
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 7e5f148e7bf4e..0a0d40751fcf0 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -365,8 +365,13 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::JumpTable,          MVT::i64,   Custom);
     setOperationAction(ISD::ConstantPool,       MVT::i64,   Custom);
     setOperationAction(ISD::SELECT,             MVT::i64,   Custom);
-    setOperationAction(ISD::LOAD,               MVT::i64,   Custom);
-    setOperationAction(ISD::STORE,              MVT::i64,   Custom);
+    if (Subtarget.hasMips64r6()) {
+      setOperationAction(ISD::LOAD,               MVT::i64,   Legal);
+      setOperationAction(ISD::STORE,              MVT::i64,   Legal);
+    } else {
+      setOperationAction(ISD::LOAD,               MVT::i64,   Custom);
+      setOperationAction(ISD::STORE,              MVT::i64,   Custom);
+    }
     setOperationAction(ISD::FP_TO_SINT,         MVT::i64,   Custom);
     setOperationAction(ISD::SHL_PARTS,          MVT::i64,   Custom);
     setOperationAction(ISD::SRA_PARTS,          MVT::i64,   Custom);
@@ -481,7 +486,12 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
   if (!Subtarget.hasMips64r2())
     setOperationAction(ISD::BSWAP, MVT::i64, Expand);
 
-  if (Subtarget.isGP64bit()) {
+  if (Subtarget.isGP64bit() && Subtarget.hasMips64r6()) {
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i32, Legal);
+    setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i32, Legal);
+    setTruncStoreAction(MVT::i64, MVT::i32, Legal);
+  } else if (Subtarget.isGP64bit()) {
     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Custom);
     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i32, Custom);
     setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i32, Custom);
diff --git a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
index 654f29d08af0b..4e1e27088cce4 100644
--- a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -184,7 +184,8 @@ MipsInstructionSelector::selectLoadStoreOpCode(MachineInstr &I,
   const Register ValueReg = I.getOperand(0).getReg();
   const LLT Ty = MRI.getType(ValueReg);
   const unsigned TySize = Ty.getSizeInBits();
-  const unsigned MemSizeInBytes = (*I.memoperands_begin())->getSize();
+  const unsigned MemSizeInBytes =
+      (*I.memoperands_begin())->getSize().getValue();
   unsigned Opc = I.getOpcode();
   const bool isStore = Opc == TargetOpcode::G_STORE;
 
@@ -455,7 +456,8 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
     }
 
     // Unaligned memory access
-    if (MMO->getAlign() < MMO->getSize() &&
+    if ((!MMO->getSize().hasValue() ||
+         MMO->getAlign() < MMO->getSize().getValue()) &&
         !STI.systemSupportsUnalignedAccess()) {
       if (MMO->getSize() != 4 || !isRegInGprb(I.getOperand(0).getReg(), MRI))
         return false;
diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
index f5e94235859a0..3307e840a2afd 100644
--- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -344,7 +344,7 @@ bool MipsLegalizerInfo::legalizeCustom(
   switch (MI.getOpcode()) {
   case G_LOAD:
   case G_STORE: {
-    unsigned MemSize = (**MI.memoperands_begin()).getSize();
+    unsigned MemSize = (**MI.memoperands_begin()).getSize().getValue();
     Register Val = MI.getOperand(0).getReg();
     unsigned Size = MRI.getType(Val).getSizeInBits();
 
diff --git a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
index acf0d6312ef51..639e623954ffc 100644
--- a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
@@ -70,9 +70,10 @@ class MipsPreLegalizerCombinerImpl : public Combiner {
       // subtarget doesn't support them.
       auto MMO = *MI.memoperands_begin();
       const MipsSubtarget &STI = MI.getMF()->getSubtarget<MipsSubtarget>();
-      if (!isPowerOf2_64(MMO->getSize()))
+      if (!MMO->getSize().hasValue() ||
+          !isPowerOf2_64(MMO->getSize().getValue()))
         return false;
-      bool isUnaligned = MMO->getAlign() < MMO->getSize();
+      bool isUnaligned = MMO->getAlign() < MMO->getSize().getValue();
       if (!STI.systemSupportsUnalignedAccess() && isUnaligned)
         return false;
 
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
index db7afc3c86c57..6af1fd8c88e57 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -155,7 +155,8 @@ static bool isGprbTwoInstrUnalignedLoadOrStore(const MachineInstr *MI) {
     auto MMO = *MI->memoperands_begin();
     const MipsSubtarget &STI = MI->getMF()->getSubtarget<MipsSubtarget>();
     if (MMO->getSize() == 4 && (!STI.systemSupportsUnalignedAccess() &&
-                                MMO->getAlign() < MMO->getSize()))
+                                (!MMO->getSize().hasValue() ||
+                                 MMO->getAlign() < MMO->getSize().getValue())))
       return true;
   }
   return false;
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index b87d13cea787d..5c8d64e3b6e3c 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -197,8 +197,13 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
   setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Custom);
-  setOperationAction(ISD::LOAD,               MVT::i32, Custom);
-  setOperationAction(ISD::STORE,              MVT::i32, Custom);
+  if (Subtarget.hasMips32r6()) {
+    setOperationAction(ISD::LOAD,               MVT::i32, Legal);
+    setOperationAction(ISD::STORE,              MVT::i32, Legal);
+  } else {
+    setOperationAction(ISD::LOAD,               MVT::i32, Custom);
+    setOperationAction(ISD::STORE,              MVT::i32, Custom);
+  }
 
   setTargetDAGCombine(ISD::MUL);
 
@@ -425,6 +430,8 @@ bool MipsSETargetLowering::allowsMisalignedMemoryAccesses(
     if (Fast)
       *Fast = 1;
     return true;
+  } else if (Subtarget.hasMips32r6()) {
+    return false;
   }
 
   switch (SVT) {
diff --git a/llvm/lib/Target/Mips/MipsSubtarget.cpp b/llvm/lib/Target/Mips/MipsSubtarget.cpp
index 0134fcb341f18..1bd35ca4a4191 100644
--- a/llvm/lib/Target/Mips/MipsSubtarget.cpp
+++ b/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -82,7 +82,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
       HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 || Mips_Os16),
       Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
       HasEVA(false), DisableMadd4(false), HasMT(false), HasCRC(false),
-      HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false),
+      HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false), StrictAlign(false),
       StackAlignOverride(StackAlignOverride), TM(TM), TargetTriple(TT),
       TSInfo(), InstrInfo(MipsInstrInfo::create(
                     initializeSubtargetDependencies(CPU, FS, TM))),
diff --git a/llvm/lib/Target/Mips/MipsSubtarget.h b/llvm/lib/Target/Mips/MipsSubtarget.h
index 225ee139d036f..fea7f11fd0705 100644
--- a/llvm/lib/Target/Mips/MipsSubtarget.h
+++ b/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -200,6 +200,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
   // Assume 32-bit GOT.
   bool UseXGOT = false;
 
+  // Disable unaligned load store for r6.
+  bool StrictAlign;
+
   /// The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   Align stackAlignment;
@@ -372,7 +375,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
   /// MIPS32r6/MIPS64r6 require full unaligned access support but does not
   /// specify which component of the system provides it. Hardware, software, and
   /// hybrid implementations are all valid.
-  bool systemSupportsUnalignedAccess() const { return hasMips32r6(); }
+  bool systemSupportsUnalignedAccess() const {
+    return hasMips32r6() && !StrictAlign;
+  }
 
   // Set helper classes
   void setHelperClassesMips16();
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 2219d9f6619aa..2783b1e0ec73e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -489,8 +489,11 @@ void NVPTXAsmPrinter::emitFunctionEntryLabel() {
   OutStreamer->emitRawText(StringRef("{\n"));
   setAndEmitFunctionVirtualRegisters(*MF);
   // Emit initial .loc debug directive for correct relocation symbol data.
-  if (MMI && MMI->hasDebugInfo())
-    emitInitialRawDwarfLocDirective(*MF);
+  if (const DISubprogram *SP = MF->getFunction().getSubprogram()) {
+    assert(SP->getUnit());
+    if (!SP->getUnit()->isDebugDirectivesOnly() && MMI && MMI->hasDebugInfo())
+      emitInitialRawDwarfLocDirective(*MF);
+  }
 }
 
 bool NVPTXAsmPrinter::runOnMachineFunction(MachineFunction &F) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 86fb367780dc1..c34472c21ccbe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -60,10 +60,12 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
                      NRI->getFrameRegister(MF))
                  .addReg(NRI->getFrameLocalRegister(MF));
     }
-    BuildMI(MBB, MBBI, dl,
-            MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
-            NRI->getFrameLocalRegister(MF))
-        .addImm(MF.getFunctionNumber());
+    if (!MR.use_empty(NRI->getFrameLocalRegister(MF))) {
+      BuildMI(MBB, MBBI, dl,
+              MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
+              NRI->getFrameLocalRegister(MF))
+          .addImm(MF.getFunctionNumber());
+    }
   }
 }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c979c03dc1b83..e1cdbd6a4b4dc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -580,9 +580,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::ROTL, MVT::i8, Expand);
   setOperationAction(ISD::ROTR, MVT::i8, Expand);
   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
-  setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
-  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
-  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
 
   // Indirect branch is not supported.
   // This also disables Jump Table creation.
@@ -645,8 +642,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
   setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
 
-  // Lowering of DYNAMIC_STACKALLOC is unsupported.
-  // Custom lower to produce an error.
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
 
@@ -937,6 +932,7 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::BFE)
     MAKE_CASE(NVPTXISD::BFI)
     MAKE_CASE(NVPTXISD::PRMT)
+    MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
     MAKE_CASE(NVPTXISD::SETP_F16X2)
     MAKE_CASE(NVPTXISD::SETP_BF16X2)
     MAKE_CASE(NVPTXISD::Dummy)
@@ -2211,14 +2207,39 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  const Function &Fn = DAG.getMachineFunction().getFunction();
-
-  DiagnosticInfoUnsupported NoDynamicAlloca(
-      Fn, "dynamic alloca unsupported by NVPTX backend",
-      SDLoc(Op).getDebugLoc());
-  DAG.getContext()->diagnose(NoDynamicAlloca);
-  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
-  return DAG.getMergeValues(Ops, SDLoc());
+
+  if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
+    const Function &Fn = DAG.getMachineFunction().getFunction();
+
+    DiagnosticInfoUnsupported NoDynamicAlloca(
+        Fn,
+        "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
+        "requires target sm_52.",
+        SDLoc(Op).getDebugLoc());
+    DAG.getContext()->diagnose(NoDynamicAlloca);
+    auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
+                Op.getOperand(0)};
+    return DAG.getMergeValues(Ops, SDLoc());
+  }
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  SDLoc DL(Op.getNode());
+
+  // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
+  if (nvTM->is64Bit())
+    Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
+  else
+    Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
+
+  SDValue AllocOps[] = {Chain, Size,
+                        DAG.getTargetConstant(Align, DL, MVT::i32)};
+  SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
+                               nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
+
+  SDValue MergeOps[] = {Alloca, Chain};
+  return DAG.getMergeValues(MergeOps, DL);
 }
 
 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
@@ -6100,6 +6121,9 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
   if (AI->isFloatingPointOperation()) {
     if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
+      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
+          STI.getPTXVersion() >= 63)
+        return AtomicExpansionKind::None;
       if (Ty->isFloatTy())
         return AtomicExpansionKind::None;
       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index cf1d458076691..c9db10e555cef 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -61,6 +61,7 @@ enum NodeType : unsigned {
   BFE,
   BFI,
   PRMT,
+  DYNAMIC_STACKALLOC,
   Dummy,
 
   LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index 202134ed7035a..c3fce14df0688 100644
--- a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -157,7 +157,7 @@ void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
       else
         // Get true block
         Dest = BI->getSuccessor(0);
-      BranchInst::Create(Dest, BI);
+      BranchInst::Create(Dest, BI->getIterator());
       InstrToDelete.push_back(BI);
     }
   }
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index e8e091414693d..1677fe3ad1316 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3550,6 +3550,11 @@ let hasSideEffects = false in {
                              (ins Int64Regs:$s),
                              "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
                              []>;
+  def I64toI32L  : NVPTXInst<(outs Int32Regs:$low),
+                             (ins Int64Regs:$s),
+                             "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
+                             []>;
+
 }
 
 // Using partial vectorized move produces better SASS code for extraction of
@@ -3806,6 +3811,28 @@ def CALL_PROTOTYPE :
   NVPTXInst<(outs), (ins ProtoIdent:$ident),
             "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
 
+def SDTDynAllocaOp :
+  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
+
+def dyn_alloca :
+  SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
+         [SDNPHasChain, SDNPSideEffect]>;
+
+def DYNAMIC_STACKALLOC32 :
+  NVPTXInst<(outs Int32Regs:$ptr),
+            (ins Int32Regs:$size, i32imm:$align),
+            "alloca.u32 \t$ptr, $size, $align;\n\t"
+            "cvta.local.u32 \t$ptr, $ptr;",
+            [(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>,
+            Requires<[hasPTX<73>, hasSM<52>]>;
+
+def DYNAMIC_STACKALLOC64 :
+  NVPTXInst<(outs Int64Regs:$ptr),
+            (ins Int64Regs:$size, i32imm:$align),
+            "alloca.u64 \t$ptr, $size, $align;\n\t"
+            "cvta.local.u64 \t$ptr, $ptr;",
+            [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
+            Requires<[hasPTX<73>, hasSM<52>]>;
 
 include "NVPTXIntrinsics.td"
 
@@ -3817,3 +3844,17 @@ include "NVPTXIntrinsics.td"
 // - for sm_20, use pmpt (use vector scalar mov to get the pack and
 //   unpack). sm_20 supports native 32-bit register, but not native 16-bit
 // register.
+
+def : Pat <
+  (i32 (bswap i32:$a)),
+  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>;
+
+def : Pat <
+  (v2i16 (bswap v2i16:$a)),
+  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
+
+def : Pat <
+  (i64 (bswap i64:$a)),
+  (V2I32toI64
+    (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
+    (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 6a4882d23fac1..46551c4951435 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1534,7 +1534,7 @@ multiclass F_ATOMIC_2_imp<ValueType ptrT, NVPTXRegClass ptrclass,
   def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
     !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;", ""),
     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), IMM:$b))]>,
-  Requires<Pred>;
+  Requires<!if(!eq(TypeStr, ".f16"), [Predicate<"false">], Pred)>;
 }
 multiclass F_ATOMIC_2<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
   string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM,
@@ -1644,6 +1644,13 @@ defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".add",
 defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
   ".add", atomic_load_add_64_gen, i64imm, imm>;
 
+defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2<f16, Int16Regs, ".global", ".f16", ".add.noftz",
+  atomic_load_add_g, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_ADD_S_F16 : F_ATOMIC_2<f16, Int16Regs, ".shared", ".f16", ".add.noftz",
+  atomic_load_add_s, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_ADD_GEN_F16 : F_ATOMIC_2<f16, Int16Regs, "", ".f16", ".add.noftz",
+  atomic_load_add_gen, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+
 defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<f32, Float32Regs, ".global", ".f32", ".add",
   atomic_load_add_g, f32imm, fpimm>;
 defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<f32, Float32Regs, ".shared", ".f32", ".add",
@@ -2058,6 +2065,9 @@ multiclass ATOM2P_impl<string AsmStr, Intrinsic Intr,
                        SDNode Imm, ValueType ImmTy,
                        list<Predicate> Preds> {
   let AddedComplexity = 1 in {
+    def : ATOM23_impl<AsmStr, regT, regclass, Preds,
+                      (ins Int16Regs:$src, regclass:$b),
+                      (Intr (i16 Int16Regs:$src), (regT regclass:$b))>;
     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
                       (ins Int32Regs:$src, regclass:$b),
                       (Intr (i32 Int32Regs:$src), (regT regclass:$b))>;
@@ -2068,6 +2078,9 @@ multiclass ATOM2P_impl<string AsmStr, Intrinsic Intr,
   // tablegen can't infer argument types from Intrinsic (though it can
   // from Instruction) so we have to enforce specific type on
   // immediates via explicit cast to ImmTy.
+  def : ATOM23_impl<AsmStr, regT, regclass, Preds,
+                    (ins Int16Regs:$src, ImmType:$b),
+                    (Intr (i16 Int16Regs:$src), (ImmTy Imm:$b))>;
   def : ATOM23_impl<AsmStr, regT, regclass, Preds,
                     (ins Int32Regs:$src, ImmType:$b),
                     (Intr (i32 Int32Regs:$src), (ImmTy Imm:$b))>;
@@ -2338,6 +2351,8 @@ multiclass ATOM2_add_impl<string OpStr> {
    defm _s32  : ATOM2S_impl<OpStr, "i", "s32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u32  : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u64  : ATOM2S_impl<OpStr, "i", "u64", i64, Int64Regs, i64imm, imm, i64, []>;
+   defm _f16  : ATOM2S_impl<OpStr, "f", "f16", f16, Int16Regs, f16imm, fpimm, f16,
+                            [hasSM<70>, hasPTX<63>]>;
    defm _f32  : ATOM2S_impl<OpStr, "f", "f32", f32, Float32Regs, f32imm, fpimm, f32,
                             []>;
    defm _f64  : ATOM2S_impl<OpStr, "f", "f64", f64, Float64Regs, f64imm, fpimm, f64,
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 9e06f4689304b..cde02c25c4834 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -183,16 +183,16 @@ static void convertToParamAS(Value *OldUser, Value *Param) {
     }
     if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {
       SmallVector<Value *, 4> Indices(GEP->indices());
-      auto *NewGEP = GetElementPtrInst::Create(GEP->getSourceElementType(),
-                                               I.NewParam, Indices,
-                                               GEP->getName(), GEP);
+      auto *NewGEP = GetElementPtrInst::Create(
+          GEP->getSourceElementType(), I.NewParam, Indices, GEP->getName(),
+          GEP->getIterator());
       NewGEP->setIsInBounds(GEP->isInBounds());
       return NewGEP;
     }
     if (auto *BC = dyn_cast<BitCastInst>(I.OldInstruction)) {
       auto *NewBCType = PointerType::get(BC->getContext(), ADDRESS_SPACE_PARAM);
       return BitCastInst::Create(BC->getOpcode(), I.NewParam, NewBCType,
-                                 BC->getName(), BC);
+                                 BC->getName(), BC->getIterator());
     }
     if (auto *ASC = dyn_cast<AddrSpaceCastInst>(I.OldInstruction)) {
       assert(ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM);
@@ -316,7 +316,7 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
 void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                       Argument *Arg) {
   Function *Func = Arg->getParent();
-  Instruction *FirstInst = &(Func->getEntryBlock().front());
+  BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
   Type *StructType = Arg->getParamByValType();
   assert(StructType && "Missing byval type");
 
@@ -407,9 +407,9 @@ void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
 
   Instruction *PtrInGlobal = new AddrSpaceCastInst(
       Ptr, PointerType::get(Ptr->getContext(), ADDRESS_SPACE_GLOBAL),
-      Ptr->getName(), &*InsertPt);
+      Ptr->getName(), InsertPt);
   Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
-                                              Ptr->getName(), &*InsertPt);
+                                              Ptr->getName(), InsertPt);
   // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
   Ptr->replaceAllUsesWith(PtrInGeneric);
   PtrInGlobal->setOperand(0, Ptr);
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp
index 34f06b548db26..92b90e2559154 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp
@@ -143,7 +143,7 @@ bool NVPTXLowerUnreachable::runOnFunction(Function &F) {
       if (auto unreachableInst = dyn_cast<UnreachableInst>(&I)) {
         if (isLoweredToTrap(*unreachableInst))
           continue; // trap is emitted as `trap; exit;`.
-        CallInst::Create(ExitFTy, Exit, "", unreachableInst);
+        CallInst::Create(ExitFTy, Exit, "", unreachableInst->getIterator());
         Changed = true;
       }
     }
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
index 98cd3a82a6e05..3283a5bb69404 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
+++ b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
@@ -739,7 +739,7 @@ bool PPCInstructionSelector::select(MachineInstr &I) {
     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
       const unsigned NewOpc = selectLoadStoreOp(
           I.getOpcode(), RBI.getRegBank(LdSt.getReg(0), MRI, TRI)->getID(),
-          LdSt.getMemSizeInBits());
+          LdSt.getMemSizeInBits().getValue());
 
       if (NewOpc == I.getOpcode())
         return nullptr;
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 542854ec9b99f..64cae1caa6437 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -474,6 +474,35 @@ static void collectTOCStats(PPCAsmPrinter::TOCEntryType Type) {
   }
 }
 
+static CodeModel::Model getCodeModel(const PPCSubtarget &S,
+                                     const TargetMachine &TM,
+                                     const MachineOperand &MO) {
+  CodeModel::Model ModuleModel = TM.getCodeModel();
+
+  // If the operand is not a global address then there is no
+  // global variable to carry an attribute.
+  if (!(MO.getType() == MachineOperand::MO_GlobalAddress))
+    return ModuleModel;
+
+  const GlobalValue *GV = MO.getGlobal();
+  assert(GV && "expected global for MO_GlobalAddress");
+
+  return S.getCodeModel(TM, GV);
+}
+
+static void setOptionalCodeModel(MCSymbolXCOFF *XSym, CodeModel::Model CM) {
+  switch (CM) {
+  case CodeModel::Large:
+    XSym->setPerSymbolCodeModel(MCSymbolXCOFF::CM_Large);
+    return;
+  case CodeModel::Small:
+    XSym->setPerSymbolCodeModel(MCSymbolXCOFF::CM_Small);
+    return;
+  default:
+    report_fatal_error("Invalid code model for AIX");
+  }
+}
+
 /// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
 /// exists for it.  If not, create one.  Then return a symbol that references
 /// the TOC entry.
@@ -1014,7 +1043,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // relative to the toc-base.
     if (IsAIX) {
       assert(
-          TM.getCodeModel() == CodeModel::Small &&
+          getCodeModel(*Subtarget, TM, MO) == CodeModel::Small &&
           "This pseudo should only be selected for 32-bit small code model.");
       Exp = getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp, VK);
       TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
@@ -1098,7 +1127,12 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
   case PPC::ADDIStocHA: {
-    assert((IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large) &&
+    const MachineOperand &MO = MI->getOperand(2);
+
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
+           "Invalid operand for ADDIStocHA.");
+    assert((IsAIX && !IsPPC64 &&
+            getCodeModel(*Subtarget, TM, MO) == CodeModel::Large) &&
            "This pseudo should only be selected for 32-bit large code model on"
            " AIX.");
 
@@ -1108,10 +1142,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // Change the opcode to ADDIS.
     TmpInst.setOpcode(PPC::ADDIS);
 
-    const MachineOperand &MO = MI->getOperand(2);
-    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
-           "Invalid operand for ADDIStocHA.");
-
     // Map the machine operand to its corresponding MCSymbol.
     MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
 
@@ -1131,7 +1161,12 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
   case PPC::LWZtocL: {
-    assert(IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large &&
+    const MachineOperand &MO = MI->getOperand(1);
+
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
+           "Invalid operand for LWZtocL.");
+    assert(IsAIX && !IsPPC64 &&
+           getCodeModel(*Subtarget, TM, MO) == CodeModel::Large &&
            "This pseudo should only be selected for 32-bit large code model on"
            " AIX.");
 
@@ -1141,10 +1176,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // Change the opcode to lwz.
     TmpInst.setOpcode(PPC::LWZ);
 
-    const MachineOperand &MO = MI->getOperand(1);
-    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
-           "Invalid operand for LWZtocL.");
-
     // Map the machine operand to its corresponding MCSymbol.
     MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
 
@@ -1183,8 +1214,12 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
     const bool GlobalToc =
         MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal());
+
+    const CodeModel::Model CM =
+        IsAIX ? getCodeModel(*Subtarget, TM, MO) : TM.getCodeModel();
+
     if (GlobalToc || MO.isJTI() || MO.isBlockAddress() ||
-        (MO.isCPI() && TM.getCodeModel() == CodeModel::Large))
+        (MO.isCPI() && CM == CodeModel::Large))
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
     VK = IsAIX ? MCSymbolRefExpr::VK_PPC_U : MCSymbolRefExpr::VK_PPC_TOC_HA;
@@ -1225,8 +1260,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
 
     MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
-
-    if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large)
+    CodeModel::Model CM =
+        IsAIX ? getCodeModel(*Subtarget, TM, MO) : TM.getCodeModel();
+    if (!MO.isCPI() || CM == CodeModel::Large)
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
     VK = IsAIX ? MCSymbolRefExpr::VK_PPC_L : MCSymbolRefExpr::VK_PPC_TOC_LO;
@@ -2651,6 +2687,27 @@ uint64_t PPCAIXAsmPrinter::getAliasOffset(const Constant *C) {
   return 0;
 }
 
+static void tocDataChecks(unsigned PointerSize, const GlobalVariable *GV) {
+  // TODO: These asserts should be updated as more support for the toc data
+  // transformation is added (struct support, etc.).
+  assert(
+      PointerSize >= GV->getAlign().valueOrOne().value() &&
+      "GlobalVariables with an alignment requirement stricter than TOC entry "
+      "size not supported by the toc data transformation.");
+
+  Type *GVType = GV->getValueType();
+  assert(GVType->isSized() && "A GlobalVariable's size must be known to be "
+                              "supported by the toc data transformation.");
+  if (GV->getParent()->getDataLayout().getTypeSizeInBits(GVType) >
+      PointerSize * 8)
+    report_fatal_error(
+        "A GlobalVariable with size larger than a TOC entry is not currently "
+        "supported by the toc data transformation.");
+  if (GV->hasPrivateLinkage())
+    report_fatal_error("A GlobalVariable with private linkage is not "
+                       "currently supported by the toc data transformation.");
+}
+
 void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
   // Special LLVM global arrays have been handled at the initialization.
   if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV))
@@ -2660,7 +2717,7 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
   // when we emit the .toc section.
   if (GV->hasAttribute("toc-data")) {
     unsigned PointerSize = GV->getParent()->getDataLayout().getPointerSize();
-    Subtarget->tocDataChecks(PointerSize, GV);
+    tocDataChecks(PointerSize, GV);
     TOCDataGlobalVars.push_back(GV);
     return;
   }
@@ -2963,6 +3020,10 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
     }
 
     setCsectAlignment(&G);
+    std::optional<CodeModel::Model> OptionalCodeModel = G.getCodeModel();
+    if (OptionalCodeModel)
+      setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&G)),
+                           *OptionalCodeModel);
   }
 
   for (const auto &F : M)
@@ -2984,6 +3045,15 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
                          false);
     }
 
+    const GlobalVariable *GVar =
+        dyn_cast_or_null<GlobalVariable>(Alias.getAliaseeObject());
+    if (GVar) {
+      std::optional<CodeModel::Model> OptionalCodeModel = GVar->getCodeModel();
+      if (OptionalCodeModel)
+        setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&Alias)),
+                             *OptionalCodeModel);
+    }
+
     GOAliasMap[Aliasee].push_back(&Alias);
   }
 
diff --git a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index f1abb78d4dd34..4a3b64f30d8d4 100644
--- a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -263,7 +263,8 @@ class PPCBoolRetToInt : public FunctionPass {
     Value *IntRetVal = BoolToIntMap[U];
     Type *Int1Ty = Type::getInt1Ty(U->getContext());
     auto *I = cast<Instruction>(U.getUser());
-    Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I);
+    Value *BackToBool =
+        new TruncInst(IntRetVal, Int1Ty, "backToBool", I->getIterator());
     U.set(BackToBool);
 
     return true;
diff --git a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index ffaa3e05c8479..a942d2f9c7e8e 100644
--- a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -374,8 +374,9 @@ getHazardType(SUnit *SU, int Stalls) {
   // overlapping address.
   if (isLoad && NumStores && !MI->memoperands_empty()) {
     MachineMemOperand *MO = *MI->memoperands_begin();
-    if (isLoadOfStoredAddress(MO->getSize(),
-                              MO->getOffset(), MO->getValue()))
+    if (MO->getSize().hasValue() &&
+        isLoadOfStoredAddress(MO->getSize().getValue(), MO->getOffset(),
+                              MO->getValue()))
       return NoopHazard;
   }
 
@@ -399,9 +400,10 @@ void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
   if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true;
 
   // Track the address stored to.
-  if (isStore && NumStores < 4 && !MI->memoperands_empty()) {
+  if (isStore && NumStores < 4 && !MI->memoperands_empty() &&
+      (*MI->memoperands_begin())->getSize().hasValue()) {
     MachineMemOperand *MO = *MI->memoperands_begin();
-    StoreSize[NumStores] = MO->getSize();
+    StoreSize[NumStores] = MO->getSize().getValue();
     StoreOffset[NumStores] = MO->getOffset();
     StoreValue[NumStores] = MO->getValue();
     ++NumStores;
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 0c25accd1d6ce..dfea9e7709240 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -524,6 +524,24 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) {
   return true;
 }
 
+static CodeModel::Model getCodeModel(const PPCSubtarget &Subtarget,
+                                     const TargetMachine &TM,
+                                     const SDNode *Node) {
+  // If there isn't an attribute to override the module code model
+  // this will be the effective code model.
+  CodeModel::Model ModuleModel = TM.getCodeModel();
+
+  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Node->getOperand(0));
+  if (!GA)
+    return ModuleModel;
+
+  const GlobalValue *GV = GA->getGlobal();
+  if (!GV)
+    return ModuleModel;
+
+  return Subtarget.getCodeModel(TM, GV);
+}
+
 /// isInt32Immediate - This method tests to see if the node is a 32-bit constant
 /// operand. If so Imm will receive the 32-bit value.
 static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
@@ -6059,7 +6077,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     const bool isAIXABI = Subtarget->isAIXABI();
 
     // PowerPC only support small, medium and large code model.
-    const CodeModel::Model CModel = TM.getCodeModel();
+    const CodeModel::Model CModel = getCodeModel(*Subtarget, TM, N);
+
     assert(!(CModel == CodeModel::Tiny || CModel == CodeModel::Kernel) &&
            "PowerPC doesn't support tiny or kernel code models.");
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index aef2d483c6df1..cce0efad39c75 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -8519,7 +8519,8 @@ bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
   // If there is no LXSIBZX/LXSIHZX, like Power8,
   // prefer direct move if the memory size is 1 or 2 bytes.
   MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
-  if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
+  if (!Subtarget.hasP9Vector() &&
+      (!MMO->getSize().hasValue() || MMO->getSize().getValue() <= 2))
     return true;
 
   for (SDNode::use_iterator UI = Origin->use_begin(),
@@ -10763,30 +10764,54 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getRegister(PPC::R2, MVT::i32);
 
   case Intrinsic::ppc_rldimi: {
+    assert(Subtarget.isPPC64() && "rldimi is only available in 64-bit!");
+    SDValue Src = Op.getOperand(1);
+    APInt Mask = Op.getConstantOperandAPInt(4);
+    if (Mask.isZero())
+      return Op.getOperand(2);
+    if (Mask.isAllOnes())
+      return DAG.getNode(ISD::ROTL, dl, MVT::i64, Src, Op.getOperand(3));
     uint64_t SH = Op.getConstantOperandVal(3);
     unsigned MB = 0, ME = 0;
-    if (!isRunOfOnes64(Op.getConstantOperandVal(4), MB, ME) || ME != 63 - SH)
+    if (!isRunOfOnes64(Mask.getZExtValue(), MB, ME))
       report_fatal_error("invalid rldimi mask!");
-    return SDValue(DAG.getMachineNode(
-                       PPC::RLDIMI, dl, MVT::i64,
-                       {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
-                        DAG.getTargetConstant(MB, dl, MVT::i32)}),
-                   0);
+    // rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
+    if (ME < 63 - SH) {
+      Src = DAG.getNode(ISD::ROTL, dl, MVT::i64, Src,
+                        DAG.getConstant(ME + SH + 1, dl, MVT::i32));
+    } else if (ME > 63 - SH) {
+      Src = DAG.getNode(ISD::ROTL, dl, MVT::i64, Src,
+                        DAG.getConstant(ME + SH - 63, dl, MVT::i32));
+    }
+    return SDValue(
+        DAG.getMachineNode(PPC::RLDIMI, dl, MVT::i64,
+                           {Op.getOperand(2), Src,
+                            DAG.getTargetConstant(63 - ME, dl, MVT::i32),
+                            DAG.getTargetConstant(MB, dl, MVT::i32)}),
+        0);
   }
 
   case Intrinsic::ppc_rlwimi: {
+    APInt Mask = Op.getConstantOperandAPInt(4);
+    if (Mask.isZero())
+      return Op.getOperand(2);
+    if (Mask.isAllOnes())
+      return DAG.getNode(ISD::ROTL, dl, MVT::i32, Op.getOperand(1),
+                         Op.getOperand(3));
     unsigned MB = 0, ME = 0;
-    if (!isRunOfOnes(Op.getConstantOperandVal(4), MB, ME))
+    if (!isRunOfOnes(Mask.getZExtValue(), MB, ME))
       report_fatal_error("invalid rlwimi mask!");
     return SDValue(DAG.getMachineNode(
                        PPC::RLWIMI, dl, MVT::i32,
-                       {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                       {Op.getOperand(2), Op.getOperand(1), Op.getOperand(3),
                         DAG.getTargetConstant(MB, dl, MVT::i32),
                         DAG.getTargetConstant(ME, dl, MVT::i32)}),
                    0);
   }
 
   case Intrinsic::ppc_rlwnm: {
+    if (Op.getConstantOperandVal(3) == 0)
+      return DAG.getConstant(0, dl, MVT::i32);
     unsigned MB = 0, ME = 0;
     if (!isRunOfOnes(Op.getConstantOperandVal(3), MB, ME))
       report_fatal_error("invalid rlwnm mask!");
@@ -15023,6 +15048,7 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
     SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
                                          DAG.getVTList(MVT::f64, MVT::Other),
                                          Ops, MVT::i8, LDN->getMemOperand());
+    DAG.makeEquivalentMemoryOrdering(LDN, Ld);
 
     // For signed conversion, we need to sign-extend the value in the VSR
     if (Signed) {
@@ -15114,7 +15140,7 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
     // If the MMO suggests this isn't a load of a full vector, leave
     // things alone.  For a built-in, we have to make the change for
     // correctness, so if there is a size problem that will be a bug.
-    if (MMO->getSize() < 16)
+    if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
       return SDValue();
     break;
   }
@@ -15182,7 +15208,7 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
     // If the MMO suggests this isn't a store of a full vector, leave
     // things alone.  For a built-in, we have to make the change for
     // correctness, so if there is a size problem that will be a bug.
-    if (MMO->getSize() < 16)
+    if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
       return SDValue();
     break;
   }
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 82da1a3c30598..6423e692d88c3 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -5045,12 +5045,12 @@ defm : TrapExtendedMnemonic<"lng", 6>;
 defm : TrapExtendedMnemonic<"u", 31>;
 
 // Atomic loads
-def : Pat<(atomic_load_8  DForm:$src), (LBZ  memri:$src)>;
-def : Pat<(atomic_load_16 DForm:$src), (LHZ  memri:$src)>;
-def : Pat<(atomic_load_32 DForm:$src), (LWZ  memri:$src)>;
-def : Pat<(atomic_load_8  XForm:$src), (LBZX memrr:$src)>;
-def : Pat<(atomic_load_16 XForm:$src), (LHZX memrr:$src)>;
-def : Pat<(atomic_load_32 XForm:$src), (LWZX memrr:$src)>;
+def : Pat<(i32 (atomic_load_8  DForm:$src)), (LBZ  memri:$src)>;
+def : Pat<(i32 (atomic_load_16 DForm:$src)), (LHZ  memri:$src)>;
+def : Pat<(i32 (atomic_load_32 DForm:$src)), (LWZ  memri:$src)>;
+def : Pat<(i32 (atomic_load_8  XForm:$src)), (LBZX memrr:$src)>;
+def : Pat<(i32 (atomic_load_16 XForm:$src)), (LHZX memrr:$src)>;
+def : Pat<(i32 (atomic_load_32 XForm:$src)), (LWZX memrr:$src)>;
 
 // Atomic stores
 def : Pat<(atomic_store_8  i32:$val, DForm:$ptr), (STB  gprc:$val, memri:$ptr)>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index d5a372e4dc101..5f2937d47a519 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -1289,13 +1289,13 @@ let Predicates = [PCRelativeMemops] in {
             (PSTXVpc $XS, $ga, 0)>;
 
   // Atomic Load
-  def : Pat<(atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga)),
+  def : Pat<(i32 (atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLBZpc $ga, 0)>;
-  def : Pat<(atomic_load_16 (PPCmatpcreladdr PCRelForm:$ga)),
+  def : Pat<(i32 (atomic_load_16 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLHZpc $ga, 0)>;
-  def : Pat<(atomic_load_32 (PPCmatpcreladdr PCRelForm:$ga)),
+  def : Pat<(i32 (atomic_load_32 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLWZpc $ga, 0)>;
-  def : Pat<(atomic_load_64 (PPCmatpcreladdr PCRelForm:$ga)),
+  def : Pat<(i64 (atomic_load_64 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLDpc $ga, 0)>;
 
   // Atomic Store
@@ -2347,10 +2347,10 @@ let Predicates = [PrefixInstrs] in {
   def : Pat<(store f64:$FRS, PDForm:$dst), (PSTFD $FRS, memri34:$dst)>;
 
   // Atomic Load
-  def : Pat<(atomic_load_8 PDForm:$src), (PLBZ memri34:$src)>;
-  def : Pat<(atomic_load_16 PDForm:$src), (PLHZ memri34:$src)>;
-  def : Pat<(atomic_load_32 PDForm:$src), (PLWZ memri34:$src)>;
-  def : Pat<(atomic_load_64 PDForm:$src), (PLD memri34:$src)>;
+  def : Pat<(i32 (atomic_load_8 PDForm:$src)), (PLBZ memri34:$src)>;
+  def : Pat<(i32 (atomic_load_16 PDForm:$src)), (PLHZ memri34:$src)>;
+  def : Pat<(i32 (atomic_load_32 PDForm:$src)), (PLWZ memri34:$src)>;
+  def : Pat<(i64 (atomic_load_64 PDForm:$src)), (PLD memri34:$src)>;
 
   // Atomic Store
   def : Pat<(atomic_store_8 i32:$RS, PDForm:$dst), (PSTB $RS, memri34:$dst)>;
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index dc739a2c7a4d0..f19eb2af5e070 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -725,7 +725,7 @@ PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
   Instruction *PtrInc = nullptr;
   Instruction *NewBasePtr = nullptr;
   if (CanPreInc) {
-    Instruction *InsPoint = &*Header->getFirstInsertionPt();
+    BasicBlock::iterator InsPoint = Header->getFirstInsertionPt();
     PtrInc = GetElementPtrInst::Create(
         I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
         InsPoint);
@@ -752,7 +752,7 @@ PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
       // For the latch predecessor, we need to insert a GEP just before the
       // terminator to increase the address.
       BasicBlock *BB = PI;
-      Instruction *InsPoint = BB->getTerminator();
+      BasicBlock::iterator InsPoint = BB->getTerminator()->getIterator();
       PtrInc = GetElementPtrInst::Create(
           I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
           InsPoint);
@@ -764,7 +764,7 @@ PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
     if (NewPHI->getType() != BasePtr->getType())
       NewBasePtr = new BitCastInst(NewPHI, BasePtr->getType(),
                                    getInstrName(NewPHI, CastNodeNameSuffix),
-                                   &*Header->getFirstInsertionPt());
+                                   Header->getFirstInsertionPt());
     else
       NewBasePtr = NewPHI;
   }
@@ -794,20 +794,25 @@ Instruction *PPCLoopInstrFormPrep::rewriteForBucketElement(
        cast<SCEVConstant>(Element.Offset)->getValue()->isZero())) {
     RealNewPtr = NewBasePtr;
   } else {
-    Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
+    std::optional<BasicBlock::iterator> PtrIP = std::nullopt;
+    if (Instruction *I = dyn_cast<Instruction>(Ptr))
+      PtrIP = I->getIterator();
+
     if (PtrIP && isa<Instruction>(NewBasePtr) &&
-        cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
-      PtrIP = nullptr;
-    else if (PtrIP && isa<PHINode>(PtrIP))
-      PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
+        cast<Instruction>(NewBasePtr)->getParent() == (*PtrIP)->getParent())
+      PtrIP = std::nullopt;
+    else if (PtrIP && isa<PHINode>(*PtrIP))
+      PtrIP = (*PtrIP)->getParent()->getFirstInsertionPt();
     else if (!PtrIP)
-      PtrIP = Element.Instr;
+      PtrIP = Element.Instr->getIterator();
 
     assert(OffToBase && "There should be an offset for non base element!\n");
     GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
         I8Ty, PtrInc, OffToBase,
-        getInstrName(Element.Instr, GEPNodeOffNameSuffix), PtrIP);
-    if (!PtrIP)
+        getInstrName(Element.Instr, GEPNodeOffNameSuffix));
+    if (PtrIP)
+      NewPtr->insertBefore(*(*PtrIP)->getParent(), *PtrIP);
+    else
       NewPtr->insertAfter(cast<Instruction>(PtrInc));
     NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
     RealNewPtr = NewPtr;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 884f2f5c57b25..653d9bda99192 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -185,37 +185,64 @@ bool PPCSubtarget::enableSubRegLiveness() const {
   return UseSubRegLiveness;
 }
 
-void PPCSubtarget::tocDataChecks(unsigned PointerSize,
-                                 const GlobalVariable *GV) const {
-  // TODO: These asserts should be updated as more support for the toc data
-  // transformation is added (struct support, etc.).
-  assert(
-      PointerSize >= GV->getAlign().valueOrOne().value() &&
-      "GlobalVariables with an alignment requirement stricter than TOC entry "
-      "size not supported by the toc data transformation.");
-
-  Type *GVType = GV->getValueType();
-  assert(GVType->isSized() && "A GlobalVariable's size must be known to be "
-                              "supported by the toc data transformation.");
-  if (GV->getParent()->getDataLayout().getTypeSizeInBits(GVType) >
-      PointerSize * 8)
-    report_fatal_error(
-        "A GlobalVariable with size larger than a TOC entry is not currently "
-        "supported by the toc data transformation.");
-  if (GV->hasPrivateLinkage())
-    report_fatal_error("A GlobalVariable with private linkage is not "
-                       "currently supported by the toc data transformation.");
-}
-
 bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
+  if (isAIXABI()) {
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      // On AIX the only symbols that aren't indirect are toc-data.
+      return !GVar->hasAttribute("toc-data");
+
+    return true;
+  }
+
   // Large code model always uses the TOC even for local symbols.
   if (TM.getCodeModel() == CodeModel::Large)
     return true;
+
   if (TM.shouldAssumeDSOLocal(GV))
     return false;
   return true;
 }
 
+CodeModel::Model PPCSubtarget::getCodeModel(const TargetMachine &TM,
+                                            const GlobalValue *GV) const {
+  // If there isn't an attribute to override the module code model
+  // this will be the effective code model.
+  CodeModel::Model ModuleModel = TM.getCodeModel();
+
+  // Initially support per global code model for AIX only.
+  if (!isAIXABI())
+    return ModuleModel;
+
+  // Only GlobalVariables carry an attribute which can override the module code
+  // model.
+  assert(GV && "Unexpected NULL GlobalValue");
+  const GlobalVariable *GlobalVar =
+      [](const GlobalValue *GV) -> const GlobalVariable * {
+    const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
+    if (Var)
+      return Var;
+
+    const GlobalAlias *Alias = dyn_cast<GlobalAlias>(GV);
+    if (Alias)
+      return dyn_cast<GlobalVariable>(Alias->getAliaseeObject());
+
+    return nullptr;
+  }(GV);
+
+  if (!GlobalVar)
+    return ModuleModel;
+
+  std::optional<CodeModel::Model> MaybeCodeModel = GlobalVar->getCodeModel();
+  if (MaybeCodeModel) {
+    CodeModel::Model CM = *MaybeCodeModel;
+    assert((CM == CodeModel::Small || CM == CodeModel::Large) &&
+           "invalid code model for AIX");
+    return CM;
+  }
+
+  return ModuleModel;
+}
+
 bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
 bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); }
 
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index d913f22bd5ba9..bf35f8ec151b1 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -245,7 +245,9 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   /// True if the GV will be accessed via an indirect symbol.
   bool isGVIndirectSymbol(const GlobalValue *GV) const;
 
-  void tocDataChecks(unsigned PointerSize, const GlobalVariable *GV) const;
+  /// Calculates the effective code model for argument GV.
+  CodeModel::Model getCodeModel(const TargetMachine &TM,
+                                const GlobalValue *GV) const;
 
   /// True if the ABI is descriptor based.
   bool usesFunctionDescriptors() const {
diff --git a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 147438dfedd87..9f680ef5046da 100644
--- a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -25,6 +25,7 @@
 #include "PPCInstrInfo.h"
 #include "PPCTargetMachine.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/InitializePasses.h"
@@ -159,9 +160,11 @@ namespace {
         // We don't really need to save data to the stack - the clobbered
         // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
         // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR).
-        if (NeedFence)
+        if (NeedFence) {
+          MBB.getParent()->getFrameInfo().setAdjustsStack(true);
           BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
                                                               .addImm(0);
+        }
 
         if (IsAIX) {
           if (IsTLSLDAIXMI) {
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index d83979a873f2a..1779959324da2 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -107,6 +107,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
                                uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
 
+  MCRegister matchRegisterNameHelper(StringRef Name) const;
   bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override;
   ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
                                SMLoc &EndLoc) override;
@@ -1630,7 +1631,7 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
 // alternative ABI names), setting RegNo to the matching register. Upon
 // failure, returns a non-valid MCRegister. If IsRVE, then registers x16-x31
 // will be rejected.
-static MCRegister matchRegisterNameHelper(bool IsRVE, StringRef Name) {
+MCRegister RISCVAsmParser::matchRegisterNameHelper(StringRef Name) const {
   MCRegister Reg = MatchRegisterName(Name);
   // The 16-/32- and 64-bit FPRs have the same asm name. Check that the initial
   // match always matches the 64-bit variant, and not the 16/32-bit one.
@@ -1641,7 +1642,7 @@ static MCRegister matchRegisterNameHelper(bool IsRVE, StringRef Name) {
   static_assert(RISCV::F0_D < RISCV::F0_F, "FPR matching must be updated");
   if (!Reg)
     Reg = MatchRegisterAltName(Name);
-  if (IsRVE && Reg >= RISCV::X16 && Reg <= RISCV::X31)
+  if (isRVE() && Reg >= RISCV::X16 && Reg <= RISCV::X31)
     Reg = RISCV::NoRegister;
   return Reg;
 }
@@ -1660,7 +1661,7 @@ ParseStatus RISCVAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
   EndLoc = Tok.getEndLoc();
   StringRef Name = getLexer().getTok().getIdentifier();
 
-  Reg = matchRegisterNameHelper(isRVE(), Name);
+  Reg = matchRegisterNameHelper(Name);
   if (!Reg)
     return ParseStatus::NoMatch;
 
@@ -1693,7 +1694,7 @@ ParseStatus RISCVAsmParser::parseRegister(OperandVector &Operands,
     return ParseStatus::NoMatch;
   case AsmToken::Identifier:
     StringRef Name = getLexer().getTok().getIdentifier();
-    MCRegister RegNo = matchRegisterNameHelper(isRVE(), Name);
+    MCRegister RegNo = matchRegisterNameHelper(Name);
 
     if (!RegNo) {
       if (HadParens)
@@ -2232,7 +2233,7 @@ ParseStatus RISCVAsmParser::parseMaskReg(OperandVector &Operands) {
   StringRef Name = getLexer().getTok().getIdentifier();
   if (!Name.consume_back(".t"))
     return Error(getLoc(), "expected '.t' suffix");
-  MCRegister RegNo = matchRegisterNameHelper(isRVE(), Name);
+  MCRegister RegNo = matchRegisterNameHelper(Name);
 
   if (!RegNo)
     return ParseStatus::NoMatch;
@@ -2250,7 +2251,7 @@ ParseStatus RISCVAsmParser::parseGPRAsFPR(OperandVector &Operands) {
     return ParseStatus::NoMatch;
 
   StringRef Name = getLexer().getTok().getIdentifier();
-  MCRegister RegNo = matchRegisterNameHelper(isRVE(), Name);
+  MCRegister RegNo = matchRegisterNameHelper(Name);
 
   if (!RegNo)
     return ParseStatus::NoMatch;
@@ -2281,7 +2282,7 @@ ParseStatus RISCVAsmParser::parseGPRPair(OperandVector &Operands,
     return ParseStatus::NoMatch;
 
   StringRef Name = getLexer().getTok().getIdentifier();
-  MCRegister RegNo = matchRegisterNameHelper(isRVE(), Name);
+  MCRegister RegNo = matchRegisterNameHelper(Name);
 
   if (!RegNo)
     return ParseStatus::NoMatch;
@@ -2461,7 +2462,7 @@ ParseStatus RISCVAsmParser::parseRegReg(OperandVector &Operands) {
     return ParseStatus::NoMatch;
 
   StringRef RegName = getLexer().getTok().getIdentifier();
-  MCRegister Reg = matchRegisterNameHelper(isRVE(), RegName);
+  MCRegister Reg = matchRegisterNameHelper(RegName);
   if (!Reg)
     return Error(getLoc(), "invalid register");
   getLexer().Lex();
@@ -2473,7 +2474,7 @@ ParseStatus RISCVAsmParser::parseRegReg(OperandVector &Operands) {
     return Error(getLoc(), "expected register");
 
   StringRef Reg2Name = getLexer().getTok().getIdentifier();
-  MCRegister Reg2 = matchRegisterNameHelper(isRVE(), Reg2Name);
+  MCRegister Reg2 = matchRegisterNameHelper(Reg2Name);
   if (!Reg2)
     return Error(getLoc(), "invalid register");
   getLexer().Lex();
@@ -2500,7 +2501,7 @@ ParseStatus RISCVAsmParser::parseReglist(OperandVector &Operands) {
     return Error(getLoc(), "register list must start from 'ra' or 'x1'");
 
   StringRef RegName = getLexer().getTok().getIdentifier();
-  MCRegister RegStart = matchRegisterNameHelper(IsEABI, RegName);
+  MCRegister RegStart = matchRegisterNameHelper(RegName);
   MCRegister RegEnd;
   if (RegStart != RISCV::X1)
     return Error(getLoc(), "register list must start from 'ra' or 'x1'");
@@ -2511,7 +2512,7 @@ ParseStatus RISCVAsmParser::parseReglist(OperandVector &Operands) {
     if (getLexer().isNot(AsmToken::Identifier))
       return Error(getLoc(), "invalid register");
     StringRef RegName = getLexer().getTok().getIdentifier();
-    RegStart = matchRegisterNameHelper(IsEABI, RegName);
+    RegStart = matchRegisterNameHelper(RegName);
     if (!RegStart)
       return Error(getLoc(), "invalid register");
     if (RegStart != RISCV::X8)
@@ -2524,7 +2525,7 @@ ParseStatus RISCVAsmParser::parseReglist(OperandVector &Operands) {
   if (parseOptionalToken(AsmToken::Minus)) {
     StringRef EndName = getLexer().getTok().getIdentifier();
     // FIXME: the register mapping and checks of EABI is wrong
-    RegEnd = matchRegisterNameHelper(IsEABI, EndName);
+    RegEnd = matchRegisterNameHelper(EndName);
     if (!RegEnd)
       return Error(getLoc(), "invalid register");
     if (IsEABI && RegEnd != RISCV::X9)
@@ -2575,7 +2576,7 @@ ParseStatus RISCVAsmParser::parseReglist(OperandVector &Operands) {
     RegEnd = RegStart;
 
   auto Encode = RISCVZC::encodeRlist(RegEnd, IsEABI);
-  if (Encode == 16)
+  if (Encode == RISCVZC::INVALID_RLIST)
     return Error(S, "invalid register list");
   Operands.push_back(RISCVOperand::createRlist(Encode, S));
 
@@ -2716,7 +2717,7 @@ ParseStatus RISCVAsmParser::parseDirective(AsmToken DirectiveID) {
 
 bool RISCVAsmParser::resetToArch(StringRef Arch, SMLoc Loc, std::string &Result,
                                  bool FromOptionDirective) {
-  for (auto Feature : RISCVFeatureKV)
+  for (auto &Feature : RISCVFeatureKV)
     if (llvm::RISCVISAInfo::isSupportedExtensionFeature(Feature.Key))
       clearFeatureBits(Feature.Value, Feature.Key);
 
@@ -2735,7 +2736,7 @@ bool RISCVAsmParser::resetToArch(StringRef Arch, SMLoc Loc, std::string &Result,
   }
   auto &ISAInfo = *ParseResult;
 
-  for (auto Feature : RISCVFeatureKV)
+  for (auto &Feature : RISCVFeatureKV)
     if (ISAInfo->hasExtension(Feature.Key))
       setFeatureBits(Feature.Value, Feature.Key);
 
@@ -2823,9 +2824,8 @@ bool RISCVAsmParser::parseDirectiveOption() {
         break;
       }
 
-      ArrayRef<SubtargetFeatureKV> KVArray(RISCVFeatureKV);
-      auto Ext = llvm::lower_bound(KVArray, Arch);
-      if (Ext == KVArray.end() || StringRef(Ext->Key) != Arch ||
+      auto Ext = llvm::lower_bound(RISCVFeatureKV, Arch);
+      if (Ext == std::end(RISCVFeatureKV) || StringRef(Ext->Key) != Arch ||
           !RISCVISAInfo::isSupportedExtension(Arch)) {
         if (isDigit(Arch.back()))
           return Error(
@@ -2858,7 +2858,7 @@ bool RISCVAsmParser::parseDirectiveOption() {
         // It is invalid to disable an extension that there are other enabled
         // extensions depend on it.
         // TODO: Make use of RISCVISAInfo to handle this
-        for (auto Feature : KVArray) {
+        for (auto &Feature : RISCVFeatureKV) {
           if (getSTI().hasFeature(Feature.Value) &&
               Feature.Implies.test(Ext->Value))
             return Error(Loc,
@@ -3271,11 +3271,13 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
                             .addOperand(Inst.getOperand(0))
                             .addOperand(Inst.getOperand(1))
                             .addOperand(Inst.getOperand(2))
-                            .addReg(RISCV::NoRegister));
+                            .addReg(RISCV::NoRegister)
+                            .setLoc(IDLoc));
     emitToStreamer(Out, MCInstBuilder(RISCV::VMNAND_MM)
                             .addOperand(Inst.getOperand(0))
                             .addOperand(Inst.getOperand(0))
-                            .addOperand(Inst.getOperand(0)));
+                            .addOperand(Inst.getOperand(0))
+                            .setLoc(IDLoc));
   } else if (Inst.getNumOperands() == 4) {
     // masked va >= x, vd != v0
     //
@@ -3287,11 +3289,13 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
                             .addOperand(Inst.getOperand(0))
                             .addOperand(Inst.getOperand(1))
                             .addOperand(Inst.getOperand(2))
-                            .addOperand(Inst.getOperand(3)));
+                            .addOperand(Inst.getOperand(3))
+                            .setLoc(IDLoc));
     emitToStreamer(Out, MCInstBuilder(RISCV::VMXOR_MM)
                             .addOperand(Inst.getOperand(0))
                             .addOperand(Inst.getOperand(0))
-                            .addReg(RISCV::V0));
+                            .addReg(RISCV::V0)
+                            .setLoc(IDLoc));
   } else if (Inst.getNumOperands() == 5 &&
              Inst.getOperand(0).getReg() == RISCV::V0) {
     // masked va >= x, vd == v0
@@ -3306,11 +3310,13 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
                             .addOperand(Inst.getOperand(1))
                             .addOperand(Inst.getOperand(2))
                             .addOperand(Inst.getOperand(3))
-                            .addReg(RISCV::NoRegister));
+                            .addReg(RISCV::NoRegister)
+                            .setLoc(IDLoc));
     emitToStreamer(Out, MCInstBuilder(RISCV::VMANDN_MM)
                             .addOperand(Inst.getOperand(0))
                             .addOperand(Inst.getOperand(0))
-                            .addOperand(Inst.getOperand(1)));
+                            .addOperand(Inst.getOperand(1))
+                            .setLoc(IDLoc));
   } else if (Inst.getNumOperands() == 5) {
     // masked va >= x, any vd
     //
@@ -3323,19 +3329,23 @@ void RISCVAsmParser::emitVMSGE(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
                             .addOperand(Inst.getOperand(1))
                             .addOperand(Inst.getOperand(2))
                             .addOperand(Inst.getOperand(3))
-                            .addReg(RISCV::NoRegister));
+                            .addReg(RISCV::NoRegister)
+                            .setLoc(IDLoc));
     emitToStreamer(Out, MCInstBuilder(RISCV::VMANDN_MM)
                             .addOperand(Inst.getOperand(1))
                             .addReg(RISCV::V0)
-                            .addOperand(Inst.getOperand(1)));
+                            .addOperand(Inst.getOperand(1))
+                            .setLoc(IDLoc));
     emitToStreamer(Out, MCInstBuilder(RISCV::VMANDN_MM)
                             .addOperand(Inst.getOperand(0))
                             .addOperand(Inst.getOperand(0))
-                            .addReg(RISCV::V0));
+                            .addReg(RISCV::V0)
+                            .setLoc(IDLoc));
     emitToStreamer(Out, MCInstBuilder(RISCV::VMOR_MM)
                             .addOperand(Inst.getOperand(0))
                             .addOperand(Inst.getOperand(1))
-                            .addOperand(Inst.getOperand(0)));
+                            .addOperand(Inst.getOperand(0))
+                            .setLoc(IDLoc));
   }
 }
 
@@ -3637,7 +3647,8 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
                             .addOperand(Inst.getOperand(0))
                             .addOperand(Inst.getOperand(1))
                             .addImm(Imm - 1)
-                            .addOperand(Inst.getOperand(3)));
+                            .addOperand(Inst.getOperand(3))
+                            .setLoc(IDLoc));
     return false;
   }
   case RISCV::PseudoVMSGEU_VI:
@@ -3655,7 +3666,8 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
                               .addOperand(Inst.getOperand(0))
                               .addOperand(Inst.getOperand(1))
                               .addOperand(Inst.getOperand(1))
-                              .addOperand(Inst.getOperand(3)));
+                              .addOperand(Inst.getOperand(3))
+                              .setLoc(IDLoc));
     } else {
       // Other immediate values can subtract one like signed.
       unsigned Opc = Inst.getOpcode() == RISCV::PseudoVMSGEU_VI
@@ -3665,7 +3677,8 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
                               .addOperand(Inst.getOperand(0))
                               .addOperand(Inst.getOperand(1))
                               .addImm(Imm - 1)
-                              .addOperand(Inst.getOperand(3)));
+                              .addOperand(Inst.getOperand(3))
+                              .setLoc(IDLoc));
     }
 
     return false;
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index f1ca1212ec378..6aadabdf1bc61 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -595,6 +595,14 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     TRY_TO_DECODE_FEATURE(
         RISCV::FeatureVendorXSfvfnrclipxfqf, DecoderTableXSfvfnrclipxfqf32,
         "SiFive FP32-to-int8 Ranged Clip Instructions opcode table");
+    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecdiscarddlone,
+                          DecoderTableXSiFivecdiscarddlone32,
+                          "SiFive sf.cdiscard.d.l1 custom opcode table");
+    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecflushdlone,
+                          DecoderTableXSiFivecflushdlone32,
+                          "SiFive sf.cflush.d.l1 custom opcode table");
+    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfcease, DecoderTableXSfcease32,
+                          "SiFive sf.cease custom opcode table");
     TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbitmanip,
                           DecoderTableXCVbitmanip32,
                           "CORE-V Bit Manipulation custom opcode table");
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 575bd4c9d3561..22736edc5f072 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -43,6 +43,7 @@ include "RISCVMacroFusion.td"
 include "RISCVSchedRocket.td"
 include "RISCVSchedSiFive7.td"
 include "RISCVSchedSiFiveP400.td"
+include "RISCVSchedSiFiveP600.td"
 include "RISCVSchedSyntacoreSCR1.td"
 include "RISCVSchedXiangShanNanHu.td"
 
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 83619ccb24baa..f3e641e250182 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1058,6 +1058,30 @@ def HasVendorXSfvfnrclipxfqf
       AssemblerPredicate<(all_of FeatureVendorXSfvfnrclipxfqf),
                          "'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)">;
 
+def FeatureVendorXSiFivecdiscarddlone
+    : SubtargetFeature<"xsifivecdiscarddlone", "HasVendorXSiFivecdiscarddlone", "true",
+                       "'XSiFivecdiscarddlone' (SiFive sf.cdiscard.d.l1 Instruction)", []>;
+def HasVendorXSiFivecdiscarddlone
+    : Predicate<"Subtarget->hasVendorXSiFivecdiscarddlone()">,
+      AssemblerPredicate<(all_of FeatureVendorXSiFivecdiscarddlone),
+                         "'XSiFivecdiscarddlone' (SiFive sf.cdiscard.d.l1 Instruction)">;
+
+def FeatureVendorXSiFivecflushdlone
+    : SubtargetFeature<"xsifivecflushdlone", "HasVendorXSiFivecflushdlone", "true",
+                       "'XSiFivecflushdlone' (SiFive sf.cflush.d.l1 Instruction)", []>;
+def HasVendorXSiFivecflushdlone
+    : Predicate<"Subtarget->hasVendorXSiFivecflushdlone()">,
+      AssemblerPredicate<(all_of FeatureVendorXSiFivecflushdlone),
+                         "'XSiFivecflushdlone' (SiFive sf.cflush.d.l1 Instruction)">;
+
+def FeatureVendorXSfcease
+    : SubtargetFeature<"xsfcease", "HasVendorXSfcease", "true",
+                       "'XSfcease' (SiFive sf.cease Instruction)", []>;
+def HasVendorXSfcease
+    : Predicate<"Subtarget->hasVendorXSfcease()">,
+      AssemblerPredicate<(all_of FeatureVendorXSfcease),
+                         "'XSfcease' (SiFive sf.cease Instruction)">;
+
 // Core-V Extensions
 
 def FeatureVendorXCVelw
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index 0530e3d08be82..f0bd25f167d80 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -231,7 +231,7 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
     BasePtr =
         PHINode::Create(Start->getType(), 2, Phi->getName() + ".scalar", Phi->getIterator());
     Inc = BinaryOperator::CreateAdd(BasePtr, Step, Inc->getName() + ".scalar",
-                                    Inc);
+                                    Inc->getIterator());
     BasePtr->addIncoming(Start, Phi->getIncomingBlock(1 - IncrementingBlock));
     BasePtr->addIncoming(Inc, Phi->getIncomingBlock(IncrementingBlock));
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 08678a859ae2b..4bfd4d0386a86 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -717,7 +717,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
 
     static const unsigned FloatingPointVecReduceOps[] = {
         ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_FMIN,
-        ISD::VECREDUCE_FMAX};
+        ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMINIMUM, ISD::VECREDUCE_FMAXIMUM};
 
     if (!Subtarget.is64Bit()) {
       // We must custom-lower certain vXi64 operations on RV32 due to the vector
@@ -6541,6 +6541,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::VECREDUCE_SEQ_FADD:
   case ISD::VECREDUCE_FMIN:
   case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMINIMUM:
     return lowerFPVECREDUCE(Op, DAG);
   case ISD::VP_REDUCE_ADD:
   case ISD::VP_REDUCE_UMAX:
@@ -9541,14 +9543,17 @@ getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT,
   case ISD::VECREDUCE_SEQ_FADD:
     return std::make_tuple(RISCVISD::VECREDUCE_SEQ_FADD_VL, Op.getOperand(1),
                            Op.getOperand(0));
+  case ISD::VECREDUCE_FMINIMUM:
+  case ISD::VECREDUCE_FMAXIMUM:
   case ISD::VECREDUCE_FMIN:
   case ISD::VECREDUCE_FMAX: {
     SDValue Front =
         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op.getOperand(0),
                     DAG.getVectorIdxConstant(0, DL));
-    unsigned RVVOpc = (Opcode == ISD::VECREDUCE_FMIN)
-                          ? RISCVISD::VECREDUCE_FMIN_VL
-                          : RISCVISD::VECREDUCE_FMAX_VL;
+    unsigned RVVOpc =
+        (Opcode == ISD::VECREDUCE_FMIN || Opcode == ISD::VECREDUCE_FMINIMUM)
+            ? RISCVISD::VECREDUCE_FMIN_VL
+            : RISCVISD::VECREDUCE_FMAX_VL;
     return std::make_tuple(RVVOpc, Op.getOperand(0), Front);
   }
   }
@@ -9571,9 +9576,30 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op,
     VectorVal = convertToScalableVector(ContainerVT, VectorVal, DAG, Subtarget);
   }
 
+  MVT ResVT = Op.getSimpleValueType();
   auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
-  return lowerReductionSeq(RVVOpcode, Op.getSimpleValueType(), ScalarVal,
-                           VectorVal, Mask, VL, DL, DAG, Subtarget);
+  SDValue Res = lowerReductionSeq(RVVOpcode, ResVT, ScalarVal, VectorVal, Mask,
+                                  VL, DL, DAG, Subtarget);
+  if (Op.getOpcode() != ISD::VECREDUCE_FMINIMUM &&
+      Op.getOpcode() != ISD::VECREDUCE_FMAXIMUM)
+    return Res;
+
+  if (Op->getFlags().hasNoNaNs())
+    return Res;
+
+  // Force output to NaN if any element is Nan.
+  SDValue IsNan =
+      DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(),
+                  {VectorVal, VectorVal, DAG.getCondCode(ISD::SETNE),
+                   DAG.getUNDEF(Mask.getValueType()), Mask, VL});
+  MVT XLenVT = Subtarget.getXLenVT();
+  SDValue CPop = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, IsNan, Mask, VL);
+  SDValue NoNaNs = DAG.getSetCC(DL, XLenVT, CPop,
+                                DAG.getConstant(0, DL, XLenVT), ISD::SETEQ);
+  return DAG.getSelect(
+      DL, ResVT, NoNaNs, Res,
+      DAG.getConstantFP(APFloat::getNaN(DAG.EVTToAPFloatSemantics(ResVT)), DL,
+                        ResVT));
 }
 
 SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
@@ -10466,6 +10492,7 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
   SDValue BasePtr = MemSD->getBasePtr();
   SDValue Val, Mask, VL;
 
+  bool IsCompressingStore = false;
   if (const auto *VPStore = dyn_cast<VPStoreSDNode>(Op)) {
     Val = VPStore->getValue();
     Mask = VPStore->getMask();
@@ -10474,9 +10501,11 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
     const auto *MStore = cast<MaskedStoreSDNode>(Op);
     Val = MStore->getValue();
     Mask = MStore->getMask();
+    IsCompressingStore = MStore->isCompressingStore();
   }
 
-  bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+  bool IsUnmasked =
+      ISD::isConstantSplatVectorAllOnes(Mask.getNode()) || IsCompressingStore;
 
   MVT VT = Val.getSimpleValueType();
   MVT XLenVT = Subtarget.getXLenVT();
@@ -10486,7 +10515,7 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
     ContainerVT = getContainerForFixedLengthVector(VT);
 
     Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
-    if (!IsUnmasked) {
+    if (!IsUnmasked || IsCompressingStore) {
       MVT MaskVT = getMaskTypeFor(ContainerVT);
       Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
     }
@@ -10495,6 +10524,15 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
   if (!VL)
     VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
 
+  if (IsCompressingStore) {
+    Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+                      DAG.getConstant(Intrinsic::riscv_vcompress, DL, XLenVT),
+                      DAG.getUNDEF(ContainerVT), Val, Mask, VL);
+    VL =
+        DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Mask,
+                    getAllOnesMask(Mask.getSimpleValueType(), VL, DL, DAG), VL);
+  }
+
   unsigned IntID =
       IsUnmasked ? Intrinsic::riscv_vse : Intrinsic::riscv_vse_mask;
   SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
@@ -15285,62 +15323,13 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
 }
 
-// Recursively split up concat_vectors with more than 2 operands:
-//
-// concat_vector op1, op2, op3, op4
-// ->
-// concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
-//
-// This reduces the length of the chain of vslideups and allows us to perform
-// the vslideups at a smaller LMUL, limited to MF2.
-//
-// We do this as a DAG combine rather than during lowering so that any undef
-// operands can get combined away.
-static SDValue
-performCONCAT_VECTORSSplitCombine(SDNode *N, SelectionDAG &DAG,
-                                  const RISCVTargetLowering &TLI) {
-  SDLoc DL(N);
-
-  if (N->getNumOperands() <= 2)
-    return SDValue();
-
-  if (!TLI.isTypeLegal(N->getValueType(0)))
-    return SDValue();
-  MVT VT = N->getSimpleValueType(0);
-
-  // Don't split any further than MF2.
-  MVT ContainerVT = VT;
-  if (VT.isFixedLengthVector())
-    ContainerVT = getContainerForFixedLengthVector(DAG, VT, TLI.getSubtarget());
-  if (ContainerVT.bitsLT(getLMUL1VT(ContainerVT)))
-    return SDValue();
-
-  MVT HalfVT = VT.getHalfNumVectorElementsVT();
-  assert(isPowerOf2_32(N->getNumOperands()));
-  size_t HalfNumOps = N->getNumOperands() / 2;
-  SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
-                           N->ops().take_front(HalfNumOps));
-  SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
-                           N->ops().drop_front(HalfNumOps));
-
-  // Lower to an insert_subvector directly so the concat_vectors don't get
-  // recombined.
-  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Lo,
-                            DAG.getVectorIdxConstant(0, DL));
-  Vec = DAG.getNode(
-      ISD::INSERT_SUBVECTOR, DL, VT, Vec, Hi,
-      DAG.getVectorIdxConstant(HalfVT.getVectorMinNumElements(), DL));
-  return Vec;
-}
-
 // If we're concatenating a series of vector loads like
 // concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
 // Then we can turn this into a strided load by widening the vector elements
 // vlse32 p, stride=n
-static SDValue
-performCONCAT_VECTORSStridedLoadCombine(SDNode *N, SelectionDAG &DAG,
-                                        const RISCVSubtarget &Subtarget,
-                                        const RISCVTargetLowering &TLI) {
+static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
+                                            const RISCVSubtarget &Subtarget,
+                                            const RISCVTargetLowering &TLI) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
@@ -16445,10 +16434,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return V;
     break;
   case ISD::CONCAT_VECTORS:
-    if (SDValue V =
-            performCONCAT_VECTORSStridedLoadCombine(N, DAG, Subtarget, *this))
-      return V;
-    if (SDValue V = performCONCAT_VECTORSSplitCombine(N, DAG, *this))
+    if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
       return V;
     break;
   case ISD::INSERT_VECTOR_ELT:
@@ -20923,7 +20909,7 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
   if (Op == Instruction::Add || Op == Instruction::Sub ||
       Op == Instruction::And || Op == Instruction::Or ||
       Op == Instruction::Xor || Op == Instruction::InsertElement ||
-      Op == Instruction::Xor || Op == Instruction::ShuffleVector)
+      Op == Instruction::ShuffleVector || Op == Instruction::Load)
     return false;
 
   if (Inst.getType()->isScalableTy())
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 37a8079dcbf10..4ca300f9151e1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -656,7 +656,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   if (IsScalableVector) {
     MachineMemOperand *MMO = MF->getMachineMemOperand(
         MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
-        MemoryLocation::UnknownSize, MFI.getObjectAlign(FI));
+        LocationSize::beforeOrAfterPointer(), MFI.getObjectAlign(FI));
 
     MFI.setStackID(FI, TargetStackID::ScalableVector);
     BuildMI(MBB, I, DebugLoc(), get(Opcode))
@@ -739,7 +739,7 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   if (IsScalableVector) {
     MachineMemOperand *MMO = MF->getMachineMemOperand(
         MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
-        MemoryLocation::UnknownSize, MFI.getObjectAlign(FI));
+        LocationSize::beforeOrAfterPointer(), MFI.getObjectAlign(FI));
 
     MFI.setStackID(FI, TargetStackID::ScalableVector);
     BuildMI(MBB, I, DebugLoc(), get(Opcode), DstReg)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index e753c1f1add0c..966cdc433d0fd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1405,7 +1405,8 @@ multiclass BccPat<CondCode Cond, RVInstB Inst> {
 
 class BrccCompressOpt<CondCode Cond, RVInstB Inst>
     : Pat<(riscv_brcc GPR:$lhs, simm12_no6:$Constant, Cond, bb:$place),
-          (Inst (ADDI GPR:$lhs, (NegImm simm12:$Constant)), (XLenVT X0), bb:$place)>;
+          (Inst (XLenVT (ADDI GPR:$lhs, (NegImm simm12:$Constant))),
+                (XLenVT X0), bb:$place)>;
 
 defm : BccPat<SETEQ, BEQ>;
 defm : BccPat<SETNE, BNE>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index b4130e3805a11..9a6818c99af20 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -808,3 +808,35 @@ let Predicates = [HasVendorXSfvfnrclipxfqf] in {
   defm : VPatVFNRCLIP<"vfnrclip_xu_f_qf", "VFNRCLIP_XU_F_QF">;
   defm : VPatVFNRCLIP<"vfnrclip_x_f_qf", "VFNRCLIP_X_F_QF">;
 }
+
+let Predicates = [HasVendorXSiFivecdiscarddlone] in {
+  let hasNoSchedulingInfo = 1, hasSideEffects = 1, mayLoad = 0, mayStore = 0,
+      DecoderNamespace = "XSiFivecdiscarddlone" in
+  def SF_CDISCARD_D_L1
+      : RVInstIUnary<0b111111000010, 0b000, OPC_SYSTEM, (outs), (ins GPR:$rs1),
+                     "sf.cdiscard.d.l1", "$rs1">, Sched<[]> {
+    let rd = 0;
+  }
+  def : InstAlias<"sf.cdiscard.d.l1", (SF_CDISCARD_D_L1 X0)>;
+} // Predicates = [HasVendorXSifivecdiscarddlone]
+
+let Predicates = [HasVendorXSiFivecflushdlone] in {
+  let hasNoSchedulingInfo = 1, hasSideEffects = 1, mayLoad = 0, mayStore = 0,
+      DecoderNamespace = "XSiFivecflushdlone" in
+  def SF_CFLUSH_D_L1
+      : RVInstIUnary<0b111111000000, 0b000, OPC_SYSTEM, (outs), (ins GPR:$rs1),
+                     "sf.cflush.d.l1", "$rs1">, Sched<[]> {
+    let rd = 0;
+  }
+  def : InstAlias<"sf.cflush.d.l1", (SF_CFLUSH_D_L1 X0)>;
+} // Predicates = [HasVendorXSifivecflushdlone]
+
+let Predicates = [HasVendorXSfcease] in {
+  let hasNoSchedulingInfo = 1, hasSideEffects = 1, mayLoad = 0, mayStore = 0,
+      DecoderNamespace = "XSfcease" in
+  def SF_CEASE : RVInstIUnary<0b001100000101, 0b000, OPC_SYSTEM, (outs), (ins),
+                              "sf.cease", "">, Sched<[]> {
+    let rs1 = 0b00000;
+    let rd = 0b00000;
+}
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index f0f8494dd9a31..a882b208a7688 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -424,13 +424,13 @@ def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh", Commutable=1>,
 
 let Predicates = [HasStdExtZbb] in {
 def MIN  : ALU_rr<0b0000101, 0b100, "min", Commutable=1>,
-           Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+           Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 def MINU : ALU_rr<0b0000101, 0b101, "minu", Commutable=1>,
-           Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+           Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 def MAX  : ALU_rr<0b0000101, 0b110, "max", Commutable=1>,
-           Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+           Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 def MAXU : ALU_rr<0b0000101, 0b111, "maxu", Commutable=1>,
-           Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+           Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 } // Predicates = [HasStdExtZbb]
 
 let Predicates = [HasStdExtZbkb] in {
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 8c75df41f5e39..fd6d6078ec238 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -245,7 +245,7 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
                                        TuneLUIADDIFusion,
                                        TuneAUIPCADDIFusion]>;
 
-def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", NoSchedModel,
+def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
                                       [Feature64Bit,
                                        FeatureStdExtZifencei,
                                        FeatureStdExtM,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index a68674b221d38..10bf1e88d7414 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -431,29 +431,28 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 
   if (!IsRVVSpill) {
-    if (MI.getOpcode() == RISCV::ADDI && !isInt<12>(Offset.getFixed())) {
+    int64_t Val = Offset.getFixed();
+    int64_t Lo12 = SignExtend64<12>(Val);
+    unsigned Opc = MI.getOpcode();
+    if (Opc == RISCV::ADDI && !isInt<12>(Val)) {
       // We chose to emit the canonical immediate sequence rather than folding
       // the offset into the using add under the theory that doing so doesn't
       // save dynamic instruction count and some target may fuse the canonical
       // 32 bit immediate sequence.  We still need to clear the portion of the
       // offset encoded in the immediate.
       MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+    } else if ((Opc == RISCV::PREFETCH_I || Opc == RISCV::PREFETCH_R ||
+                Opc == RISCV::PREFETCH_W) &&
+               (Lo12 & 0b11111) != 0) {
+      // Prefetch instructions require the offset to be 32 byte aligned.
+      MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
     } else {
       // We can encode an add with 12 bit signed immediate in the immediate
       // operand of our user instruction.  As a result, the remaining
       // offset can by construction, at worst, a LUI and a ADD.
-      int64_t Val = Offset.getFixed();
-      int64_t Lo12 = SignExtend64<12>(Val);
-      if ((MI.getOpcode() == RISCV::PREFETCH_I ||
-           MI.getOpcode() == RISCV::PREFETCH_R ||
-           MI.getOpcode() == RISCV::PREFETCH_W) &&
-          (Lo12 & 0b11111) != 0)
-        MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
-      else {
-        MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Lo12);
-        Offset = StackOffset::get((uint64_t)Val - (uint64_t)Lo12,
-                                  Offset.getScalable());
-      }
+      MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Lo12);
+      Offset = StackOffset::get((uint64_t)Val - (uint64_t)Lo12,
+                                Offset.getScalable());
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 240d170bfcf6f..3586d235bdbbb 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -186,7 +186,7 @@ class SiFive7AnyToGPRBypass<SchedRead read, int cycles = 2>
                                  WriteBEXT, WriteBEXTI,
                                  WriteCLZ, WriteCLZ32, WriteCTZ, WriteCTZ32,
                                  WriteCPOP, WriteCPOP32,
-                                 WriteREV8, WriteORCB, WriteSFB,
+                                 WriteREV8, WriteORCB, WriteIMinMax, WriteSFB,
                                  WriteIMul, WriteIMul32,
                                  WriteIDiv, WriteIDiv32,
                                  WriteIRem, WriteIRem32,
@@ -305,6 +305,9 @@ def : WriteRes<WriteCPOP32, [SiFive7PipeB]>;
 // orc.b is in the late-B ALU.
 def : WriteRes<WriteORCB, [SiFive7PipeB]>;
 
+// min/max are in the late-B ALU
+def : WriteRes<WriteIMinMax, [SiFive7PipeB]>;
+
 // rev8 is in the late-A and late-B ALUs.
 def : WriteRes<WriteREV8, [SiFive7PipeAB]>;
 
@@ -1041,6 +1044,7 @@ def : SiFive7AnyToGPRBypass<ReadCTZ32>;
 def : ReadAdvance<ReadCPOP, 0>;
 def : ReadAdvance<ReadCPOP32, 0>;
 def : SiFive7AnyToGPRBypass<ReadORCB>;
+def : SiFive7AnyToGPRBypass<ReadIMinMax>;
 def : SiFive7AnyToGPRBypass<ReadREV8>;
 def : SiFive7AnyToGPRBypass<ReadSHXADD>;
 def : SiFive7AnyToGPRBypass<ReadSHXADD32>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
index d02d34a0fb9c5..8ec2e4ff885eb 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -109,6 +109,7 @@ def : WriteRes<WriteCTZ, [SiFiveP400IntArith]>;
 def : WriteRes<WriteCTZ32, [SiFiveP400IntArith]>;
 
 def : WriteRes<WriteORCB, [SiFiveP400IntArith]>;
+def : WriteRes<WriteIMinMax, [SiFiveP400IntArith]>;
 
 def : WriteRes<WriteREV8, [SiFiveP400IntArith]>;
 
@@ -349,6 +350,7 @@ def : ReadAdvance<ReadCTZ32, 0>;
 def : ReadAdvance<ReadCPOP, 0>;
 def : ReadAdvance<ReadCPOP32, 0>;
 def : ReadAdvance<ReadORCB, 0>;
+def : ReadAdvance<ReadIMinMax, 0>;
 def : ReadAdvance<ReadREV8, 0>;
 def : ReadAdvance<ReadSHXADD, 0>;
 def : ReadAdvance<ReadSHXADD32, 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
new file mode 100644
index 0000000000000..54016959d348e
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -0,0 +1,1024 @@
+//==- RISCVSchedSiFiveP600.td - SiFiveP600 Scheduling Defs ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+/// c is true if mx has the worst case behavior compared to LMULs in MxList.
+/// On the SiFiveP600, the worst case LMUL is the Largest LMUL
+/// and the worst case sew is the smallest SEW for that LMUL.
+class SiFiveP600IsWorstCaseMX<string mx, list<string> MxList> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  bit c = !eq(mx, LLMUL);
+}
+
+class SiFiveP600IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  int SSEW = SmallestSEW<mx, isF>.r;
+  bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
+// 1 Micro-Op per cycle.
+class SiFiveP600GetLMulCycles<string mx> {
+  int c = !cond(
+    !eq(mx, "M1") : 1,
+    !eq(mx, "M2") : 2,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8,
+    !eq(mx, "MF2") : 1,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF8") : 1
+  );
+}
+
+// Latency for segmented loads and stores are calculated as vl * nf.
+class SiFiveP600GetCyclesSegmented<string mx, int sew, int nf> {
+  defvar VLEN = 128;
+  defvar VLUpperBound = !cond(
+    !eq(mx, "M1") : !div(VLEN, sew),
+    !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
+    !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
+    !eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
+    !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
+    !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
+    !eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
+  );
+  int c = !mul(VLUpperBound, nf);
+}
+
+// SiFiveP600 machine model for scheduling and other instruction cost heuristics.
+def SiFiveP600Model : SchedMachineModel {
+  let IssueWidth = 4;         // 4 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 160; // Max micro-ops that can be buffered.
+  let LoadLatency = 4;        // Cycles for loads to access the cache.
+  let MispredictPenalty = 9;  // Extra cycles for a mispredicted branch.
+  let PostRAScheduler = true;
+  let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
+                             HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
+                             HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
+                             HasVendorXSfvqmaccqoq];
+  let CompleteModel = false;
+}
+
+let SchedModel = SiFiveP600Model in {
+
+def SiFiveP600IEXQ0       : ProcResource<1>;
+def SiFiveP600IEXQ1       : ProcResource<1>;
+def SiFiveP600IEXQ2       : ProcResource<1>;
+def SiFiveP600IEXQ3       : ProcResource<1>;
+def SiFiveP600FEXQ0       : ProcResource<1>;
+def SiFiveP600FEXQ1       : ProcResource<1>;
+
+// Two Load/Store ports that can issue either two loads, two stores, or one load
+// and one store (P550 has one load and one separate store pipe).
+def SiFiveP600LDST       : ProcResource<2>;
+
+// 4-wide pipeline with 4 ALU pipes.
+def SiFiveP600IntArith    : ProcResGroup<[SiFiveP600IEXQ0, SiFiveP600IEXQ1, SiFiveP600IEXQ2, SiFiveP600IEXQ3]>;
+defvar SiFiveP600SYS      = SiFiveP600IEXQ0;
+defvar SiFiveP600CMOV     = SiFiveP600IEXQ0;
+defvar SiFiveP600MulI2F   = SiFiveP600IEXQ1;
+def SiFiveP600Branch      : ProcResGroup<[SiFiveP600IEXQ2, SiFiveP600IEXQ3]>;
+def SiFiveP600Div         : ProcResource<1>;
+
+def SiFiveP600FloatArith  : ProcResGroup<[SiFiveP600FEXQ0, SiFiveP600FEXQ1]>;
+defvar SiFiveP600F2I      = SiFiveP600FEXQ0;
+def SiFiveP600FloatDiv    : ProcResource<1>;
+
+// Vector pipeline
+// VEXQ0 handle Mask, Simple Slide instructions,
+// VEXQ1 handle Complex Slide, Permutation, Reductions, Divide instructions.
+// Other vector instructions can be done in VEXQ0 and VEXQ1.
+def SiFiveP600VEXQ0        : ProcResource<1>;
+def SiFiveP600VEXQ1        : ProcResource<1>;
+def SiFiveP600VectorArith  : ProcResGroup<[SiFiveP600VEXQ0, SiFiveP600VEXQ1]>;
+def SiFiveP600VLD          : ProcResource<1>;
+def SiFiveP600VST          : ProcResource<1>;
+def SiFiveP600VDiv         : ProcResource<1>;
+def SiFiveP600VFloatDiv    : ProcResource<1>;
+
+// Integer arithmetic and logic
+def : WriteRes<WriteIALU, [SiFiveP600IntArith]>;
+def : WriteRes<WriteIALU32, [SiFiveP600IntArith]>;
+def : WriteRes<WriteShiftImm, [SiFiveP600IntArith]>;
+def : WriteRes<WriteShiftImm32, [SiFiveP600IntArith]>;
+def : WriteRes<WriteShiftReg, [SiFiveP600IntArith]>;
+def : WriteRes<WriteShiftReg32, [SiFiveP600IntArith]>;
+// Branching
+def : WriteRes<WriteJmp, [SiFiveP600Branch]>;
+def : WriteRes<WriteJal, [SiFiveP600Branch]>;
+def : WriteRes<WriteJalr, [SiFiveP600Branch]>;
+
+// CMOV
+def P600WriteCMOV : SchedWriteRes<[SiFiveP600Branch, SiFiveP600CMOV]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[P600WriteCMOV], (instrs PseudoCCMOVGPRNoX0)>;
+
+let Latency = 3 in {
+// Integer multiplication
+def : WriteRes<WriteIMul, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteIMul32, [SiFiveP600MulI2F]>;
+// cpop[w] look exactly like multiply.
+def : WriteRes<WriteCPOP, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteCPOP32, [SiFiveP600MulI2F]>;
+}
+
+// Integer division
+def : WriteRes<WriteIDiv, [SiFiveP600MulI2F, SiFiveP600Div]> {
+  let Latency = 35;
+  let ReleaseAtCycles = [1, 34];
+}
+def : WriteRes<WriteIDiv32,  [SiFiveP600MulI2F, SiFiveP600Div]> {
+  let Latency = 20;
+  let ReleaseAtCycles = [1, 19];
+}
+
+// Integer remainder
+def : WriteRes<WriteIRem, [SiFiveP600MulI2F, SiFiveP600Div]> {
+  let Latency = 35;
+  let ReleaseAtCycles = [1, 34];
+}
+def : WriteRes<WriteIRem32, [SiFiveP600MulI2F, SiFiveP600Div]> {
+  let Latency = 20;
+  let ReleaseAtCycles = [1, 19];
+}
+
+// Bitmanip
+def : WriteRes<WriteRotateImm, [SiFiveP600IntArith]>;
+def : WriteRes<WriteRotateImm32, [SiFiveP600IntArith]>;
+def : WriteRes<WriteRotateReg, [SiFiveP600IntArith]>;
+def : WriteRes<WriteRotateReg32, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteCLZ, [SiFiveP600IntArith]>;
+def : WriteRes<WriteCLZ32, [SiFiveP600IntArith]>;
+def : WriteRes<WriteCTZ, [SiFiveP600IntArith]>;
+def : WriteRes<WriteCTZ32, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteORCB, [SiFiveP600IntArith]>;
+def : WriteRes<WriteIMinMax, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteREV8, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteSHXADD, [SiFiveP600IntArith]>;
+def : WriteRes<WriteSHXADD32, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteSingleBit, [SiFiveP600IntArith]>;
+def : WriteRes<WriteSingleBitImm, [SiFiveP600IntArith]>;
+def : WriteRes<WriteBEXT, [SiFiveP600IntArith]>;
+def : WriteRes<WriteBEXTI, [SiFiveP600IntArith]>;
+
+// Memory
+def : WriteRes<WriteSTB, [SiFiveP600LDST]>;
+def : WriteRes<WriteSTH, [SiFiveP600LDST]>;
+def : WriteRes<WriteSTW, [SiFiveP600LDST]>;
+def : WriteRes<WriteSTD, [SiFiveP600LDST]>;
+def : WriteRes<WriteFST16, [SiFiveP600LDST]>;
+def : WriteRes<WriteFST32, [SiFiveP600LDST]>;
+def : WriteRes<WriteFST64, [SiFiveP600LDST]>;
+
+let Latency = 4 in {
+def : WriteRes<WriteLDB, [SiFiveP600LDST]>;
+def : WriteRes<WriteLDH, [SiFiveP600LDST]>;
+}
+let Latency = 4 in {
+def : WriteRes<WriteLDW, [SiFiveP600LDST]>;
+def : WriteRes<WriteLDD, [SiFiveP600LDST]>;
+}
+
+let Latency = 6 in {
+def : WriteRes<WriteFLD16, [SiFiveP600LDST]>;
+def : WriteRes<WriteFLD32, [SiFiveP600LDST]>;
+def : WriteRes<WriteFLD64, [SiFiveP600LDST]>;
+}
+
+// Atomic memory
+let Latency = 3 in {
+def : WriteRes<WriteAtomicSTW, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicSTD, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicW, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicD, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicLDW, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicLDD, [SiFiveP600LDST]>;
+}
+
+// Floating point
+let Latency = 2 in {
+def : WriteRes<WriteFAdd16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFAdd32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFAdd64, [SiFiveP600FloatArith]>;
+}
+let Latency = 3 in {
+def : WriteRes<WriteFMul16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMul32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMul64, [SiFiveP600FloatArith]>;
+}
+let Latency = 4 in {
+def : WriteRes<WriteFMA16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMA32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMA64, [SiFiveP600FloatArith]>;
+}
+
+let Latency = 2 in {
+def : WriteRes<WriteFSGNJ16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFSGNJ32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFSGNJ64, [SiFiveP600FloatArith]>;
+
+def : WriteRes<WriteFMinMax16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMinMax32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMinMax64, [SiFiveP600FloatArith]>;
+}
+
+// Half precision.
+def : WriteRes<WriteFDiv16, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 4;
+  let ReleaseAtCycles = [1, 4];
+}
+def : WriteRes<WriteFSqrt16, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 18;
+  let ReleaseAtCycles = [1, 17];
+}
+
+// Single precision.
+def : WriteRes<WriteFDiv32, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 6;
+  let ReleaseAtCycles = [1, 6];
+}
+def : WriteRes<WriteFSqrt32, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 18;
+  let ReleaseAtCycles = [1, 17];
+}
+
+// Double precision
+def : WriteRes<WriteFDiv64, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 11;
+  let ReleaseAtCycles = [1, 11];
+}
+def : WriteRes<WriteFSqrt64, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 33;
+  let ReleaseAtCycles = [1, 32];
+}
+
+// Conversions
+let Latency = 2 in {
+def : WriteRes<WriteFCvtI32ToF16, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI32ToF32, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI32ToF64, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI64ToF16, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI64ToF32, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI64ToF64, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtF16ToI32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF16ToI64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF16ToF32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF16ToF64, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF32ToI32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF32ToI64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF32ToF16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF32ToF64, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF64ToI32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF64ToI64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF64ToF16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF64ToF32, [SiFiveP600FloatArith]>;
+
+def : WriteRes<WriteFClass16, [SiFiveP600F2I]>;
+def : WriteRes<WriteFClass32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFClass64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCmp16, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCmp32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCmp64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFMovI16ToF16, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFMovF16ToI16, [SiFiveP600F2I]>;
+def : WriteRes<WriteFMovI32ToF32, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFMovF32ToI32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFMovI64ToF64, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFMovF64ToI64, [SiFiveP600F2I]>;
+}
+
+// 6. Configuration-Setting Instructions
+def : WriteRes<WriteVSETVLI, [SiFiveP600SYS]>;
+def : WriteRes<WriteVSETIVLI, [SiFiveP600SYS]>;
+def : WriteRes<WriteVSETVL, [SiFiveP600SYS]>;
+
+// 7. Vector Loads and Stores
+// FIXME: This unit is still being improved, currently
+// it is based on stage numbers. Estimates are optimistic,
+// latency may be longer.
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 8, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVLDE",    [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDM",    [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDFF",   [SiFiveP600VLD], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVLDS8",   [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS16",  [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS32",  [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS64",  [SiFiveP600VLD], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVLDUX8",  [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX8",  [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP600VLD], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 8, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSTE",    [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTM",    [SiFiveP600VST], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSTS8",   [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS16",  [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS32",  [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS64",  [SiFiveP600VST], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSTUX8",  [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX8",  [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP600VST], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxList in {
+  foreach nf=2-8 in {
+    foreach eew = [8, 16, 32, 64] in {
+      defvar LMulLat = SiFiveP600GetCyclesSegmented<mx, eew, nf>.c;
+      defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+      let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
+        defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew,   [SiFiveP600VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" # eew,  [SiFiveP600VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
+      }
+      let Latency = !add(1, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
+        defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" # eew,   [SiFiveP600VST], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" # eew,  [SiFiveP600VST], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" # eew, [SiFiveP600VST], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" # eew, [SiFiveP600VST], mx, IsWorstCase>;
+      }
+    }
+  }
+}
+
+// Whole register move/load/store
+foreach LMul = [1, 2, 4, 8] in {
+  let Latency = 8, ReleaseAtCycles = [LMul] in {
+    def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SiFiveP600VLD]>;
+    def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SiFiveP600VST]>;
+  }
+  let Latency = LMul, ReleaseAtCycles = [LMul] in {
+    def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SiFiveP600VectorArith]>;
+  }
+}
+
+// 11. Vector Integer Arithmetic Instructions
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVIALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIALUI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVExtV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICALUV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICALUX",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICALUI",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMergeV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMergeX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMergeI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMovV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMovX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMovI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVShiftV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVShiftX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVShiftI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+// Widening
+foreach mx = SchedMxListW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVIWALUV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWALUX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWALUI",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// Worst case needs 64 cycles if SEW is equal to 64.
+foreach mx = SchedMxList in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    let Latency = 64, ReleaseAtCycles = [LMulLat, !mul(63, LMulLat)] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SiFiveP600VEXQ1, SiFiveP600VDiv], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SiFiveP600VEXQ1, SiFiveP600VDiv], mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// Narrowing Shift and Clips
+foreach mx = SchedMxListW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVNShiftV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNShiftX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNShiftI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNClipV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNClipX",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNClipI",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSALUI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVAALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVAALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSMulV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSMulX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSShiftV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSShiftX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSShiftI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// 13. Vector Floating-Point Instructions
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFALUV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFALUF",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMulF",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMulAddF", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFCvtIToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFCmpV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFCmpF",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFRecpV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFSgnjV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFSgnjF",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMinMaxF", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFClassV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMergeV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMovV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// Widening
+foreach mx = SchedMxListW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFWCvtIToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+foreach mx = SchedMxListFW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListFW>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWCvtFToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+foreach mx = SchedMxListFW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListFW>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFWALUV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWMulAddF", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWMulF",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWALUF",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+// Narrowing
+foreach mx = SchedMxListW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+foreach mx = SchedMxListFW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListFW>.c;
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFNCvtIToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFNCvtFToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// Worst case needs 76 cycles if SEW is equal to 64.
+foreach mx = SchedMxListF in {
+  foreach sew = SchedSEWSet<mx, 1>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+    let Latency = 76, ReleaseAtCycles = [LMulLat, !mul(76, LMulLat)] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV",  [SiFiveP600VEXQ1, SiFiveP600VFloatDiv], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF",  [SiFiveP600VEXQ1, SiFiveP600VFloatDiv], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SiFiveP600VEXQ1, SiFiveP600VFloatDiv], mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// 14. Vector Reduction Operations
+foreach mx = SchedMxList in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    let Latency = !add(2, !mul(2, LMulLat)), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+foreach mx = SchedMxListWRed in {
+  foreach sew = SchedSEWSet<mx, 0, 1>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+    let Latency = !add(2, !mul(2, LMulLat)), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+foreach mx = SchedMxListF in {
+  foreach sew = SchedSEWSet<mx, 1>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+    let Latency = !add(6, !mul(6, LMulLat)), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From",
+                                     [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+foreach mx = SchedMxListFWRed in {
+  foreach sew = SchedSEWSet<mx, 1, 1>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
+    let Latency = !add(6, !mul(6, LMulLat)), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From",  [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// 15. Vector Mask Instructions
+foreach mx = SchedMxList in {
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 1, ReleaseAtCycles = [1] in {
+    defm "" : LMULWriteResMX<"WriteVMALUV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVMPopV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVMFFSV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVMSFSV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVIotaV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIdxV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+}
+
+// 16. Vector Permutation Instructions
+// Simple Slide
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVISlideI",  [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVISlide1X", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFSlide1F", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+}
+foreach mx = ["MF8", "MF4", "MF2", "M1"] in {
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 2, ReleaseAtCycles = [1] in {
+    defm "" : LMULWriteResMX<"WriteVISlideX", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+}
+
+// Complex Slide
+foreach mx = ["M8", "M4", "M2"] in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = !add(4, LMulLat), ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVISlideX", [SiFiveP600VEXQ1], mx, IsWorstCase>;
+  }
+}
+
+let Latency = 2, ReleaseAtCycles = [1] in {
+  def : WriteRes<WriteVMovSX, [SiFiveP600VectorArith]>;
+  def : WriteRes<WriteVMovXS, [SiFiveP600VectorArith]>;
+}
+let Latency = 6, ReleaseAtCycles = [1] in {
+  def : WriteRes<WriteVMovSF, [SiFiveP600VectorArith]>;
+  def : WriteRes<WriteVMovFS, [SiFiveP600VectorArith]>;
+}
+
+// Simple Gather and Compress
+foreach mx = ["MF8", "MF4", "MF2", "M1"] in {
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 3, ReleaseAtCycles = [1] in {
+    defm "" : LMULWriteResMX<"WriteVRGatherVX", [SiFiveP600VEXQ1], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = ["MF8", "MF4", "MF2", "M1"] in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 3, ReleaseAtCycles = [1] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// Complex Gather and Compress
+foreach mx = ["M2", "M4", "M8"] in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVRGatherVX", [SiFiveP600VEXQ1], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = ["M2", "M4", "M8"] in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// Simple Vrgather.vi
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVRGatherVI", [SiFiveP600VEXQ1], mx, IsWorstCase>;
+  }
+}
+
+// Others
+def : WriteRes<WriteCSR, [SiFiveP600SYS]>;
+def : WriteRes<WriteNop, []>;
+def : WriteRes<WriteRdVLENB, [SiFiveP600SYS]>;
+
+// FIXME: This could be better modeled by looking at the regclasses of the operands.
+def : InstRW<[WriteIALU, ReadIALU], (instrs COPY)>;
+
+//===----------------------------------------------------------------------===//
+// Bypass and advance
+def : ReadAdvance<ReadJmp, 0>;
+def : ReadAdvance<ReadJalr, 0>;
+def : ReadAdvance<ReadCSR, 0>;
+def : ReadAdvance<ReadStoreData, 0>;
+def : ReadAdvance<ReadMemBase, 0>;
+def : ReadAdvance<ReadIALU, 0>;
+def : ReadAdvance<ReadIALU32, 0>;
+def : ReadAdvance<ReadShiftImm, 0>;
+def : ReadAdvance<ReadShiftImm32, 0>;
+def : ReadAdvance<ReadShiftReg, 0>;
+def : ReadAdvance<ReadShiftReg32, 0>;
+def : ReadAdvance<ReadIDiv, 0>;
+def : ReadAdvance<ReadIDiv32, 0>;
+def : ReadAdvance<ReadIRem, 0>;
+def : ReadAdvance<ReadIRem32, 0>;
+def : ReadAdvance<ReadIMul, 0>;
+def : ReadAdvance<ReadIMul32, 0>;
+def : ReadAdvance<ReadAtomicWA, 0>;
+def : ReadAdvance<ReadAtomicWD, 0>;
+def : ReadAdvance<ReadAtomicDA, 0>;
+def : ReadAdvance<ReadAtomicDD, 0>;
+def : ReadAdvance<ReadAtomicLDW, 0>;
+def : ReadAdvance<ReadAtomicLDD, 0>;
+def : ReadAdvance<ReadAtomicSTW, 0>;
+def : ReadAdvance<ReadAtomicSTD, 0>;
+def : ReadAdvance<ReadFStoreData, 0>;
+def : ReadAdvance<ReadFMemBase, 0>;
+def : ReadAdvance<ReadFAdd16, 0>;
+def : ReadAdvance<ReadFAdd32, 0>;
+def : ReadAdvance<ReadFAdd64, 0>;
+def : ReadAdvance<ReadFMul16, 0>;
+def : ReadAdvance<ReadFMA16, 0>;
+def : ReadAdvance<ReadFMA16Addend, 0>;
+def : ReadAdvance<ReadFMul32, 0>;
+def : ReadAdvance<ReadFMA32, 0>;
+def : ReadAdvance<ReadFMA32Addend, 0>;
+def : ReadAdvance<ReadFMul64, 0>;
+def : ReadAdvance<ReadFMA64, 0>;
+def : ReadAdvance<ReadFMA64Addend, 0>;
+def : ReadAdvance<ReadFDiv16, 0>;
+def : ReadAdvance<ReadFDiv32, 0>;
+def : ReadAdvance<ReadFDiv64, 0>;
+def : ReadAdvance<ReadFSqrt16, 0>;
+def : ReadAdvance<ReadFSqrt32, 0>;
+def : ReadAdvance<ReadFSqrt64, 0>;
+def : ReadAdvance<ReadFCmp16, 0>;
+def : ReadAdvance<ReadFCmp32, 0>;
+def : ReadAdvance<ReadFCmp64, 0>;
+def : ReadAdvance<ReadFSGNJ16, 0>;
+def : ReadAdvance<ReadFSGNJ32, 0>;
+def : ReadAdvance<ReadFSGNJ64, 0>;
+def : ReadAdvance<ReadFMinMax16, 0>;
+def : ReadAdvance<ReadFMinMax32, 0>;
+def : ReadAdvance<ReadFMinMax64, 0>;
+def : ReadAdvance<ReadFCvtF16ToI32, 0>;
+def : ReadAdvance<ReadFCvtF16ToI64, 0>;
+def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF16, 0>;
+def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+def : ReadAdvance<ReadFCvtI64ToF16, 0>;
+def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+def : ReadAdvance<ReadFCvtF16ToF32, 0>;
+def : ReadAdvance<ReadFCvtF32ToF16, 0>;
+def : ReadAdvance<ReadFCvtF16ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF16, 0>;
+def : ReadAdvance<ReadFMovF16ToI16, 0>;
+def : ReadAdvance<ReadFMovI16ToF16, 0>;
+def : ReadAdvance<ReadFMovF32ToI32, 0>;
+def : ReadAdvance<ReadFMovI32ToF32, 0>;
+def : ReadAdvance<ReadFMovF64ToI64, 0>;
+def : ReadAdvance<ReadFMovI64ToF64, 0>;
+def : ReadAdvance<ReadFClass16, 0>;
+def : ReadAdvance<ReadFClass32, 0>;
+def : ReadAdvance<ReadFClass64, 0>;
+
+// Bitmanip
+def : ReadAdvance<ReadRotateImm, 0>;
+def : ReadAdvance<ReadRotateImm32, 0>;
+def : ReadAdvance<ReadRotateReg, 0>;
+def : ReadAdvance<ReadRotateReg32, 0>;
+def : ReadAdvance<ReadCLZ, 0>;
+def : ReadAdvance<ReadCLZ32, 0>;
+def : ReadAdvance<ReadCTZ, 0>;
+def : ReadAdvance<ReadCTZ32, 0>;
+def : ReadAdvance<ReadCPOP, 0>;
+def : ReadAdvance<ReadCPOP32, 0>;
+def : ReadAdvance<ReadORCB, 0>;
+def : ReadAdvance<ReadIMinMax, 0>;
+def : ReadAdvance<ReadREV8, 0>;
+def : ReadAdvance<ReadSHXADD, 0>;
+def : ReadAdvance<ReadSHXADD32, 0>;
+def : ReadAdvance<ReadSingleBit, 0>;
+def : ReadAdvance<ReadSingleBitImm, 0>;
+
+// 6. Configuration-Setting Instructions
+def : ReadAdvance<ReadVSETVLI, 0>;
+def : ReadAdvance<ReadVSETVL, 0>;
+
+// 7. Vector Loads and Stores
+def : ReadAdvance<ReadVLDX, 0>;
+def : ReadAdvance<ReadVSTX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTEV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTM", 0>;
+def : ReadAdvance<ReadVLDSX, 0>;
+def : ReadAdvance<ReadVSTSX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTS8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS64V", 0>;
+defm "" : LMULReadAdvance<"ReadVLDUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVLDOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVST1R, 0>;
+def : ReadAdvance<ReadVST2R, 0>;
+def : ReadAdvance<ReadVST4R, 0>;
+def : ReadAdvance<ReadVST8R, 0>;
+
+// 12. Vector Integer Arithmetic Instructions
+defm : LMULReadAdvance<"ReadVIALUV", 0>;
+defm : LMULReadAdvance<"ReadVIALUX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUX", 0>;
+defm : LMULReadAdvance<"ReadVExtV", 0>;
+defm : LMULReadAdvance<"ReadVICALUV", 0>;
+defm : LMULReadAdvance<"ReadVICALUX", 0>;
+defm : LMULReadAdvance<"ReadVShiftV", 0>;
+defm : LMULReadAdvance<"ReadVShiftX", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftV", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftX", 0>;
+defm : LMULReadAdvance<"ReadVICmpV", 0>;
+defm : LMULReadAdvance<"ReadVICmpX", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxV", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxX", 0>;
+defm : LMULReadAdvance<"ReadVIMulV", 0>;
+defm : LMULReadAdvance<"ReadVIMulX", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivV", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulX", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddV", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>;
+defm : LMULReadAdvance<"ReadVIMergeV", 0>;
+defm : LMULReadAdvance<"ReadVIMergeX", 0>;
+defm : LMULReadAdvance<"ReadVIMovV", 0>;
+defm : LMULReadAdvance<"ReadVIMovX", 0>;
+
+// 13. Vector Fixed-Point Arithmetic Instructions
+defm "" : LMULReadAdvance<"ReadVSALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVSALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulV", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulX", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftV", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftX", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>;
+
+// 14. Vector Floating-Point Instructions
+defm "" : LMULReadAdvance<"ReadVFALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVFALUF", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWALUV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWALUF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMulV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWMulV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWMulF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMulAddV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMulAddF", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWMulAddV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>;
+defm "" : LMULReadAdvance<"ReadVFRecpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMinMaxV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMinMaxF", 0>;
+defm "" : LMULReadAdvance<"ReadVFSgnjV", 0>;
+defm "" : LMULReadAdvance<"ReadVFSgnjF", 0>;
+defm "" : LMULReadAdvance<"ReadVFClassV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMovF", 0>;
+defm "" : LMULReadAdvance<"ReadVFCvtIToFV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVFWCvtIToFV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToFV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFNCvtIToFV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFNCvtFToFV", 0>;
+
+// 15. Vector Reduction Operations
+def : ReadAdvance<ReadVIRedV, 0>;
+def : ReadAdvance<ReadVIRedV0, 0>;
+def : ReadAdvance<ReadVIWRedV, 0>;
+def : ReadAdvance<ReadVIWRedV0, 0>;
+def : ReadAdvance<ReadVFRedV, 0>;
+def : ReadAdvance<ReadVFRedV0, 0>;
+def : ReadAdvance<ReadVFRedOV, 0>;
+def : ReadAdvance<ReadVFRedOV0, 0>;
+def : ReadAdvance<ReadVFWRedV, 0>;
+def : ReadAdvance<ReadVFWRedV0, 0>;
+def : ReadAdvance<ReadVFWRedOV, 0>;
+def : ReadAdvance<ReadVFWRedOV0, 0>;
+
+// 16. Vector Mask Instructions
+defm "" : LMULReadAdvance<"ReadVMALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVMPopV", 0>;
+defm "" : LMULReadAdvance<"ReadVMFFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVMSFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVIotaV", 0>;
+
+// 17. Vector Permutation Instructions
+def : ReadAdvance<ReadVMovXS, 0>;
+def : ReadAdvance<ReadVMovSX_V, 0>;
+def : ReadAdvance<ReadVMovSX_X, 0>;
+def : ReadAdvance<ReadVMovFS, 0>;
+def : ReadAdvance<ReadVMovSF_V, 0>;
+def : ReadAdvance<ReadVMovSF_F, 0>;
+defm "" : LMULReadAdvance<"ReadVISlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVISlideX", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVMov1V, 0>;
+def : ReadAdvance<ReadVMov2V, 0>;
+def : ReadAdvance<ReadVMov4V, 0>;
+def : ReadAdvance<ReadVMov8V, 0>;
+
+// Others
+def : ReadAdvance<ReadVMask, 0>;
+def : ReadAdvance<ReadVMergeOp_WorstCase, 0>;
+foreach mx = SchedMxList in {
+  def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx), 0>;
+  foreach sew = SchedSEWSet<mx>.val in
+    def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx  # "_E" # sew), 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Unsupported extensions
+defm : UnsupportedSchedZabha;
+defm : UnsupportedSchedZbc;
+defm : UnsupportedSchedZbkb;
+defm : UnsupportedSchedZbkx;
+defm : UnsupportedSchedSFB;
+defm : UnsupportedSchedZfa;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
index ef491edf3671f..4fc7b0335af53 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
@@ -85,6 +85,7 @@ def : WriteRes<WriteRotateImm32, [XS2ALU]>;
 def : WriteRes<WriteRotateReg, [XS2ALU]>;
 def : WriteRes<WriteRotateReg32, [XS2ALU]>;
 def : WriteRes<WriteORCB, [XS2ALU]>;
+def : WriteRes<WriteIMinMax, [XS2ALU]>;
 def : WriteRes<WriteREV8, [XS2ALU]>;
 
 // Zbkb
@@ -288,6 +289,7 @@ def : ReadAdvance<ReadCTZ32, 0>;
 def : ReadAdvance<ReadCPOP, 0>;
 def : ReadAdvance<ReadCPOP32, 0>;
 def : XS2LoadToALUBypass<ReadORCB>;
+def : XS2LoadToALUBypass<ReadIMinMax>;
 def : XS2LoadToALUBypass<ReadREV8>;
 // Zbkc
 def : ReadAdvance<ReadCLMUL, 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleZb.td b/llvm/lib/Target/RISCV/RISCVScheduleZb.td
index 0a16390e50535..93381f439e0df 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleZb.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleZb.td
@@ -25,6 +25,7 @@ def WriteCPOP        : SchedWrite;
 def WriteCPOP32      : SchedWrite;
 def WriteREV8        : SchedWrite;
 def WriteORCB        : SchedWrite;
+def WriteIMinMax     : SchedWrite;
 
 // Zbc extension
 def WriteCLMUL       : SchedWrite; // CLMUL/CLMULR/CLMULH
@@ -63,6 +64,7 @@ def ReadCPOP        : SchedRead;
 def ReadCPOP32      : SchedRead;
 def ReadREV8        : SchedRead;
 def ReadORCB        : SchedRead;
+def ReadIMinMax     : SchedRead;
 
 // Zbc extension
 def ReadCLMUL       : SchedRead; // CLMUL/CLMULR/CLMULH
@@ -106,6 +108,7 @@ def : WriteRes<WriteCPOP, []>;
 def : WriteRes<WriteCPOP32, []>;
 def : WriteRes<WriteREV8, []>;
 def : WriteRes<WriteORCB, []>;
+def : WriteRes<WriteIMinMax, []>;
 
 def : ReadAdvance<ReadRotateImm, 0>;
 def : ReadAdvance<ReadRotateImm32, 0>;
@@ -119,6 +122,7 @@ def : ReadAdvance<ReadCPOP, 0>;
 def : ReadAdvance<ReadCPOP32, 0>;
 def : ReadAdvance<ReadREV8, 0>;
 def : ReadAdvance<ReadORCB, 0>;
+def : ReadAdvance<ReadIMinMax, 0>;
 }
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index 79f977e5b3226..01c2767119502 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -418,7 +418,16 @@ def : SysReg<"vsieh", 0x214>;
 def : SysReg<"vsiph", 0x254>;
 } // isRV32Only
 
+//===-----------------------------------------------
 // Jump Vector Table CSR
 //===-----------------------------------------------
 
 def : SysReg<"jvt", 0x017>;
+
+//===-----------------------------------------------
+// Resumable Non-Maskable Interrupts(Smrnmi) CSRs
+//===-----------------------------------------------
+def : SysReg<"mnscratch", 0x740>;
+def : SysReg<"mnepc", 0x741>;
+def : SysReg<"mncause", 0x742>;
+def : SysReg<"mnstatus", 0x744>;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ecd373649e2c7..8f46fdc2f7ca9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1620,3 +1620,13 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                   C2.NumIVMuls, C2.NumBaseAdds,
                   C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 }
+
+bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
+  auto *VTy = dyn_cast<VectorType>(DataTy);
+  if (!VTy || VTy->isScalableTy())
+    return false;
+
+  if (!isLegalMaskedLoadStore(DataTy, Alignment))
+    return false;
+  return true;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index af36e9d5d5e88..8daf6845dc8bc 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -261,6 +261,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment);
   }
 
+  bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment);
+
   bool isVScaleKnownToBeAPowerOfTwo() const {
     return TLI->isVScaleKnownToBeAPowerOfTwo();
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 1fbf3c3e11aed..30c67d3fde633 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -29,7 +29,9 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -101,6 +103,21 @@ void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) {
   if (ModuleSectionsEmitted == false) {
     outputModuleSections();
     ModuleSectionsEmitted = true;
+  } else {
+    ST = static_cast<const SPIRVTargetMachine &>(TM).getSubtargetImpl();
+    uint32_t DecSPIRVVersion = ST->getSPIRVVersion();
+    uint32_t Major = DecSPIRVVersion / 10;
+    uint32_t Minor = DecSPIRVVersion - Major * 10;
+    // TODO: calculate Bound more carefully from maximum used register number,
+    // accounting for generated OpLabels and other related instructions if
+    // needed.
+    unsigned Bound = 2 * (ST->getBound() + 1);
+    bool FlagToRestore = OutStreamer->getUseAssemblerInfoForParsing();
+    OutStreamer->setUseAssemblerInfoForParsing(true);
+    if (MCAssembler *Asm = OutStreamer->getAssemblerPtr())
+      Asm->setBuildVersion(static_cast<MachO::PlatformType>(0), Major, Minor,
+                           Bound, VersionTuple(Major, Minor, 0, Bound));
+    OutStreamer->setUseAssemblerInfoForParsing(FlagToRestore);
   }
 }
 
@@ -507,6 +524,13 @@ void SPIRVAsmPrinter::outputAnnotations(const Module &M) {
         report_fatal_error("Unsupported value in llvm.global.annotations");
       Function *Func = cast<Function>(AnnotatedVar);
       Register Reg = MAI->getFuncReg(Func);
+      if (!Reg.isValid()) {
+        std::string DiagMsg;
+        raw_string_ostream OS(DiagMsg);
+        AnnotatedVar->print(OS);
+        DiagMsg = "Unknown function in llvm.global.annotations: " + DiagMsg;
+        report_fatal_error(DiagMsg.c_str());
+      }
 
       // The second field contains a pointer to a global annotation string.
       GlobalVariable *GV =
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index f1fbe2ba1bc41..6f23f055b8c2a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -209,7 +209,7 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
   // spv_assign_ptr_type intrinsic or otherwise use default pointer element
   // type.
   Argument *Arg = F.getArg(ArgIdx);
-  if (Arg->hasByValAttr() || Arg->hasByRefAttr()) {
+  if (HasPointeeTypeAttr(Arg)) {
     Type *ByValRefType = Arg->hasByValAttr() ? Arg->getParamByValType()
                                              : Arg->getParamByRefType();
     SPIRVType *ElementType = GR->getOrCreateSPIRVType(ByValRefType, MIRBuilder);
@@ -319,6 +319,12 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
         buildOpDecorate(VRegs[i][0], MIRBuilder,
                         SPIRV::Decoration::FuncParamAttr, {Attr});
       }
+      if (Arg.hasAttribute(Attribute::ByVal)) {
+        auto Attr =
+            static_cast<unsigned>(SPIRV::FunctionParameterAttribute::ByVal);
+        buildOpDecorate(VRegs[i][0], MIRBuilder,
+                        SPIRV::Decoration::FuncParamAttr, {Attr});
+      }
 
       if (F.getCallingConv() == CallingConv::SPIR_KERNEL) {
         std::vector<SPIRV::Decoration::Decoration> ArgTypeQualDecs =
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 854a19e6a4bac..ee06cc2dda8cd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -91,6 +91,7 @@ class SPIRVEmitIntrinsics
                                         IRBuilder<> &B);
   void insertPtrCastOrAssignTypeInstr(Instruction *I, IRBuilder<> &B);
   void processGlobalValue(GlobalVariable &GV, IRBuilder<> &B);
+  void processParamTypes(Function *F, IRBuilder<> &B);
 
 public:
   static char ID;
@@ -794,6 +795,64 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
   }
 }
 
+void SPIRVEmitIntrinsics::processParamTypes(Function *F, IRBuilder<> &B) {
+  DenseMap<unsigned, Argument *> Args;
+  unsigned i = 0;
+  for (Argument &Arg : F->args()) {
+    if (isUntypedPointerTy(Arg.getType()) &&
+        DeducedElTys.find(&Arg) == DeducedElTys.end() &&
+        !HasPointeeTypeAttr(&Arg))
+      Args[i] = &Arg;
+    i++;
+  }
+  if (Args.size() == 0)
+    return;
+
+  // Args contains opaque pointers without element type definition
+  B.SetInsertPointPastAllocas(F);
+  std::unordered_set<Value *> Visited;
+  for (User *U : F->users()) {
+    CallInst *CI = dyn_cast<CallInst>(U);
+    if (!CI)
+      continue;
+    for (unsigned OpIdx = 0; OpIdx < CI->arg_size() && Args.size() > 0;
+         OpIdx++) {
+      auto It = Args.find(OpIdx);
+      Argument *Arg = It == Args.end() ? nullptr : It->second;
+      if (!Arg)
+        continue;
+      Value *OpArg = CI->getArgOperand(OpIdx);
+      if (!isPointerTy(OpArg->getType()))
+        continue;
+      // maybe we already know the operand's element type
+      auto DeducedIt = DeducedElTys.find(OpArg);
+      Type *ElemTy =
+          DeducedIt == DeducedElTys.end() ? nullptr : DeducedIt->second;
+      if (!ElemTy) {
+        for (User *OpU : OpArg->users()) {
+          if (Instruction *Inst = dyn_cast<Instruction>(OpU)) {
+            Visited.clear();
+            ElemTy = deduceElementTypeHelper(Inst, Visited, DeducedElTys);
+            if (ElemTy)
+              break;
+          }
+        }
+      }
+      if (ElemTy) {
+        unsigned AddressSpace = getPointerAddressSpace(Arg->getType());
+        CallInst *AssignPtrTyCI = buildIntrWithMD(
+            Intrinsic::spv_assign_ptr_type, {Arg->getType()},
+            Constant::getNullValue(ElemTy), Arg, {B.getInt32(AddressSpace)}, B);
+        DeducedElTys[AssignPtrTyCI] = ElemTy;
+        DeducedElTys[Arg] = ElemTy;
+        Args.erase(It);
+      }
+    }
+    if (Args.size() == 0)
+      break;
+  }
+}
+
 bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
   if (Func.isDeclaration())
     return false;
@@ -840,6 +899,11 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
       continue;
     processInstrAfterVisit(I, B);
   }
+
+  // check if function parameter types are set
+  if (!F->isIntrinsic())
+    processParamTypes(F, B);
+
   return true;
 }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index bda9c57e534c3..42f8397a3023b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -24,7 +24,7 @@
 
 using namespace llvm;
 SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize)
-    : PointerSize(PointerSize) {}
+    : PointerSize(PointerSize), Bound(0) {}
 
 SPIRVType *SPIRVGlobalRegistry::assignIntTypeToVReg(unsigned BitWidth,
                                                     Register VReg,
@@ -896,6 +896,15 @@ bool SPIRVGlobalRegistry::isScalarOrVectorSigned(const SPIRVType *Type) const {
   return IntType && IntType->getOperand(2).getImm() != 0;
 }
 
+unsigned SPIRVGlobalRegistry::getPointeeTypeOp(Register PtrReg) {
+  SPIRVType *PtrType = getSPIRVTypeForVReg(PtrReg);
+  SPIRVType *ElemType =
+      PtrType && PtrType->getOpcode() == SPIRV::OpTypePointer
+          ? getSPIRVTypeForVReg(PtrType->getOperand(2).getReg())
+          : nullptr;
+  return ElemType ? ElemType->getOpcode() : 0;
+}
+
 bool SPIRVGlobalRegistry::isBitcastCompatible(const SPIRVType *Type1,
                                               const SPIRVType *Type2) const {
   if (!Type1 || !Type2)
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 25d82ebf9bc79..da480b22a525f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -56,6 +56,9 @@ class SPIRVGlobalRegistry {
   // Number of bits pointers and size_t integers require.
   const unsigned PointerSize;
 
+  // Holds the maximum ID we have in the module.
+  unsigned Bound;
+
   // Add a new OpTypeXXX instruction without checking for duplicates.
   SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
                              SPIRV::AccessQualifier::AccessQualifier AQ =
@@ -108,6 +111,9 @@ class SPIRVGlobalRegistry {
     DT.buildDepsGraph(Graph, MMI);
   }
 
+  void setBound(unsigned V) { Bound = V; }
+  unsigned getBound() { return Bound; }
+
   // Map a machine operand that represents a use of a function via function
   // pointer to a machine operand that represents the function definition.
   // Return either the register or invalid value, because we have no context for
@@ -166,6 +172,9 @@ class SPIRVGlobalRegistry {
     return Res->second;
   }
 
+  // Return a pointee's type op code, or 0 otherwise.
+  unsigned getPointeeTypeOp(Register PtrReg);
+
   // Either generate a new OpTypeXXX instruction or return an existing one
   // corresponding to the given string containing the name of the builtin type.
   // Return nullptr if unable to recognize SPIRV type name from `TypeStr`.
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index 61748070fc0fb..e6e9131d8dc28 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -104,11 +104,7 @@ static void validatePtrTypes(const SPIRVSubtarget &STI,
   SPIRV::StorageClass::StorageClass SC =
       static_cast<SPIRV::StorageClass::StorageClass>(
           OpType->getOperand(1).getImm());
-  MachineInstr *PrevI = I.getPrevNode();
-  MachineBasicBlock &MBB = *I.getParent();
-  MachineBasicBlock::iterator InsPt =
-      PrevI ? PrevI->getIterator() : MBB.begin();
-  MachineIRBuilder MIB(MBB, InsPt);
+  MachineIRBuilder MIB(I);
   SPIRVType *NewPtrType = GR.getOrCreateSPIRVPointerType(ResType, MIB, SC);
   if (!GR.isBitcastCompatible(NewPtrType, OpType))
     report_fatal_error(
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index fd19b7412c4c9..0fef19c2d5341 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -1567,7 +1567,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
                                                const SPIRVType *ResType,
                                                MachineInstr &I) const {
   MachineBasicBlock &BB = *I.getParent();
-  switch (cast<GIntrinsic>(I).getIntrinsicID()) {
+  Intrinsic::ID IID = cast<GIntrinsic>(I).getIntrinsicID();
+  switch (IID) {
   case Intrinsic::spv_load:
     return selectLoad(ResVReg, ResType, I);
   case Intrinsic::spv_store:
@@ -1661,8 +1662,25 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     break;
   case Intrinsic::spv_thread_id:
     return selectSpvThreadId(ResVReg, ResType, I);
-  default:
-    llvm_unreachable("Intrinsic selection not implemented");
+  case Intrinsic::spv_lifetime_start:
+  case Intrinsic::spv_lifetime_end: {
+    unsigned Op = IID == Intrinsic::spv_lifetime_start ? SPIRV::OpLifetimeStart
+                                                       : SPIRV::OpLifetimeStop;
+    int64_t Size = I.getOperand(I.getNumExplicitDefs() + 1).getImm();
+    Register PtrReg = I.getOperand(I.getNumExplicitDefs() + 2).getReg();
+    unsigned PonteeOpType = GR.getPointeeTypeOp(PtrReg);
+    bool IsNonvoidPtr = PonteeOpType != 0 && PonteeOpType != SPIRV::OpTypeVoid;
+    if (Size == -1 || IsNonvoidPtr)
+      Size = 0;
+    BuildMI(BB, I, I.getDebugLoc(), TII.get(Op)).addUse(PtrReg).addImm(Size);
+  } break;
+  default: {
+    std::string DiagMsg;
+    raw_string_ostream OS(DiagMsg);
+    I.print(OS);
+    DiagMsg = "Intrinsic selection not implemented: " + DiagMsg;
+    report_fatal_error(DiagMsg.c_str(), false);
+  }
   }
   return true;
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
index 8c6649bf62826..afa550d6dd424 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
@@ -34,7 +34,13 @@ void SPIRVMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI,
       llvm_unreachable("unknown operand type");
     case MachineOperand::MO_GlobalAddress: {
       Register FuncReg = MAI->getFuncReg(dyn_cast<Function>(MO.getGlobal()));
-      assert(FuncReg.isValid() && "Cannot find function Id");
+      if (!FuncReg.isValid()) {
+        std::string DiagMsg;
+        raw_string_ostream OS(DiagMsg);
+        MI->print(OS);
+        DiagMsg = "Unknown function in:" + DiagMsg;
+        report_fatal_error(DiagMsg.c_str());
+      }
       MCOp = MCOperand::createReg(FuncReg);
       break;
     }
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 2b4cb5ccc7b1e..00d0cbd763736 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1309,5 +1309,8 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) {
   if (MAI.MS[SPIRV::MB_EntryPoints].empty())
     MAI.Reqs.addCapability(SPIRV::Capability::Linkage);
 
+  // Set maximum ID used.
+  GR->setBound(MAI.MaxID);
+
   return false;
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index 708384fc55f52..6e86eed30c5dc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -163,8 +163,8 @@ struct ModuleAnalysisInfo {
   Register getFuncReg(const Function *F) {
     assert(F && "Function is null");
     auto FuncPtrRegPair = FuncMap.find(F);
-    assert(FuncPtrRegPair != FuncMap.end() && "Cannot find function ID");
-    return FuncPtrRegPair->second;
+    return FuncPtrRegPair == FuncMap.end() ? Register(0)
+                                           : FuncPtrRegPair->second;
   }
   Register getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; }
   InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; }
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index c376497469ce3..a8a0577f60564 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -263,6 +263,21 @@ static void lowerExpectAssume(IntrinsicInst *II) {
   return;
 }
 
+static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID,
+                                     ArrayRef<unsigned> OpNos) {
+  Function *F = nullptr;
+  if (OpNos.empty()) {
+    F = Intrinsic::getDeclaration(II->getModule(), NewID);
+  } else {
+    SmallVector<Type *, 4> Tys;
+    for (unsigned OpNo : OpNos)
+      Tys.push_back(II->getOperand(OpNo)->getType());
+    F = Intrinsic::getDeclaration(II->getModule(), NewID, Tys);
+  }
+  II->setCalledFunction(F);
+  return true;
+}
+
 static void lowerUMulWithOverflow(IntrinsicInst *UMulIntrinsic) {
   // Get a separate function - otherwise, we'd have to rework the CFG of the
   // current one. Then simply replace the intrinsic uses with a call to the new
@@ -290,22 +305,35 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
       if (!CF || !CF->isIntrinsic())
         continue;
       auto *II = cast<IntrinsicInst>(Call);
-      if (II->getIntrinsicID() == Intrinsic::memset ||
-          II->getIntrinsicID() == Intrinsic::bswap)
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::memset:
+      case Intrinsic::bswap:
         Changed |= lowerIntrinsicToFunction(II);
-      else if (II->getIntrinsicID() == Intrinsic::fshl ||
-               II->getIntrinsicID() == Intrinsic::fshr) {
+        break;
+      case Intrinsic::fshl:
+      case Intrinsic::fshr:
         lowerFunnelShifts(II);
         Changed = true;
-      } else if (II->getIntrinsicID() == Intrinsic::umul_with_overflow) {
+        break;
+      case Intrinsic::umul_with_overflow:
         lowerUMulWithOverflow(II);
         Changed = true;
-      } else if (II->getIntrinsicID() == Intrinsic::assume ||
-                 II->getIntrinsicID() == Intrinsic::expect) {
+        break;
+      case Intrinsic::assume:
+      case Intrinsic::expect: {
         const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
         if (STI.canUseExtension(SPIRV::Extension::SPV_KHR_expect_assume))
           lowerExpectAssume(II);
         Changed = true;
+      } break;
+      case Intrinsic::lifetime_start:
+        Changed |= toSpvOverloadedIntrinsic(
+            II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1});
+        break;
+      case Intrinsic::lifetime_end:
+        Changed |= toSpvOverloadedIntrinsic(
+            II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1});
+        break;
       }
     }
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
index 62524ebfc9bf8..3b486226a9393 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
@@ -71,6 +71,7 @@ class SPIRVSubtarget : public SPIRVGenSubtargetInfo {
   // The definition of this function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
   unsigned getPointerSize() const { return PointerSize; }
+  unsigned getBound() const { return GR->getBound(); }
   bool canDirectlyComparePointers() const;
   // TODO: this environment is not implemented in Triple, we need to decide
   // how to standardize its support. For now, let's assume SPIR-V with physical
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index d5ed501def998..eb87349f0941c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -126,5 +126,10 @@ inline unsigned getPointerAddressSpace(const Type *T) {
              : cast<TypedPointerType>(SubT)->getAddressSpace();
 }
 
+// Return true if the Argument is decorated with a pointee type
+inline bool HasPointeeTypeAttr(Argument *Arg) {
+  return Arg->hasByValAttr() || Arg->hasByRefAttr();
+}
+
 } // namespace llvm
 #endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 815eca1240d82..deaf3dcaeb92a 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -344,6 +344,9 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   // requirements for a PC-relative access.
   bool storeLoadIsAligned(SDNode *N) const;
 
+  // Return the load extension type of a load or atomic load.
+  ISD::LoadExtType getLoadExtType(SDNode *N) const;
+
   // Try to expand a boolean SELECT_CCMASK using an IPM sequence.
   SDValue expandSelectBoolean(SDNode *Node);
 
@@ -1507,15 +1510,17 @@ bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
 
 bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
 
-  auto *MemAccess = cast<LSBaseSDNode>(N);
+  auto *MemAccess = cast<MemSDNode>(N);
+  auto *LdSt = dyn_cast<LSBaseSDNode>(MemAccess);
   TypeSize StoreSize = MemAccess->getMemoryVT().getStoreSize();
   SDValue BasePtr = MemAccess->getBasePtr();
   MachineMemOperand *MMO = MemAccess->getMemOperand();
   assert(MMO && "Expected a memory operand.");
 
   // The memory access must have a proper alignment and no index register.
+  // Only load and store nodes have the offset operand (atomic loads do not).
   if (MemAccess->getAlign().value() < StoreSize ||
-      !MemAccess->getOffset().isUndef())
+      (LdSt && !LdSt->getOffset().isUndef()))
     return false;
 
   // The MMO must not have an unaligned offset.
@@ -1545,6 +1550,17 @@ bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
   return true;
 }
 
+ISD::LoadExtType SystemZDAGToDAGISel::getLoadExtType(SDNode *N) const {
+  ISD::LoadExtType ETy;
+  if (auto *L = dyn_cast<LoadSDNode>(N))
+    ETy = L->getExtensionType();
+  else if (auto *AL = dyn_cast<AtomicSDNode>(N))
+    ETy = AL->getExtensionType();
+  else
+    llvm_unreachable("Unkown load node type.");
+  return ETy;
+}
+
 void SystemZDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
@@ -1742,6 +1758,26 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+
+  case ISD::ATOMIC_STORE: {
+    auto *AtomOp = cast<AtomicSDNode>(Node);
+    // Replace the atomic_store with a regular store and select it. This is
+    // ok since we know all store instructions <= 8 bytes are atomic, and the
+    // 16 byte case is already handled during lowering.
+    StoreSDNode *St = cast<StoreSDNode>(CurDAG->getTruncStore(
+         AtomOp->getChain(), SDLoc(AtomOp), AtomOp->getVal(),
+         AtomOp->getBasePtr(), AtomOp->getMemoryVT(), AtomOp->getMemOperand()));
+    assert(St->getMemOperand()->isAtomic() && "Broken MMO.");
+    SDNode *Chain = St;
+    // We have to enforce sequential consistency by performing a
+    // serialization operation after the store.
+    if (AtomOp->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Chain = CurDAG->getMachineNode(SystemZ::Serialize, SDLoc(AtomOp),
+                                     MVT::Other, SDValue(Chain, 0));
+    ReplaceNode(Node, Chain);
+    SelectCode(St);
+    return;
+  }
   }
 
   SelectCode(Node);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 3b85a6ac0371e..bc60d14ee700f 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -194,11 +194,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::UADDO_CARRY, VT, Custom);
       setOperationAction(ISD::USUBO_CARRY, VT, Custom);
 
-      // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
-      // stores, putting a serialization instruction after the stores.
-      setOperationAction(ISD::ATOMIC_LOAD,  VT, Custom);
-      setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
-
       // Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
       // available, or if the operand is constant.
       setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
@@ -920,6 +915,22 @@ bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const
   return false;
 }
 
+TargetLowering::AtomicExpansionKind
+SystemZTargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+  // Lower fp128 the same way as i128.
+  if (LI->getType()->isFP128Ty())
+    return AtomicExpansionKind::CastToInteger;
+  return AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+SystemZTargetLowering::shouldCastAtomicStoreInIR(StoreInst *SI) const {
+  // Lower fp128 the same way as i128.
+  if (SI->getValueOperand()->getType()->isFP128Ty())
+    return AtomicExpansionKind::CastToInteger;
+  return AtomicExpansionKind::None;
+}
+
 TargetLowering::AtomicExpansionKind
 SystemZTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   // Don't expand subword operations as they require special treatment.
@@ -4252,6 +4263,7 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
   if (N->getValueType(0) == MVT::i128) {
     unsigned BaseOp = 0;
     unsigned FlagOp = 0;
+    bool IsBorrow = false;
     switch (Op.getOpcode()) {
     default: llvm_unreachable("Unknown instruction!");
     case ISD::UADDO:
@@ -4261,6 +4273,7 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
     case ISD::USUBO:
       BaseOp = ISD::SUB;
       FlagOp = SystemZISD::VSCBI;
+      IsBorrow = true;
       break;
     }
     SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS);
@@ -4268,6 +4281,9 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
     Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag,
                        DAG.getValueType(MVT::i1));
     Flag = DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1));
+    if (IsBorrow)
+      Flag = DAG.getNode(ISD::XOR, DL, Flag.getValueType(),
+                         Flag, DAG.getConstant(1, DL, Flag.getValueType()));
     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag);
   }
 
@@ -4340,6 +4356,7 @@ SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op,
   if (VT == MVT::i128) {
     unsigned BaseOp = 0;
     unsigned FlagOp = 0;
+    bool IsBorrow = false;
     switch (Op.getOpcode()) {
     default: llvm_unreachable("Unknown instruction!");
     case ISD::UADDO_CARRY:
@@ -4349,14 +4366,21 @@ SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op,
     case ISD::USUBO_CARRY:
       BaseOp = SystemZISD::VSBI;
       FlagOp = SystemZISD::VSBCBI;
+      IsBorrow = true;
       break;
     }
+    if (IsBorrow)
+      Carry = DAG.getNode(ISD::XOR, DL, Carry.getValueType(),
+                          Carry, DAG.getConstant(1, DL, Carry.getValueType()));
     Carry = DAG.getZExtOrTrunc(Carry, DL, MVT::i128);
     SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS, Carry);
     SDValue Flag = DAG.getNode(FlagOp, DL, MVT::i128, LHS, RHS, Carry);
     Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag,
                        DAG.getValueType(MVT::i1));
     Flag = DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1));
+    if (IsBorrow)
+      Flag = DAG.getNode(ISD::XOR, DL, Flag.getValueType(),
+                         Flag, DAG.getConstant(1, DL, Flag.getValueType()));
     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag);
   }
 
@@ -4503,40 +4527,14 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
   return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
 }
 
-// Op is an atomic load.  Lower it into a normal volatile load.
-SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
-                                                SelectionDAG &DAG) const {
-  auto *Node = cast<AtomicSDNode>(Op.getNode());
-  if (Node->getMemoryVT() == MVT::i128) {
-    // Use same code to handle both legal and non-legal i128 types.
-    SmallVector<SDValue, 2> Results;
-    LowerOperationWrapper(Node, Results, DAG);
-    return DAG.getMergeValues(Results, SDLoc(Op));
-  }
-  return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
-                        Node->getChain(), Node->getBasePtr(),
-                        Node->getMemoryVT(), Node->getMemOperand());
-}
-
-// Op is an atomic store.  Lower it into a normal volatile store.
-SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
-                                                 SelectionDAG &DAG) const {
+SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
+                                                     SelectionDAG &DAG) const {
   auto *Node = cast<AtomicSDNode>(Op.getNode());
-  if (Node->getMemoryVT() == MVT::i128) {
-    // Use same code to handle both legal and non-legal i128 types.
-    SmallVector<SDValue, 1> Results;
-    LowerOperationWrapper(Node, Results, DAG);
-    return DAG.getMergeValues(Results, SDLoc(Op));
-  }
-  SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
-                                    Node->getBasePtr(), Node->getMemoryVT(),
-                                    Node->getMemOperand());
-  // We have to enforce sequential consistency by performing a
-  // serialization operation after the store.
-  if (Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent)
-    Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
-                                       MVT::Other, Chain), 0);
-  return Chain;
+  assert(Node->getMemoryVT() == MVT::i128 && "Only custom lowering i128.");
+  // Use same code to handle both legal and non-legal i128 types.
+  SmallVector<SDValue, 2> Results;
+  LowerOperationWrapper(Node, Results, DAG);
+  return DAG.getMergeValues(Results, SDLoc(Op));
 }
 
 // Prepare for a Compare And Swap for a subword operation. This needs to be
@@ -5662,6 +5660,9 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
 bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
   if (Op.getOpcode() == ISD::LOAD && cast<LoadSDNode>(Op)->isUnindexed())
     return true;
+  if (auto *AL = dyn_cast<AtomicSDNode>(Op))
+    if (AL->getOpcode() == ISD::ATOMIC_LOAD)
+      return true;
   if (Subtarget.hasVectorEnhancements2() && Op.getOpcode() == SystemZISD::LRV)
     return true;
   return false;
@@ -6138,9 +6139,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
   case ISD::ATOMIC_SWAP:
     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
   case ISD::ATOMIC_STORE:
-    return lowerATOMIC_STORE(Op, DAG);
   case ISD::ATOMIC_LOAD:
-    return lowerATOMIC_LOAD(Op, DAG);
+    return lowerATOMIC_LDST_I128(Op, DAG);
   case ISD::ATOMIC_LOAD_ADD:
     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
   case ISD::ATOMIC_LOAD_SUB:
@@ -6587,6 +6587,27 @@ SDValue SystemZTargetLowering::combineTruncateExtract(
   return SDValue();
 }
 
+// Replace ALoad with a new ATOMIC_LOAD with a result that is extended to VT
+// per ETy.
+static SDValue extendAtomicLoad(AtomicSDNode *ALoad, EVT VT, SelectionDAG &DAG,
+                                ISD::LoadExtType ETy) {
+  if (VT.getSizeInBits() > 64)
+    return SDValue();
+  EVT OrigVT = ALoad->getValueType(0);
+  assert(OrigVT.getSizeInBits() < VT.getSizeInBits() && "VT should be wider.");
+  EVT MemoryVT = ALoad->getMemoryVT();
+  auto *NewALoad = dyn_cast<AtomicSDNode>(DAG.getAtomic(
+      ISD::ATOMIC_LOAD, SDLoc(ALoad), MemoryVT, VT, ALoad->getChain(),
+      ALoad->getBasePtr(), ALoad->getMemOperand()));
+  NewALoad->setExtensionType(ETy);
+  DAG.ReplaceAllUsesOfValueWith(
+      SDValue(ALoad, 0),
+      DAG.getNode(ISD::TRUNCATE, SDLoc(ALoad), OrigVT, SDValue(NewALoad, 0)));
+  // Update the chain uses.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(ALoad, 1), SDValue(NewALoad, 1));
+  return SDValue(NewALoad, 0);
+}
+
 SDValue SystemZTargetLowering::combineZERO_EXTEND(
     SDNode *N, DAGCombinerInfo &DCI) const {
   // Convert (zext (select_ccmask C1, C2)) into (select_ccmask C1', C2')
@@ -6611,6 +6632,34 @@ SDValue SystemZTargetLowering::combineZERO_EXTEND(
       return NewSelect;
     }
   }
+  // Convert (zext (xor (trunc X), C)) into (xor (trunc X), C') if the size
+  // of the result is smaller than the size of X and all the truncated bits
+  // of X are already zero.
+  if (N0.getOpcode() == ISD::XOR &&
+      N0.hasOneUse() && N0.getOperand(0).hasOneUse() &&
+      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue X = N0.getOperand(0).getOperand(0);
+    if (VT.isScalarInteger() && VT.getSizeInBits() < X.getValueSizeInBits()) {
+      KnownBits Known = DAG.computeKnownBits(X);
+      APInt TruncatedBits = APInt::getBitsSet(X.getValueSizeInBits(),
+                                              N0.getValueSizeInBits(),
+                                              VT.getSizeInBits());
+      if (TruncatedBits.isSubsetOf(Known.Zero)) {
+        X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
+        APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
+        return DAG.getNode(ISD::XOR, SDLoc(N0), VT,
+                           X, DAG.getConstant(Mask, SDLoc(N0), VT));
+      }
+    }
+  }
+
+  // Fold into ATOMIC_LOAD unless it is already sign extending.
+  if (auto *ALoad = dyn_cast<AtomicSDNode>(N0))
+    if (ALoad->getOpcode() == ISD::ATOMIC_LOAD &&
+        ALoad->getExtensionType() != ISD::SEXTLOAD)
+      return extendAtomicLoad(ALoad, VT, DAG, ISD::ZEXTLOAD);
+
   return SDValue();
 }
 
@@ -6662,6 +6711,13 @@ SDValue SystemZTargetLowering::combineSIGN_EXTEND(
       }
     }
   }
+
+  // Fold into ATOMIC_LOAD unless it is already zero extending.
+  if (auto *ALoad = dyn_cast<AtomicSDNode>(N0))
+    if (ALoad->getOpcode() == ISD::ATOMIC_LOAD &&
+        ALoad->getExtensionType() != ISD::ZEXTLOAD)
+      return extendAtomicLoad(ALoad, VT, DAG, ISD::SEXTLOAD);
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index baf4ba4165487..406a13b9281ca 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -474,6 +474,8 @@ class SystemZTargetLowering : public TargetLowering {
     return VT != MVT::f64;
   }
   bool hasInlineStackProbe(const MachineFunction &MF) const override;
+  AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override;
+  AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override;
   AtomicExpansionKind
   shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override;
   bool isLegalICmpImmediate(int64_t Imm) const override;
@@ -692,8 +694,7 @@ class SystemZTargetLowering : public TargetLowering {
   SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
                               unsigned Opcode) const;
   SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 6e67425c1e788..f4b5aeaebef92 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -129,8 +129,8 @@ defm LoadStoreF128 : MVCLoadStore<load, f128, MVCImm, 15>;
 //===----------------------------------------------------------------------===//
 
 let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
-  defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
-  defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
+  defm LE : UnaryRXPair<"le", 0x78, 0xED64, z_load, FP32, 4>;
+  defm LD : UnaryRXPair<"ld", 0x68, 0xED65, z_load, FP64, 8>;
 
   // For z13 we prefer LDE over LE to avoid partial register dependencies.
   let isCodeGenOnly = 1 in
@@ -200,14 +200,14 @@ let Predicates = [FeatureNoVectorEnhancements1] in {
 
 // Extend memory floating-point values to wider representations.
 let Uses = [FPC], mayRaiseFPException = 1 in {
-  def LDEB : UnaryRXE<"ldeb", 0xED04, any_extloadf32, FP64, 4>;
+  def LDEB : UnaryRXE<"ldeb", 0xED04, z_any_extloadf32, FP64, 4>;
   def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>;
   def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>;
 }
 let Predicates = [FeatureNoVectorEnhancements1] in {
-  def : Pat<(f128 (any_extloadf32 bdxaddr12only:$src)),
+  def : Pat<(f128 (z_any_extloadf32 bdxaddr12only:$src)),
             (LXEB bdxaddr12only:$src)>;
-  def : Pat<(f128 (any_extloadf64 bdxaddr12only:$src)),
+  def : Pat<(f128 (z_any_extloadf64 bdxaddr12only:$src)),
             (LXDB bdxaddr12only:$src)>;
 }
 
@@ -430,8 +430,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
     def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64,  FP64>;
     def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
   }
-  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
-  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, load, 8>;
+  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_load, 4>;
+  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_load, 8>;
 }
 
 // Subtraction.
@@ -441,8 +441,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
   def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64,  FP64>;
   def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
 
-  defm SEB : BinaryRXEAndPseudo<"seb",  0xED0B, any_fsub, FP32, load, 4>;
-  defm SDB : BinaryRXEAndPseudo<"sdb",  0xED1B, any_fsub, FP64, load, 8>;
+  defm SEB : BinaryRXEAndPseudo<"seb",  0xED0B, any_fsub, FP32, z_load, 4>;
+  defm SDB : BinaryRXEAndPseudo<"sdb",  0xED1B, any_fsub, FP64, z_load, 8>;
 }
 
 // Multiplication.
@@ -452,8 +452,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
     def MDBR  : BinaryRRE<"mdbr",  0xB31C, any_fmul, FP64,  FP64>;
     def MXBR  : BinaryRRE<"mxbr",  0xB34C, any_fmul, FP128, FP128>;
   }
-  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, load, 4>;
-  defm MDB  : BinaryRXEAndPseudo<"mdb",  0xED1C, any_fmul, FP64, load, 8>;
+  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_load, 4>;
+  defm MDB  : BinaryRXEAndPseudo<"mdb",  0xED1C, any_fmul, FP64, z_load, 8>;
 }
 
 // f64 multiplication of two FP32 registers.
@@ -466,7 +466,7 @@ def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
 
 // f64 multiplication of an FP32 register and an f32 memory.
 let Uses = [FPC], mayRaiseFPException = 1 in
-  def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
+  def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, z_load, 4>;
 def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
                     (f64 (any_extloadf32 bdxaddr12only:$addr))),
           (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
@@ -483,7 +483,7 @@ let Predicates = [FeatureNoVectorEnhancements1] in
 
 // f128 multiplication of an FP64 register and an f64 memory.
 let Uses = [FPC], mayRaiseFPException = 1 in
-  def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
+  def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, z_load, 8>;
 let Predicates = [FeatureNoVectorEnhancements1] in
   def : Pat<(any_fmul (f128 (any_fpextend FP64:$src1)),
                       (f128 (any_extloadf64 bdxaddr12only:$addr))),
@@ -495,8 +495,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
   def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
   def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
 
-  defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
-  defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
+  defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, z_load, 4>;
+  defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, z_load, 8>;
 }
 
 // Fused multiply-subtract.
@@ -504,8 +504,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
   def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>;
   def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>;
 
-  defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
-  defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
+  defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, z_load, 4>;
+  defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, z_load, 8>;
 }
 
 // Division.
@@ -514,8 +514,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
   def DDBR : BinaryRRE<"ddbr", 0xB31D, any_fdiv, FP64,  FP64>;
   def DXBR : BinaryRRE<"dxbr", 0xB34D, any_fdiv, FP128, FP128>;
 
-  defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, load, 4>;
-  defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, load, 8>;
+  defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, z_load, 4>;
+  defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, z_load, 8>;
 }
 
 // Divide to integer.
@@ -533,15 +533,15 @@ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC], CCValues = 0xF in {
   def CDBR : CompareRRE<"cdbr", 0xB319, z_any_fcmp, FP64,  FP64>;
   def CXBR : CompareRRE<"cxbr", 0xB349, z_any_fcmp, FP128, FP128>;
 
-  def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, load, 4>;
-  def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, load, 8>;
+  def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, z_load, 4>;
+  def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, z_load, 8>;
 
   def KEBR : CompareRRE<"kebr", 0xB308, z_strict_fcmps, FP32,  FP32>;
   def KDBR : CompareRRE<"kdbr", 0xB318, z_strict_fcmps, FP64,  FP64>;
   def KXBR : CompareRRE<"kxbr", 0xB348, z_strict_fcmps, FP128, FP128>;
 
-  def KEB : CompareRXE<"keb", 0xED08, z_strict_fcmps, FP32, load, 4>;
-  def KDB : CompareRXE<"kdb", 0xED18, z_strict_fcmps, FP64, load, 8>;
+  def KEB : CompareRXE<"keb", 0xED08, z_strict_fcmps, FP32, z_load, 4>;
+  def KDB : CompareRXE<"kdb", 0xED18, z_strict_fcmps, FP64, z_load, 8>;
 }
 
 // Test Data Class.
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index bb9fa0fc33ffa..3dba33b66bf4f 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -3777,7 +3777,7 @@ class BinarySI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
                Operand imm, AddressingMode mode = bdaddr12only>
   : InstSI<opcode, (outs), (ins (mode $B1, $D1):$BD1, imm:$I2),
            mnemonic#"\t$BD1, $I2",
-           [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+           [(store (operator (z_load mode:$BD1), imm:$I2), mode:$BD1)]> {
   let mayLoad = 1;
   let mayStore = 1;
 }
@@ -3786,7 +3786,7 @@ class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                 Operand imm, AddressingMode mode = bdaddr20only>
   : InstSIY<opcode, (outs), (ins (mode $B1, $D1):$BD1, imm:$I2),
             mnemonic#"\t$BD1, $I2",
-            [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+            [(store (operator (z_load mode:$BD1), imm:$I2), mode:$BD1)]> {
   let mayLoad = 1;
   let mayStore = 1;
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
index 2e3c9932d6214..d2e05b63c6c63 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
@@ -134,8 +134,8 @@ let Defs = [CC] in {
     def ADR : BinaryRR<"adr", 0x2A, null_frag, FP64,  FP64>;
     def AXR : BinaryRR<"axr", 0x36, null_frag, FP128, FP128>;
   }
-  def AE : BinaryRX<"ae", 0x7A, null_frag, FP32, load, 4>;
-  def AD : BinaryRX<"ad", 0x6A, null_frag, FP64, load, 8>;
+  def AE : BinaryRX<"ae", 0x7A, null_frag, FP32, z_load, 4>;
+  def AD : BinaryRX<"ad", 0x6A, null_frag, FP64, z_load, 8>;
 }
 
 // Addition (unnormalized).
@@ -144,8 +144,8 @@ let Defs = [CC] in {
     def AUR : BinaryRR<"aur", 0x3E, null_frag, FP32, FP32>;
     def AWR : BinaryRR<"awr", 0x2E, null_frag, FP64, FP64>;
   }
-  def AU : BinaryRX<"au", 0x7E, null_frag, FP32, load, 4>;
-  def AW : BinaryRX<"aw", 0x6E, null_frag, FP64, load, 8>;
+  def AU : BinaryRX<"au", 0x7E, null_frag, FP32, z_load, 4>;
+  def AW : BinaryRX<"aw", 0x6E, null_frag, FP64, z_load, 8>;
 }
 
 // Subtraction.
@@ -154,8 +154,8 @@ let Defs = [CC] in {
   def SDR : BinaryRR<"sdr", 0x2B, null_frag, FP64,  FP64>;
   def SXR : BinaryRR<"sxr", 0x37, null_frag, FP128, FP128>;
 
-  def SE : BinaryRX<"se", 0x7B, null_frag, FP32, load, 4>;
-  def SD : BinaryRX<"sd", 0x6B, null_frag, FP64, load, 8>;
+  def SE : BinaryRX<"se", 0x7B, null_frag, FP32, z_load, 4>;
+  def SD : BinaryRX<"sd", 0x6B, null_frag, FP64, z_load, 8>;
 }
 
 // Subtraction (unnormalized).
@@ -163,8 +163,8 @@ let Defs = [CC] in {
   def SUR : BinaryRR<"sur", 0x3F, null_frag, FP32, FP32>;
   def SWR : BinaryRR<"swr", 0x2F, null_frag, FP64, FP64>;
 
-  def SU : BinaryRX<"su", 0x7F, null_frag, FP32, load, 4>;
-  def SW : BinaryRX<"sw", 0x6F, null_frag, FP64, load, 8>;
+  def SU : BinaryRX<"su", 0x7F, null_frag, FP32, z_load, 4>;
+  def SW : BinaryRX<"sw", 0x6F, null_frag, FP64, z_load, 8>;
 }
 
 // Multiplication.
@@ -173,55 +173,55 @@ let isCommutable = 1 in {
   def MDR  : BinaryRR <"mdr",  0x2C,   null_frag, FP64,  FP64>;
   def MXR  : BinaryRR <"mxr",  0x26,   null_frag, FP128, FP128>;
 }
-def MEE : BinaryRXE<"mee", 0xED37, null_frag, FP32, load, 4>;
-def MD  : BinaryRX <"md",  0x6C,   null_frag, FP64, load, 8>;
+def MEE : BinaryRXE<"mee", 0xED37, null_frag, FP32, z_load, 4>;
+def MD  : BinaryRX <"md",  0x6C,   null_frag, FP64, z_load, 8>;
 
 // Extending multiplication (f32 x f32 -> f64).
 def MDER : BinaryRR<"mder", 0x3C, null_frag, FP64, FP32>;
-def MDE  : BinaryRX<"mde",  0x7C, null_frag, FP64, load, 4>;
+def MDE  : BinaryRX<"mde",  0x7C, null_frag, FP64, z_load, 4>;
 let isAsmParserOnly = 1 in {
   def MER : BinaryRR<"mer", 0x3C, null_frag, FP64, FP32>;
-  def ME  : BinaryRX<"me",  0x7C, null_frag, FP64, load, 4>;
+  def ME  : BinaryRX<"me",  0x7C, null_frag, FP64, z_load, 4>;
 }
 
 // Extending multiplication (f64 x f64 -> f128).
 def MXDR : BinaryRR<"mxdr", 0x27, null_frag, FP128, FP64>;
-def MXD  : BinaryRX<"mxd",  0x67, null_frag, FP128, load, 8>;
+def MXD  : BinaryRX<"mxd",  0x67, null_frag, FP128, z_load, 8>;
 
 // Fused multiply-add.
 def MAER : TernaryRRD<"maer", 0xB32E, null_frag, FP32, FP32>;
 def MADR : TernaryRRD<"madr", 0xB33E, null_frag, FP64, FP64>;
-def MAE  : TernaryRXF<"mae",  0xED2E, null_frag, FP32, FP32, load, 4>;
-def MAD  : TernaryRXF<"mad",  0xED3E, null_frag, FP64, FP64, load, 8>;
+def MAE  : TernaryRXF<"mae",  0xED2E, null_frag, FP32, FP32, z_load, 4>;
+def MAD  : TernaryRXF<"mad",  0xED3E, null_frag, FP64, FP64, z_load, 8>;
 
 // Fused multiply-subtract.
 def MSER : TernaryRRD<"mser", 0xB32F, null_frag, FP32, FP32>;
 def MSDR : TernaryRRD<"msdr", 0xB33F, null_frag, FP64, FP64>;
-def MSE  : TernaryRXF<"mse",  0xED2F, null_frag, FP32, FP32, load, 4>;
-def MSD  : TernaryRXF<"msd",  0xED3F, null_frag, FP64, FP64, load, 8>;
+def MSE  : TernaryRXF<"mse",  0xED2F, null_frag, FP32, FP32, z_load, 4>;
+def MSD  : TernaryRXF<"msd",  0xED3F, null_frag, FP64, FP64, z_load, 8>;
 
 // Multiplication (unnormalized).
 def MYR  : BinaryRRD<"myr",  0xB33B, null_frag, FP128, FP64>;
 def MYHR : BinaryRRD<"myhr", 0xB33D, null_frag, FP64,  FP64>;
 def MYLR : BinaryRRD<"mylr", 0xB339, null_frag, FP64,  FP64>;
-def MY   : BinaryRXF<"my",   0xED3B, null_frag, FP128, FP64, load, 8>;
-def MYH  : BinaryRXF<"myh",  0xED3D, null_frag, FP64,  FP64, load, 8>;
-def MYL  : BinaryRXF<"myl",  0xED39, null_frag, FP64,  FP64, load, 8>;
+def MY   : BinaryRXF<"my",   0xED3B, null_frag, FP128, FP64, z_load, 8>;
+def MYH  : BinaryRXF<"myh",  0xED3D, null_frag, FP64,  FP64, z_load, 8>;
+def MYL  : BinaryRXF<"myl",  0xED39, null_frag, FP64,  FP64, z_load, 8>;
 
 // Fused multiply-add (unnormalized).
 def MAYR  : TernaryRRD<"mayr",  0xB33A, null_frag, FP128, FP64>;
 def MAYHR : TernaryRRD<"mayhr", 0xB33C, null_frag, FP64,  FP64>;
 def MAYLR : TernaryRRD<"maylr", 0xB338, null_frag, FP64,  FP64>;
-def MAY   : TernaryRXF<"may",   0xED3A, null_frag, FP128, FP64, load, 8>;
-def MAYH  : TernaryRXF<"mayh",  0xED3C, null_frag, FP64,  FP64, load, 8>;
-def MAYL  : TernaryRXF<"mayl",  0xED38, null_frag, FP64,  FP64, load, 8>;
+def MAY   : TernaryRXF<"may",   0xED3A, null_frag, FP128, FP64, z_load, 8>;
+def MAYH  : TernaryRXF<"mayh",  0xED3C, null_frag, FP64,  FP64, z_load, 8>;
+def MAYL  : TernaryRXF<"mayl",  0xED38, null_frag, FP64,  FP64, z_load, 8>;
 
 // Division.
 def DER : BinaryRR <"der", 0x3D,   null_frag, FP32,  FP32>;
 def DDR : BinaryRR <"ddr", 0x2D,   null_frag, FP64,  FP64>;
 def DXR : BinaryRRE<"dxr", 0xB22D, null_frag, FP128, FP128>;
-def DE  : BinaryRX <"de",  0x7D,   null_frag, FP32, load, 4>;
-def DD  : BinaryRX <"dd",  0x6D,   null_frag, FP64, load, 8>;
+def DE  : BinaryRX <"de",  0x7D,   null_frag, FP32, z_load, 4>;
+def DD  : BinaryRX <"dd",  0x6D,   null_frag, FP64, z_load, 8>;
 
 
 //===----------------------------------------------------------------------===//
@@ -233,7 +233,7 @@ let Defs = [CC] in {
   def CDR : CompareRR <"cdr", 0x29,   null_frag, FP64,  FP64>;
   def CXR : CompareRRE<"cxr", 0xB369, null_frag, FP128, FP128>;
 
-  def CE : CompareRX<"ce", 0x79, null_frag, FP32, load, 4>;
-  def CD : CompareRX<"cd", 0x69, null_frag, FP64, load, 8>;
+  def CE : CompareRX<"ce", 0x79, null_frag, FP32, z_load, 4>;
+  def CD : CompareRX<"cd", 0x69, null_frag, FP64, z_load, 8>;
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 53e9bf9a9d1bb..2a6dce863c28f 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -2020,11 +2020,12 @@ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   }
   if (SameVal) {
     int OffsetA = MMOa->getOffset(), OffsetB = MMOb->getOffset();
-    int WidthA = MMOa->getSize(), WidthB = MMOb->getSize();
+    LocationSize WidthA = MMOa->getSize(), WidthB = MMOb->getSize();
     int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
     int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
-    int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
-    if (LowOffset + LowWidth <= HighOffset)
+    LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+    if (LowWidth.hasValue() &&
+        LowOffset + (int)LowWidth.getValue() <= HighOffset)
       return true;
   }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 937e36057a6ed..96ea65b6c3d88 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -306,7 +306,7 @@ let Predicates = [IsTargetXPLINK64] in {
   let mayLoad = 1, AddedComplexity = 20, hasNoSchedulingInfo = 1, Defs = [CC] in {
     def ADA_ENTRY_VALUE : Alias<12, (outs GR64:$Reg), (ins adasym:$addr,
                               ADDR64:$ADA, imm64:$Offset),
-                            [(set i64:$Reg, (load (z_ada_entry
+                            [(set i64:$Reg, (z_load (z_ada_entry
                               iPTR:$addr, iPTR:$ADA, i64:$Offset)))]>;
  }
 }
@@ -468,12 +468,12 @@ let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
 // Register loads.
 let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
   // Expands to L, LY or LFH, depending on the choice of register.
-  def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
+  def LMux : UnaryRXYPseudo<"l", z_load, GRX32, 4>,
              Requires<[FeatureHighWord]>;
-  defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32, 4>;
-  def LFH : UnaryRXY<"lfh", 0xE3CA, load, GRH32, 4>,
+  defm L : UnaryRXPair<"l", 0x58, 0xE358, z_load, GR32, 4>;
+  def LFH : UnaryRXY<"lfh", 0xE3CA, z_load, GRH32, 4>,
             Requires<[FeatureHighWord]>;
-  def LG : UnaryRXY<"lg", 0xE304, load, GR64, 8>;
+  def LG : UnaryRXY<"lg", 0xE304, z_load, GR64, 8>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -483,22 +483,22 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
   }
 }
 let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
-  def LT  : UnaryRXY<"lt",  0xE312, load, GR32, 4>;
-  def LTG : UnaryRXY<"ltg", 0xE302, load, GR64, 8>;
+  def LT  : UnaryRXY<"lt",  0xE312, z_load, GR32, 4>;
+  def LTG : UnaryRXY<"ltg", 0xE302, z_load, GR64, 8>;
 }
 
 let canFoldAsLoad = 1 in {
-  def LRL  : UnaryRILPC<"lrl",  0xC4D, aligned_load, GR32>;
-  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+  def LRL  : UnaryRILPC<"lrl",  0xC4D, aligned_z_load, GR32>;
+  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_z_load, GR64>;
 }
 
 // Load and zero rightmost byte.
 let Predicates = [FeatureLoadAndZeroRightmostByte] in {
   def LZRF : UnaryRXY<"lzrf", 0xE33B, null_frag, GR32, 4>;
   def LZRG : UnaryRXY<"lzrg", 0xE32A, null_frag, GR64, 8>;
-  def : Pat<(and (i32 (load bdxaddr20only:$src)), 0xffffff00),
+  def : Pat<(and (i32 (z_load bdxaddr20only:$src)), 0xffffff00),
             (LZRF bdxaddr20only:$src)>;
-  def : Pat<(and (i64 (load bdxaddr20only:$src)), 0xffffffffffffff00),
+  def : Pat<(and (i64 (z_load bdxaddr20only:$src)), 0xffffffffffffff00),
             (LZRG bdxaddr20only:$src)>;
 }
 
@@ -689,29 +689,29 @@ def : Pat<(sext_inreg GR64:$src, i32),
 
 // 32-bit extensions from 8-bit memory.  LBMux expands to LB or LBH,
 // depending on the choice of register.
-def LBMux : UnaryRXYPseudo<"lb", asextloadi8, GRX32, 1>,
+def LBMux : UnaryRXYPseudo<"lb", z_asextloadi8, GRX32, 1>,
             Requires<[FeatureHighWord]>;
-def LB  : UnaryRXY<"lb", 0xE376, asextloadi8, GR32, 1>;
-def LBH : UnaryRXY<"lbh", 0xE3C0, asextloadi8, GRH32, 1>,
+def LB  : UnaryRXY<"lb", 0xE376, z_asextloadi8, GR32, 1>;
+def LBH : UnaryRXY<"lbh", 0xE3C0, z_asextloadi8, GRH32, 1>,
           Requires<[FeatureHighWord]>;
 
 // 32-bit extensions from 16-bit memory.  LHMux expands to LH or LHH,
 // depending on the choice of register.
-def LHMux : UnaryRXYPseudo<"lh", asextloadi16, GRX32, 2>,
+def LHMux : UnaryRXYPseudo<"lh", z_asextloadi16, GRX32, 2>,
             Requires<[FeatureHighWord]>;
-defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, asextloadi16, GR32, 2>;
-def  LHH  : UnaryRXY<"lhh", 0xE3C4, asextloadi16, GRH32, 2>,
+defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, z_asextloadi16, GR32, 2>;
+def  LHH  : UnaryRXY<"lhh", 0xE3C4, z_asextloadi16, GRH32, 2>,
             Requires<[FeatureHighWord]>;
-def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_asextloadi16, GR32>;
+def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_z_asextloadi16, GR32>;
 
 // 64-bit extensions from memory.
-def LGB   : UnaryRXY<"lgb", 0xE377, asextloadi8,  GR64, 1>;
-def LGH   : UnaryRXY<"lgh", 0xE315, asextloadi16, GR64, 2>;
-def LGF   : UnaryRXY<"lgf", 0xE314, asextloadi32, GR64, 4>;
-def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_asextloadi16, GR64>;
-def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_asextloadi32, GR64>;
+def LGB   : UnaryRXY<"lgb", 0xE377, z_asextloadi8,  GR64, 1>;
+def LGH   : UnaryRXY<"lgh", 0xE315, z_asextloadi16, GR64, 2>;
+def LGF   : UnaryRXY<"lgf", 0xE314, z_asextloadi32, GR64, 4>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_z_asextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_z_asextloadi32, GR64>;
 let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
-  def LTGF : UnaryRXY<"ltgf", 0xE332, asextloadi32, GR64, 4>;
+  def LTGF : UnaryRXY<"ltgf", 0xE332, z_asextloadi32, GR64, 4>;
 
 //===----------------------------------------------------------------------===//
 // Zero extensions
@@ -740,40 +740,40 @@ def : Pat<(and GR64:$src, 0xffffffff),
 
 // 32-bit extensions from 8-bit memory.  LLCMux expands to LLC or LLCH,
 // depending on the choice of register.
-def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
+def LLCMux : UnaryRXYPseudo<"llc", z_azextloadi8, GRX32, 1>,
              Requires<[FeatureHighWord]>;
-def LLC  : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
-def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GRH32, 1>,
+def LLC  : UnaryRXY<"llc", 0xE394, z_azextloadi8, GR32, 1>;
+def LLCH : UnaryRXY<"llch", 0xE3C2, z_azextloadi8, GRH32, 1>,
            Requires<[FeatureHighWord]>;
 
 // 32-bit extensions from 16-bit memory.  LLHMux expands to LLH or LLHH,
 // depending on the choice of register.
-def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
+def LLHMux : UnaryRXYPseudo<"llh", z_azextloadi16, GRX32, 2>,
              Requires<[FeatureHighWord]>;
-def LLH   : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
-def LLHH  : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GRH32, 2>,
+def LLH   : UnaryRXY<"llh", 0xE395, z_azextloadi16, GR32, 2>;
+def LLHH  : UnaryRXY<"llhh", 0xE3C6, z_azextloadi16, GRH32, 2>,
             Requires<[FeatureHighWord]>;
-def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_z_azextloadi16, GR32>;
 
 // 64-bit extensions from memory.
-def LLGC   : UnaryRXY<"llgc", 0xE390, azextloadi8,  GR64, 1>;
-def LLGH   : UnaryRXY<"llgh", 0xE391, azextloadi16, GR64, 2>;
-def LLGF   : UnaryRXY<"llgf", 0xE316, azextloadi32, GR64, 4>;
-def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_azextloadi16, GR64>;
-def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_azextloadi32, GR64>;
+def LLGC   : UnaryRXY<"llgc", 0xE390, z_azextloadi8,  GR64, 1>;
+def LLGH   : UnaryRXY<"llgh", 0xE391, z_azextloadi16, GR64, 2>;
+def LLGF   : UnaryRXY<"llgf", 0xE316, z_azextloadi32, GR64, 4>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_z_azextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_z_azextloadi32, GR64>;
 
 // 31-to-64-bit zero extensions.
 def LLGTR : UnaryRRE<"llgtr", 0xB917, null_frag, GR64, GR64>;
 def LLGT  : UnaryRXY<"llgt",  0xE317, null_frag, GR64, 4>;
 def : Pat<(and GR64:$src, 0x7fffffff),
           (LLGTR GR64:$src)>;
-def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0x7fffffff),
+def : Pat<(and (i64 (z_azextloadi32 bdxaddr20only:$src)), 0x7fffffff),
           (LLGT bdxaddr20only:$src)>;
 
 // Load and zero rightmost byte.
 let Predicates = [FeatureLoadAndZeroRightmostByte] in {
   def LLZRGF : UnaryRXY<"llzrgf", 0xE33A, null_frag, GR64, 4>;
-  def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0xffffff00),
+  def : Pat<(and (i64 (z_azextloadi32 bdxaddr20only:$src)), 0xffffff00),
             (LLZRGF bdxaddr20only:$src)>;
 }
 
@@ -930,14 +930,14 @@ defm : SXU<ineg, LCGFR>;
 //===----------------------------------------------------------------------===//
 
 let isCodeGenOnly = 1 in
-  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, azextloadi8, 1>;
-defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, azextloadi8, 1>;
+  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, z_azextloadi8, 1>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, z_azextloadi8, 1>;
 
-defm : InsertMem<"inserti8", IC32,  GR32, azextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC32,  GR32, z_azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, z_azextloadi8, bdxaddr20pair>;
 
-defm : InsertMem<"inserti8", IC,  GR64, azextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC,  GR64, z_azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, z_azextloadi8, bdxaddr20pair>;
 
 // Insert characters under mask -- not (yet) used for codegen.
 let Defs = [CC] in {
@@ -1015,12 +1015,12 @@ let Defs = [CC], CCValues = 0xF, CCIfNoSignedWrap = 1 in {
   def AGFI : BinaryRIL<"agfi", 0xC28, z_sadd, GR64, imm64sx32>;
 
   // Addition of memory.
-  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, asextloadi16, 2>;
-  defm A   : BinaryRXPairAndPseudo<"a",  0x5A, 0xE35A, z_sadd, GR32, load, 4>;
-  def  AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, asextloadi16, 2>,
+  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, z_asextloadi16, 2>;
+  defm A   : BinaryRXPairAndPseudo<"a",  0x5A, 0xE35A, z_sadd, GR32, z_load, 4>;
+  def  AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, z_asextloadi16, 2>,
              Requires<[FeatureMiscellaneousExtensions2]>;
-  def  AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, asextloadi32, 4>;
-  defm AG  : BinaryRXYAndPseudo<"ag",  0xE308, z_sadd, GR64, load, 8>;
+  def  AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, z_asextloadi32, 4>;
+  defm AG  : BinaryRXYAndPseudo<"ag",  0xE308, z_sadd, GR64, z_load, 8>;
 
   // Addition to memory.
   def ASI  : BinarySIY<"asi",  0xEB6A, add, imm32sx8>;
@@ -1058,9 +1058,9 @@ let Defs = [CC], CCValues = 0xF, IsLogical = 1 in {
               Requires<[FeatureHighWord]>;
 
   // Addition of memory.
-  defm AL   : BinaryRXPairAndPseudo<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>;
-  def  ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, azextloadi32, 4>;
-  defm ALG  : BinaryRXYAndPseudo<"alg",  0xE30A, z_uadd, GR64, load, 8>;
+  defm AL   : BinaryRXPairAndPseudo<"al", 0x5E, 0xE35E, z_uadd, GR32, z_load, 4>;
+  def  ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, z_azextloadi32, 4>;
+  defm ALG  : BinaryRXYAndPseudo<"alg",  0xE30A, z_uadd, GR64, z_load, 8>;
 
   // Addition to memory.
   def ALSI  : BinarySIY<"alsi",  0xEB6E, null_frag, imm32sx8>;
@@ -1075,8 +1075,8 @@ let Defs = [CC], Uses = [CC], CCValues = 0xF, IsLogical = 1 in {
   def ALCGR : BinaryRRE<"alcgr", 0xB988, z_addcarry, GR64, GR64>;
 
   // Addition of memory.
-  def ALC  : BinaryRXY<"alc",  0xE398, z_addcarry, GR32, load, 4>;
-  def ALCG : BinaryRXY<"alcg", 0xE388, z_addcarry, GR64, load, 8>;
+  def ALC  : BinaryRXY<"alc",  0xE398, z_addcarry, GR32, z_load, 4>;
+  def ALCG : BinaryRXY<"alcg", 0xE388, z_addcarry, GR64, z_load, 8>;
 }
 
 // Addition that does not modify the condition code.
@@ -1103,12 +1103,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8,
               Requires<[FeatureHighWord]>;
 
   // Subtraction of memory.
-  defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, asextloadi16, 2>;
-  defm S   : BinaryRXPairAndPseudo<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>;
-  def  SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, asextloadi16, 2>,
+  defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, z_asextloadi16, 2>;
+  defm S   : BinaryRXPairAndPseudo<"s", 0x5B, 0xE35B, z_ssub, GR32, z_load, 4>;
+  def  SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, z_asextloadi16, 2>,
              Requires<[FeatureMiscellaneousExtensions2]>;
-  def  SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, asextloadi32, 4>;
-  defm SG  : BinaryRXYAndPseudo<"sg",  0xE309, z_ssub, GR64, load, 8>;
+  def  SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, z_asextloadi32, 4>;
+  defm SG  : BinaryRXYAndPseudo<"sg",  0xE309, z_ssub, GR64, z_load, 8>;
 }
 defm : SXB<z_ssub, GR64, SGFR>;
 
@@ -1156,9 +1156,9 @@ let Defs = [CC], CCValues = 0x7, IsLogical = 1 in {
   def SLGFI : BinaryRIL<"slgfi", 0xC24, z_usub, GR64, imm64zx32>;
 
   // Subtraction of memory.
-  defm SL   : BinaryRXPairAndPseudo<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>;
-  def  SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, azextloadi32, 4>;
-  defm SLG  : BinaryRXYAndPseudo<"slg",  0xE30B, z_usub, GR64, load, 8>;
+  defm SL   : BinaryRXPairAndPseudo<"sl", 0x5F, 0xE35F, z_usub, GR32, z_load, 4>;
+  def  SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, z_azextloadi32, 4>;
+  defm SLG  : BinaryRXYAndPseudo<"slg",  0xE30B, z_usub, GR64, z_load, 8>;
 }
 defm : ZXB<z_usub, GR64, SLGFR>;
 
@@ -1183,8 +1183,8 @@ let Defs = [CC], Uses = [CC], CCValues = 0xF, IsLogical = 1 in {
   def SLBGR : BinaryRRE<"slbgr", 0xB989, z_subcarry, GR64, GR64>;
 
   // Subtraction of memory.
-  def SLB  : BinaryRXY<"slb",  0xE399, z_subcarry, GR32, load, 4>;
-  def SLBG : BinaryRXY<"slbg", 0xE389, z_subcarry, GR64, load, 8>;
+  def SLB  : BinaryRXY<"slb",  0xE399, z_subcarry, GR32, z_load, 4>;
+  def SLBG : BinaryRXY<"slbg", 0xE389, z_subcarry, GR64, z_load, 8>;
 }
 
 
@@ -1233,8 +1233,8 @@ let Defs = [CC] in {
 
   // ANDs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
-    defm N  : BinaryRXPairAndPseudo<"n", 0x54, 0xE354, and, GR32, load, 4>;
-    defm NG : BinaryRXYAndPseudo<"ng", 0xE380, and, GR64, load, 8>;
+    defm N  : BinaryRXPairAndPseudo<"n", 0x54, 0xE354, and, GR32, z_load, 4>;
+    defm NG : BinaryRXYAndPseudo<"ng", 0xE380, and, GR64, z_load, 8>;
   }
 
   // AND to memory
@@ -1290,8 +1290,8 @@ let Defs = [CC] in {
 
   // ORs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
-    defm O  : BinaryRXPairAndPseudo<"o", 0x56, 0xE356, or, GR32, load, 4>;
-    defm OG : BinaryRXYAndPseudo<"og", 0xE381, or, GR64, load, 8>;
+    defm O  : BinaryRXPairAndPseudo<"o", 0x56, 0xE356, or, GR32, z_load, 4>;
+    defm OG : BinaryRXYAndPseudo<"og", 0xE381, or, GR64, z_load, 8>;
   }
 
   // OR to memory
@@ -1330,8 +1330,8 @@ let Defs = [CC] in {
 
   // XORs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
-    defm X  : BinaryRXPairAndPseudo<"x",0x57, 0xE357, xor, GR32, load, 4>;
-    defm XG : BinaryRXYAndPseudo<"xg", 0xE382, xor, GR64, load, 8>;
+    defm X  : BinaryRXPairAndPseudo<"x",0x57, 0xE357, xor, GR32, z_load, 4>;
+    defm XG : BinaryRXYAndPseudo<"xg", 0xE382, xor, GR64, z_load, 8>;
   }
 
   // XOR to memory
@@ -1411,17 +1411,17 @@ def MSFI  : BinaryRIL<"msfi",  0xC21, mul, GR32, simm32>;
 def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
 
 // Multiplication of memory.
-defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
-defm MS   : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
-def  MGH  : BinaryRXY<"mgh", 0xE33C, mul, GR64, asextloadi16, 2>,
+defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, z_asextloadi16, 2>;
+defm MS   : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, z_load, 4>;
+def  MGH  : BinaryRXY<"mgh", 0xE33C, mul, GR64, z_asextloadi16, 2>,
             Requires<[FeatureMiscellaneousExtensions2]>;
-def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
-def  MSG  : BinaryRXY<"msg",  0xE30C, mul, GR64, load, 8>;
+def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, z_asextloadi32, 4>;
+def  MSG  : BinaryRXY<"msg",  0xE30C, mul, GR64, z_load, 8>;
 
 // Multiplication of memory, setting the condition code.
 let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in {
-  defm MSC  : BinaryRXYAndPseudo<"msc",  0xE353, null_frag, GR32, load, 4>;
-  defm MSGC : BinaryRXYAndPseudo<"msgc", 0xE383, null_frag, GR64, load, 8>;
+  defm MSC  : BinaryRXYAndPseudo<"msc",  0xE353, null_frag, GR32, z_load, 4>;
+  defm MSGC : BinaryRXYAndPseudo<"msgc", 0xE383, null_frag, GR64, z_load, 8>;
 }
 
 // Multiplication of a register, producing two results.
@@ -1437,16 +1437,16 @@ def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2),
           (MLGR (AEXT128 GR64:$src1), GR64:$src2)>;
 
 // Multiplication of memory, producing two results.
-def M   : BinaryRX <"m",   0x5C,   null_frag, GR128, load, 4>;
-def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>;
-def MG  : BinaryRXY<"mg",  0xE384, null_frag, GR128, load, 8>,
+def M   : BinaryRX <"m",   0x5C,   null_frag, GR128, z_load, 4>;
+def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, z_load, 4>;
+def MG  : BinaryRXY<"mg",  0xE384, null_frag, GR128, z_load, 8>,
           Requires<[FeatureMiscellaneousExtensions2]>;
-def ML  : BinaryRXY<"ml",  0xE396, null_frag, GR128, load, 4>;
-def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>;
+def ML  : BinaryRXY<"ml",  0xE396, null_frag, GR128, z_load, 4>;
+def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, z_load, 8>;
 
-def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_smul_lohi GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
           (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
-def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_umul_lohi GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
           (MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
 
 //===----------------------------------------------------------------------===//
@@ -1462,30 +1462,30 @@ let hasSideEffects = 1 in {  // Do not speculatively execute.
   def DLGR  : BinaryRRE<"dlgr",  0xB987, null_frag, GR128, GR64>;
 
   // Division and remainder, from memory.
-  def D    : BinaryRX <"d",    0x5D,   null_frag, GR128, load, 4>;
-  def DSGF : BinaryRXY<"dsgf", 0xE31D, null_frag, GR128, load, 4>;
-  def DSG  : BinaryRXY<"dsg",  0xE30D, null_frag, GR128, load, 8>;
-  def DL   : BinaryRXY<"dl",   0xE397, null_frag, GR128, load, 4>;
-  def DLG  : BinaryRXY<"dlg",  0xE387, null_frag, GR128, load, 8>;
+  def D    : BinaryRX <"d",    0x5D,   null_frag, GR128, z_load, 4>;
+  def DSGF : BinaryRXY<"dsgf", 0xE31D, null_frag, GR128, z_load, 4>;
+  def DSG  : BinaryRXY<"dsg",  0xE30D, null_frag, GR128, z_load, 8>;
+  def DL   : BinaryRXY<"dl",   0xE397, null_frag, GR128, z_load, 4>;
+  def DLG  : BinaryRXY<"dlg",  0xE387, null_frag, GR128, z_load, 8>;
 }
 def : Pat<(z_sdivrem GR64:$src1, GR32:$src2),
           (DSGFR (AEXT128 GR64:$src1), GR32:$src2)>;
-def : Pat<(z_sdivrem GR64:$src1, (i32 (load bdxaddr20only:$src2))),
+def : Pat<(z_sdivrem GR64:$src1, (i32 (z_load bdxaddr20only:$src2))),
           (DSGF (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
 def : Pat<(z_sdivrem GR64:$src1, GR64:$src2),
           (DSGR (AEXT128 GR64:$src1), GR64:$src2)>;
-def : Pat<(z_sdivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_sdivrem GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
           (DSG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
 
 def : Pat<(z_udivrem GR32:$src1, GR32:$src2),
           (DLR (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1,
                                        subreg_l32)), GR32:$src2)>;
-def : Pat<(z_udivrem GR32:$src1, (i32 (load bdxaddr20only:$src2))),
+def : Pat<(z_udivrem GR32:$src1, (i32 (z_load bdxaddr20only:$src2))),
           (DL (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1,
                                       subreg_l32)), bdxaddr20only:$src2)>;
 def : Pat<(z_udivrem GR64:$src1, GR64:$src2),
           (DLGR (ZEXT128 GR64:$src1), GR64:$src2)>;
-def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_udivrem GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
           (DLG (ZEXT128 GR64:$src1), bdxaddr20only:$src2)>;
 
 //===----------------------------------------------------------------------===//
@@ -1591,25 +1591,25 @@ let Defs = [CC], CCValues = 0xE in {
   def CGFI : CompareRIL<"cgfi", 0xC2C, z_scmp, GR64, imm64sx32>;
 
   // Comparison with memory.
-  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, asextloadi16, 2>;
-  def  CMux  : CompareRXYPseudo<z_scmp, GRX32, load, 4>,
+  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, z_asextloadi16, 2>;
+  def  CMux  : CompareRXYPseudo<z_scmp, GRX32, z_load, 4>,
                Requires<[FeatureHighWord]>;
-  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_scmp, GR32, load, 4>;
-  def  CHF   : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, load, 4>,
+  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_scmp, GR32, z_load, 4>;
+  def  CHF   : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, z_load, 4>,
                Requires<[FeatureHighWord]>;
-  def  CGH   : CompareRXY<"cgh", 0xE334, z_scmp, GR64, asextloadi16, 2>;
-  def  CGF   : CompareRXY<"cgf", 0xE330, z_scmp, GR64, asextloadi32, 4>;
-  def  CG    : CompareRXY<"cg",  0xE320, z_scmp, GR64, load, 8>;
-  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_scmp, GR32, aligned_asextloadi16>;
-  def  CRL   : CompareRILPC<"crl",   0xC6D, z_scmp, GR32, aligned_load>;
-  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_asextloadi16>;
-  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_asextloadi32>;
-  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_scmp, GR64, aligned_load>;
+  def  CGH   : CompareRXY<"cgh", 0xE334, z_scmp, GR64, z_asextloadi16, 2>;
+  def  CGF   : CompareRXY<"cgf", 0xE330, z_scmp, GR64, z_asextloadi32, 4>;
+  def  CG    : CompareRXY<"cg",  0xE320, z_scmp, GR64, z_load, 8>;
+  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_scmp, GR32, aligned_z_asextloadi16>;
+  def  CRL   : CompareRILPC<"crl",   0xC6D, z_scmp, GR32, aligned_z_load>;
+  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_z_asextloadi16>;
+  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_z_asextloadi32>;
+  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_scmp, GR64, aligned_z_load>;
 
   // Comparison between memory and a signed 16-bit immediate.
-  def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, asextloadi16, imm32sx16>;
-  def CHSI  : CompareSIL<"chsi",  0xE55C, z_scmp, load, imm32sx16>;
-  def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, load, imm64sx16>;
+  def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, z_asextloadi16, imm32sx16>;
+  def CHSI  : CompareSIL<"chsi",  0xE55C, z_scmp, z_load, imm32sx16>;
+  def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, z_load, imm64sx16>;
 }
 defm : SXB<z_scmp, GR64, CGFR>;
 
@@ -1636,31 +1636,31 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
   def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
 
   // Comparison with memory.
-  def  CLMux  : CompareRXYPseudo<z_ucmp, GRX32, load, 4>,
+  def  CLMux  : CompareRXYPseudo<z_ucmp, GRX32, z_load, 4>,
                 Requires<[FeatureHighWord]>;
-  defm CL     : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load, 4>;
-  def  CLHF   : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, load, 4>,
+  defm CL     : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, z_load, 4>;
+  def  CLHF   : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, z_load, 4>,
                 Requires<[FeatureHighWord]>;
-  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, azextloadi32, 4>;
-  def  CLG    : CompareRXY<"clg",  0xE321, z_ucmp, GR64, load, 8>;
+  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, z_azextloadi32, 4>;
+  def  CLG    : CompareRXY<"clg",  0xE321, z_ucmp, GR64, z_load, 8>;
   def  CLHRL  : CompareRILPC<"clhrl",  0xC67, z_ucmp, GR32,
-                             aligned_azextloadi16>;
+                             aligned_z_azextloadi16>;
   def  CLRL   : CompareRILPC<"clrl",   0xC6F, z_ucmp, GR32,
-                             aligned_load>;
+                             aligned_z_load>;
   def  CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
-                             aligned_azextloadi16>;
+                             aligned_z_azextloadi16>;
   def  CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
-                             aligned_azextloadi32>;
+                             aligned_z_azextloadi32>;
   def  CLGRL  : CompareRILPC<"clgrl",  0xC6A, z_ucmp, GR64,
-                             aligned_load>;
+                             aligned_z_load>;
 
   // Comparison between memory and an unsigned 8-bit immediate.
-  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, azextloadi8, imm32zx8>;
+  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, z_azextloadi8, imm32zx8>;
 
   // Comparison between memory and an unsigned 16-bit immediate.
-  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, azextloadi16, imm32zx16>;
-  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
-  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
+  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, z_azextloadi16, imm32zx16>;
+  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, z_load, imm32zx16>;
+  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, z_load, imm64zx16>;
 }
 defm : ZXB<z_ucmp, GR64, CLGFR>;
 
@@ -1693,7 +1693,7 @@ let Defs = [CC] in {
   def TMHL64 : CompareAliasRI<z_tm_reg, GR64, imm64hl16>;
   def TMHH64 : CompareAliasRI<z_tm_reg, GR64, imm64hh16>;
 
-  defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
+  defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, z_anyextloadi8, imm32zx8>;
 }
 
 def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
@@ -1914,8 +1914,8 @@ let Predicates = [FeatureGuardedStorage], hasSideEffects = 1 in {
 // Decimal arithmetic
 //===----------------------------------------------------------------------===//
 
-defm CVB  : BinaryRXPair<"cvb",0x4F, 0xE306, null_frag, GR32, load, 4>;
-def  CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, load, 8>;
+defm CVB  : BinaryRXPair<"cvb",0x4F, 0xE306, null_frag, GR32, z_load, 4>;
+def  CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, z_load, 8>;
 
 defm CVD  : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>;
 def  CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 799b27d74414d..245e3c3399a98 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -140,8 +140,8 @@ let Predicates = [FeatureVector] in {
   // to use those instructions rather than force a 20-bit displacement
   // into a GPR temporary.
   let mayLoad = 1 in {
-    def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>;
-    def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+    def VL32 : UnaryAliasVRX<z_load, v32sb, bdxaddr12pair>;
+    def VL64 : UnaryAliasVRX<z_load, v64db, bdxaddr12pair>;
   }
 
   // Load logical element and zero.
@@ -198,12 +198,12 @@ multiclass ReplicatePeephole<Instruction vlrep, ValueType vectype,
                       (scalartype (load bdxaddr12only:$addr)))),
             (vlrep bdxaddr12only:$addr)>;
 }
-defm : ReplicatePeephole<VLREPB, v16i8, anyextloadi8, i32>;
-defm : ReplicatePeephole<VLREPH, v8i16, anyextloadi16, i32>;
-defm : ReplicatePeephole<VLREPF, v4i32, load, i32>;
-defm : ReplicatePeephole<VLREPG, v2i64, load, i64>;
-defm : ReplicatePeephole<VLREPF, v4f32, load, f32>;
-defm : ReplicatePeephole<VLREPG, v2f64, load, f64>;
+defm : ReplicatePeephole<VLREPB, v16i8, z_anyextloadi8, i32>;
+defm : ReplicatePeephole<VLREPH, v8i16, z_anyextloadi16, i32>;
+defm : ReplicatePeephole<VLREPF, v4i32, z_load, i32>;
+defm : ReplicatePeephole<VLREPG, v2i64, z_load, i64>;
+defm : ReplicatePeephole<VLREPF, v4f32, z_load, f32>;
+defm : ReplicatePeephole<VLREPG, v2f64, z_load, f64>;
 
 //===----------------------------------------------------------------------===//
 // Stores
@@ -1561,13 +1561,13 @@ let Predicates = [FeatureVector] in {
 
 // Any-extending loads into i128.
 let Predicates = [FeatureVector] in {
-  def : Pat<(i128 (extloadi8 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_extloadi8 bdxaddr12only:$addr)),
             (VLREPB bdxaddr12only:$addr)>;
-  def : Pat<(i128 (extloadi16 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_extloadi16 bdxaddr12only:$addr)),
             (VLREPH bdxaddr12only:$addr)>;
-  def : Pat<(i128 (extloadi32 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_extloadi32 bdxaddr12only:$addr)),
             (VLREPF bdxaddr12only:$addr)>;
-  def : Pat<(i128 (extloadi64 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_extloadi64 bdxaddr12only:$addr)),
             (VLREPG bdxaddr12only:$addr)>;
 }
 
@@ -1621,13 +1621,13 @@ let Predicates = [FeatureVector] in {
 
 // Zero-extending loads into i128.
 let Predicates = [FeatureVector] in {
-  def : Pat<(i128 (zextloadi8 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_zextloadi8 bdxaddr12only:$addr)),
             (VLEB (VGBM 0), bdxaddr12only:$addr, 15)>;
-  def : Pat<(i128 (zextloadi16 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_zextloadi16 bdxaddr12only:$addr)),
             (VLEH (VGBM 0), bdxaddr12only:$addr, 7)>;
-  def : Pat<(i128 (zextloadi32 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_zextloadi32 bdxaddr12only:$addr)),
             (VLEF (VGBM 0), bdxaddr12only:$addr, 3)>;
-  def : Pat<(i128 (zextloadi64 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_zextloadi64 bdxaddr12only:$addr)),
             (VLEG (VGBM 0), bdxaddr12only:$addr, 1)>;
 }
 
@@ -1663,13 +1663,13 @@ let Predicates = [FeatureVector] in {
 
 // Sign-extending loads into i128.
 let Predicates = [FeatureVector] in {
-  def : Pat<(i128 (sextloadi8 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_sextloadi8 bdxaddr12only:$addr)),
             (VSRAB (VLREPB bdxaddr12only:$addr), (VREPIB 120))>;
-  def : Pat<(i128 (sextloadi16 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_sextloadi16 bdxaddr12only:$addr)),
             (VSRAB (VLREPH bdxaddr12only:$addr), (VREPIB 112))>;
-  def : Pat<(i128 (sextloadi32 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_sextloadi32 bdxaddr12only:$addr)),
             (VSRAB (VLREPF bdxaddr12only:$addr), (VREPIB 96))>;
-  def : Pat<(i128 (sextloadi64 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_sextloadi64 bdxaddr12only:$addr)),
             (VSRAB (VLREPG bdxaddr12only:$addr), (VREPIB 64))>;
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index d98bb886c1850..6c4e33a6aa7fa 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -534,37 +534,108 @@ def zext8  : PatFrag<(ops node:$src), (and node:$src, 0xff)>;
 def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>;
 def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
 
-// Extending loads in which the extension type can be signed.
-def asextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
-  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
-  return Type == ISD::EXTLOAD || Type == ISD::SEXTLOAD;
+// Match a load or a non-extending atomic load.
+def z_load : PatFrags<(ops node:$ptr),
+                      [(load node:$ptr),
+                       (atomic_load node:$ptr)], [{
+  if (auto *AL = dyn_cast<AtomicSDNode>(N))
+    if (AL->getExtensionType() != ISD::NON_EXTLOAD)
+      return false;
+  return true;
 }]>;
-def asextloadi8 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+
+// Sign extending (atomic) loads.
+def z_sextload : PatFrags<(ops node:$ptr),
+                          [(unindexedload node:$ptr),
+                           (atomic_load node:$ptr)], [{
+  return getLoadExtType(N) == ISD::SEXTLOAD;
 }]>;
-def asextloadi16 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+def z_sextloadi8 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
 }]>;
-def asextloadi32 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+def z_sextloadi16 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_sextloadi32 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_sextloadi64 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
 }]>;
 
-// Extending loads in which the extension type can be unsigned.
-def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
-  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
-  return Type == ISD::EXTLOAD || Type == ISD::ZEXTLOAD;
+// Zero extending (atomic) loads.
+def z_zextload : PatFrags<(ops node:$ptr),
+                          [(unindexedload node:$ptr),
+                           (atomic_load node:$ptr)], [{
+  return getLoadExtType(N) == ISD::ZEXTLOAD;
 }]>;
-def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+def z_zextloadi8 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
 }]>;
-def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+def z_zextloadi16 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
 }]>;
-def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+def z_zextloadi32 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_zextloadi64 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+// Extending (atomic) loads in which the extension type can be signed.
+def z_asextload : PatFrags<(ops node:$ptr),
+                           [(unindexedload node:$ptr),
+                            (atomic_load node:$ptr)], [{
+  ISD::LoadExtType ETy = getLoadExtType(N);
+  return ETy == ISD::EXTLOAD || ETy == ISD::SEXTLOAD;
+}]>;
+def z_asextloadi8 : PatFrag<(ops node:$ptr), (z_asextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_asextloadi16 : PatFrag<(ops node:$ptr), (z_asextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_asextloadi32 : PatFrag<(ops node:$ptr), (z_asextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
 }]>;
 
-// Extending loads in which the extension type doesn't matter.
+// Extending (atomic) loads in which the extension type can be unsigned.
+def z_azextload : PatFrags<(ops node:$ptr),
+                           [(unindexedload node:$ptr),
+                           (atomic_load node:$ptr)], [{
+  ISD::LoadExtType ETy = getLoadExtType(N);
+  return ETy == ISD::EXTLOAD || ETy == ISD::ZEXTLOAD;
+}]>;
+def z_azextloadi8 : PatFrag<(ops node:$ptr), (z_azextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_azextloadi16 : PatFrag<(ops node:$ptr), (z_azextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_azextloadi32 : PatFrag<(ops node:$ptr), (z_azextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending (atomic) loads in which the extension type doesn't matter.
+def z_anyextload : PatFrags<(ops node:$ptr),
+                            [(unindexedload node:$ptr),
+                             (atomic_load node:$ptr)], [{
+  return getLoadExtType(N) != ISD::NON_EXTLOAD;
+}]>;
+def z_anyextloadi8 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_anyextloadi16 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_anyextloadi32 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_anyextloadi64 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+// Extending non-atomic loads in which the extension type doesn't matter.
 def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
   return cast<LoadSDNode>(N)->getExtensionType() != ISD::NON_EXTLOAD;
 }]>;
@@ -578,15 +649,42 @@ def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
   return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
 }]>;
 
+// Extending (atomic) loads that are not sign/zero extending.
+def z_extload : PatFrags<(ops node:$ptr),
+                         [(extload node:$ptr),
+                          (atomic_load node:$ptr)], [{
+  return getLoadExtType(N) == ISD::EXTLOAD;
+}]>;
+def z_extloadi8 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_extloadi16 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_extloadi32 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_extloadi64 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+// Extending atomic FP loads.
+def z_any_extloadf32 : PatFrags<(ops node:$ptr),
+                                [(any_extloadf32 node:$ptr),
+                                 (any_fpextend (f32 (atomic_load node:$ptr)))]>;
+def z_any_extloadf64 : PatFrags<(ops node:$ptr),
+                                [(any_extloadf64 node:$ptr),
+                                 (any_fpextend (f64 (atomic_load node:$ptr)))]>;
+
 // Aligned loads.
 class AlignedLoad<SDPatternOperator load>
   : PatFrag<(ops node:$addr), (load node:$addr),
   [{ return storeLoadIsAligned(N); }]>;
-def aligned_load         : AlignedLoad<load>;
-def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
-def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
-def aligned_azextloadi16 : AlignedLoad<azextloadi16>;
-def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
+def aligned_z_load         : AlignedLoad<z_load>;
+def aligned_z_asextloadi16 : AlignedLoad<z_asextloadi16>;
+def aligned_z_asextloadi32 : AlignedLoad<z_asextloadi32>;
+def aligned_z_azextloadi16 : AlignedLoad<z_azextloadi16>;
+def aligned_z_azextloadi32 : AlignedLoad<z_azextloadi32>;
 
 // Aligned stores.
 class AlignedStore<SDPatternOperator store>
@@ -749,7 +847,7 @@ def z_any_vround  : PatFrags<(ops node:$src),
 
 // Create a unary operator that loads from memory and then performs
 // the given operation on it.
-class loadu<SDPatternOperator operator, SDPatternOperator load = load>
+class loadu<SDPatternOperator operator, SDPatternOperator load = z_load>
   : PatFrag<(ops node:$addr), (operator (load node:$addr))>;
 
 // Create a store operator that performs the given unary operation
@@ -799,12 +897,12 @@ def imm32nobytes : PatLeaf<(i32 imm), [{
 class z_replicate_load<ValueType scalartype, SDPatternOperator load>
   : PatFrag<(ops node:$addr),
             (z_replicate (scalartype (load node:$addr)))>;
-def z_replicate_loadi8  : z_replicate_load<i32, anyextloadi8>;
-def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>;
-def z_replicate_loadi32 : z_replicate_load<i32, load>;
-def z_replicate_loadi64 : z_replicate_load<i64, load>;
-def z_replicate_loadf32 : z_replicate_load<f32, load>;
-def z_replicate_loadf64 : z_replicate_load<f64, load>;
+def z_replicate_loadi8  : z_replicate_load<i32, z_anyextloadi8>;
+def z_replicate_loadi16 : z_replicate_load<i32, z_anyextloadi16>;
+def z_replicate_loadi32 : z_replicate_load<i32, z_load>;
+def z_replicate_loadi64 : z_replicate_load<i64, z_load>;
+def z_replicate_loadf32 : z_replicate_load<f32, z_load>;
+def z_replicate_loadf64 : z_replicate_load<f64, z_load>;
 // Byte-swapped replicated vector element loads.
 def z_replicate_loadbswapi16 : z_replicate_load<i32, z_loadbswap16>;
 def z_replicate_loadbswapi32 : z_replicate_load<i32, z_loadbswap32>;
@@ -815,12 +913,12 @@ class z_vle<ValueType scalartype, SDPatternOperator load>
   : PatFrag<(ops node:$vec, node:$addr, node:$index),
             (z_vector_insert node:$vec, (scalartype (load node:$addr)),
                              node:$index)>;
-def z_vlei8  : z_vle<i32, anyextloadi8>;
-def z_vlei16 : z_vle<i32, anyextloadi16>;
-def z_vlei32 : z_vle<i32, load>;
-def z_vlei64 : z_vle<i64, load>;
-def z_vlef32 : z_vle<f32, load>;
-def z_vlef64 : z_vle<f64, load>;
+def z_vlei8  : z_vle<i32, z_anyextloadi8>;
+def z_vlei16 : z_vle<i32, z_anyextloadi16>;
+def z_vlei32 : z_vle<i32, z_load>;
+def z_vlei64 : z_vle<i64, z_load>;
+def z_vlef32 : z_vle<f32, z_load>;
+def z_vlef64 : z_vle<f64, z_load>;
 // Byte-swapped vector element loads.
 def z_vlebri16 : z_vle<i32, z_loadbswap16>;
 def z_vlebri32 : z_vle<i32, z_loadbswap32>;
@@ -832,13 +930,13 @@ class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
   : PatFrag<(ops node:$addr),
             (z_vector_insert immAllZerosV,
                              (scalartype (load node:$addr)), (i32 index))>;
-def z_vllezi8  : z_vllez<i32, anyextloadi8, 7>;
-def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
-def z_vllezi32 : z_vllez<i32, load, 1>;
+def z_vllezi8  : z_vllez<i32, z_anyextloadi8, 7>;
+def z_vllezi16 : z_vllez<i32, z_anyextloadi16, 3>;
+def z_vllezi32 : z_vllez<i32, z_load, 1>;
 def z_vllezi64 : PatFrags<(ops node:$addr),
                           [(z_vector_insert immAllZerosV,
-                                            (i64 (load node:$addr)), (i32 0)),
-                           (z_join_dwords (i64 (load node:$addr)), (i64 0))]>;
+                                            (i64 (z_load node:$addr)), (i32 0)),
+                           (z_join_dwords (i64 (z_load node:$addr)), (i64 0))]>;
 // We use high merges to form a v4f32 from four f32s.  Propagating zero
 // into all elements but index 1 gives this expression.
 def z_vllezf32 : PatFrag<(ops node:$addr),
@@ -848,23 +946,23 @@ def z_vllezf32 : PatFrag<(ops node:$addr),
                             (v4i32
                              (bitconvert
                               (v4f32 (scalar_to_vector
-                                      (f32 (load node:$addr)))))))),
+                                      (f32 (z_load node:$addr)))))))),
                           (v2i64
                            (bitconvert (v4f32 immAllZerosV))))>;
 def z_vllezf64 : PatFrag<(ops node:$addr),
                          (z_merge_high
-                          (v2f64 (scalar_to_vector (f64 (load node:$addr)))),
+                          (v2f64 (scalar_to_vector (f64 (z_load node:$addr)))),
                           immAllZerosV)>;
 
 // Similarly for the high element of a zeroed vector.
-def z_vllezli32 : z_vllez<i32, load, 0>;
+def z_vllezli32 : z_vllez<i32, z_load, 0>;
 def z_vllezlf32 : PatFrag<(ops node:$addr),
                           (z_merge_high
                            (v2i64
                             (bitconvert
                              (z_merge_high
                               (v4f32 (scalar_to_vector
-                                      (f32 (load node:$addr)))),
+                                      (f32 (z_load node:$addr)))),
                               (v4f32 immAllZerosV)))),
                            (v2i64
                             (bitconvert (v4f32 immAllZerosV))))>;
diff --git a/llvm/lib/Target/SystemZ/SystemZPatterns.td b/llvm/lib/Target/SystemZ/SystemZPatterns.td
index 5e5dca77e9553..4d6bc68e9a7ed 100644
--- a/llvm/lib/Target/SystemZ/SystemZPatterns.td
+++ b/llvm/lib/Target/SystemZ/SystemZPatterns.td
@@ -49,8 +49,8 @@ class RMWI<SDPatternOperator load, SDPatternOperator operator,
 // memory location.  IMM is the type of the second operand.
 multiclass RMWIByte<SDPatternOperator operator, AddressingMode mode,
                     Instruction insn> {
-  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
-  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
+  def : RMWI<z_anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
+  def : RMWI<z_anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
 }
 
 // Record that INSN performs insertion TYPE into a register of class CLS.
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index 1e548d7c101a7..cbad5a0eafb27 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -1785,14 +1785,14 @@ defm : TRUNC64m<truncstorei8, ST1Brri, ST1Brii, ST1Bzri, ST1Bzii>;
 defm : TRUNC64m<truncstorei16, ST2Brri, ST2Brii, ST2Bzri, ST2Bzii>;
 defm : TRUNC64m<truncstorei32, STLrri, STLrii, STLzri, ST1Bzii>;
 
-// Atomic loads
+// Atomic loads (FIXME: replace iAny with the correct integer VT:)
 multiclass ATMLDm<SDPatternOperator from,
                   RM torri, RM torii,
                   RM tozri, RM tozii> {
-  def : Pat<(from ADDRrri:$addr), (torri MEMrri:$addr)>;
-  def : Pat<(from ADDRrii:$addr), (torii MEMrii:$addr)>;
-  def : Pat<(from ADDRzri:$addr), (tozri MEMzri:$addr)>;
-  def : Pat<(from ADDRzii:$addr), (tozii MEMzii:$addr)>;
+  def : Pat<(iAny (from ADDRrri:$addr)), (torri MEMrri:$addr)>;
+  def : Pat<(iAny (from ADDRrii:$addr)), (torii MEMrii:$addr)>;
+  def : Pat<(iAny (from ADDRzri:$addr)), (tozri MEMzri:$addr)>;
+  def : Pat<(iAny (from ADDRzii:$addr)), (tozii MEMzii:$addr)>;
 }
 defm : ATMLDm<atomic_load_8, LD1BZXrri, LD1BZXrii, LD1BZXzri, LD1BZXzii>;
 defm : ATMLDm<atomic_load_16, LD2BZXrri, LD2BZXrii, LD2BZXzri, LD2BZXzii>;
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
index 86fb99cc98a90..fac2e0d935f5a 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -63,7 +63,7 @@ wasm::ValType WebAssembly::toValType(MVT Type) {
 }
 
 void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
-                                    const ArrayRef<MVT> &VTs) {
+                                    ArrayRef<MVT> VTs) {
   assert(!Sym->getType());
 
   // Tables are represented as Arrays in LLVM IR therefore
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
index a184ecb9a465a..87660947e7de1 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
@@ -60,7 +60,7 @@ wasm::ValType toValType(MVT Type);
 
 /// Sets a Wasm Symbol Type.
 void wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
-                       const ArrayRef<MVT> &VTs);
+                       ArrayRef<MVT> VTs);
 
 } // end namespace WebAssembly
 } // end namespace llvm
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index fb949d4b19a3e..03897b551ecae 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -264,7 +264,7 @@ MCSymbol *WebAssemblyAsmPrinter::getOrCreateWasmSymbol(StringRef Name) {
     Params.push_back(AddrType);
   } else { // Function symbols
     WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
-    getLibcallSignature(Subtarget, Name, Returns, Params);
+    WebAssembly::getLibcallSignature(Subtarget, Name, Returns, Params);
   }
   auto Signature = std::make_unique<wasm::WasmSignature>(std::move(Returns),
                                                          std::move(Params));
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 77e6640d5a822..0427fe473f8e1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -1283,9 +1283,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
   DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram());
   SplitBlock(Entry, &*Entry->getFirstInsertionPt());
 
-  BinaryOperator *SetjmpTableSize =
-      BinaryOperator::Create(Instruction::Add, IRB.getInt32(4), IRB.getInt32(0),
-                             "setjmpTableSize", Entry->getTerminator());
+  BinaryOperator *SetjmpTableSize = BinaryOperator::Create(
+      Instruction::Add, IRB.getInt32(4), IRB.getInt32(0), "setjmpTableSize",
+      Entry->getTerminator()->getIterator());
   SetjmpTableSize->setDebugLoc(FirstDL);
   // setjmpTable = (int *) malloc(40);
   Type *IntPtrTy = getAddrIntType(&M);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
index e0a2192112285..2594430d1d8f3 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
@@ -73,7 +73,7 @@ bool WebAssemblyLowerRefTypesIntPtrConv::runOnFunction(Function &F) {
 
     Function *TrapIntrin =
         Intrinsic::getDeclaration(F.getParent(), Intrinsic::debugtrap);
-    CallInst::Create(TrapIntrin, {}, "", &*I);
+    CallInst::Create(TrapIntrin, {}, "", I->getIterator());
 
     worklist.insert(&*I);
   }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 1e959111a4dbc..d17394eede772 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -105,7 +105,7 @@ void llvm::computeSignatureVTs(const FunctionType *Ty,
   }
 }
 
-void llvm::valTypesFromMVTs(const ArrayRef<MVT> &In,
+void llvm::valTypesFromMVTs(ArrayRef<MVT> In,
                             SmallVectorImpl<wasm::ValType> &Out) {
   for (MVT Ty : In)
     Out.push_back(WebAssembly::toValType(Ty));
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index fe18347ad8c1d..37059188a7614 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -169,8 +169,7 @@ void computeSignatureVTs(const FunctionType *Ty, const Function *TargetFunc,
                          SmallVectorImpl<MVT> &Params,
                          SmallVectorImpl<MVT> &Results);
 
-void valTypesFromMVTs(const ArrayRef<MVT> &In,
-                      SmallVectorImpl<wasm::ValType> &Out);
+void valTypesFromMVTs(ArrayRef<MVT> In, SmallVectorImpl<wasm::ValType> &Out);
 
 std::unique_ptr<wasm::WasmSignature>
 signatureFromMVTs(const SmallVectorImpl<MVT> &Results,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index c9ef17f928144..acd984f2f8f88 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -129,7 +129,7 @@ buildVRegToDbgValueMap(MachineFunction &MF, const LiveIntervals *Liveness) {
 // changes.
 static void undefInvalidDbgValues(
     const LiveIntervals *Liveness,
-    const ArrayRef<SmallVector<LiveInterval *, 4>> &Assignments,
+    ArrayRef<SmallVector<LiveInterval *, 4>> Assignments,
     DenseMap<Register, std::vector<std::pair<SlotIndex, MachineInstr *>>>
         &DbgVRegToValues) {
 #ifndef NDEBUG
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 3e2e029695ab6..1896ac631d96e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -524,10 +524,10 @@ struct StaticLibcallNameMap {
 
 } // end anonymous namespace
 
-void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
-                               RTLIB::Libcall LC,
-                               SmallVectorImpl<wasm::ValType> &Rets,
-                               SmallVectorImpl<wasm::ValType> &Params) {
+void WebAssembly::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
+                                      RTLIB::Libcall LC,
+                                      SmallVectorImpl<wasm::ValType> &Rets,
+                                      SmallVectorImpl<wasm::ValType> &Params) {
   assert(Rets.empty());
   assert(Params.empty());
 
@@ -889,10 +889,10 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
 
 // TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unused
 // other than here, just roll its logic into this version.
-void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
-                               StringRef Name,
-                               SmallVectorImpl<wasm::ValType> &Rets,
-                               SmallVectorImpl<wasm::ValType> &Params) {
+void WebAssembly::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
+                                      StringRef Name,
+                                      SmallVectorImpl<wasm::ValType> &Rets,
+                                      SmallVectorImpl<wasm::ValType> &Params) {
   static StaticLibcallNameMap LibcallNameMap;
   auto &Map = LibcallNameMap.Map;
   auto Val = Map.find(Name);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
index f7a94aa20bd46..ff6515d5bf4e6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -22,16 +22,18 @@ namespace llvm {
 
 class WebAssemblySubtarget;
 
-extern void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
-                                RTLIB::Libcall LC,
-                                SmallVectorImpl<wasm::ValType> &Rets,
-                                SmallVectorImpl<wasm::ValType> &Params);
+namespace WebAssembly {
 
-extern void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
-                                StringRef Name,
-                                SmallVectorImpl<wasm::ValType> &Rets,
-                                SmallVectorImpl<wasm::ValType> &Params);
+void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
+                         RTLIB::Libcall LC,
+                         SmallVectorImpl<wasm::ValType> &Rets,
+                         SmallVectorImpl<wasm::ValType> &Params);
 
+void getLibcallSignature(const WebAssemblySubtarget &Subtarget, StringRef Name,
+                         SmallVectorImpl<wasm::ValType> &Rets,
+                         SmallVectorImpl<wasm::ValType> &Params);
+
+} // end namespace WebAssembly
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e3701d69934c0..6401df9f49f03 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -4068,7 +4068,7 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
 
   if (UseApxExtendedReg && !X86II::canUseApxExtendedReg(MCID))
     return Match_Unsupported;
-  if (ForcedNoFlag != !!(MCID.TSFlags & X86II::EVEX_NF))
+  if (ForcedNoFlag == !(MCID.TSFlags & X86II::EVEX_NF) && !X86::isCFCMOVCC(Opc))
     return Match_Unsupported;
 
   if (ForcedVEXEncoding == VEXEncoding_EVEX &&
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 65b8ebc0b9b99..c0a75e215a40b 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -545,6 +545,14 @@ enum : uint64_t {
   /// PrefixByte - This form is used for instructions that represent a prefix
   /// byte like data16 or rep.
   PrefixByte = 10,
+  /// MRMDestRegCC - This form is used for the cfcmov instructions, which use
+  /// the Mod/RM byte to specify the operands reg(r/m) and reg(reg) and also
+  /// encodes a condition code.
+  MRMDestRegCC = 18,
+  /// MRMDestMemCC - This form is used for the cfcmov instructions, which use
+  /// the Mod/RM byte to specify the operands mem(r/m) and reg(reg) and also
+  /// encodes a condition code.
+  MRMDestMemCC = 19,
   /// MRMDestMem4VOp3CC - This form is used for instructions that use the Mod/RM
   /// byte to specify a destination which in this case is memory and operand 3
   /// with VEX.VVVV, and also encodes a condition code.
@@ -1032,6 +1040,7 @@ inline int getMemoryOperandNo(uint64_t TSFlags) {
     return -1;
   case X86II::MRMDestMem:
   case X86II::MRMDestMemFSIB:
+  case X86II::MRMDestMemCC:
     return hasNewDataDest(TSFlags);
   case X86II::MRMSrcMem:
   case X86II::MRMSrcMemFSIB:
@@ -1045,11 +1054,13 @@ inline int getMemoryOperandNo(uint64_t TSFlags) {
     // Skip registers encoded in reg, VEX_VVVV, and I8IMM.
     return 3;
   case X86II::MRMSrcMemCC:
+    return 1 + hasNewDataDest(TSFlags);
   case X86II::MRMDestMem4VOp3CC:
     // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
     // mask register.
     return 1;
   case X86II::MRMDestReg:
+  case X86II::MRMDestRegCC:
   case X86II::MRMSrcReg:
   case X86II::MRMSrcReg4VOp3:
   case X86II::MRMSrcRegOp4:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 29a1866bf01ab..21c1556d1d8ed 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -394,7 +394,7 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O,
   else if (Flags & X86::IP_HAS_REPEAT)
     O << "\trep\t";
 
-  if (TSFlags & X86II::EVEX_NF)
+  if (TSFlags & X86II::EVEX_NF && !X86::isCFCMOVCC(MI->getOpcode()))
     O << "\t{nf}";
 
   // These all require a pseudo prefix
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index dcd0551b7b765..92a14226a0dc0 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1071,6 +1071,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
   case X86II::MRM_C0:
   case X86II::RawFrm:
     break;
+  case X86II::MRMDestMemCC:
   case X86II::MRMDestMemFSIB:
   case X86II::MRMDestMem: {
     // MRMDestMem instructions forms:
@@ -1102,6 +1103,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
     }
     break;
   }
+  case X86II::MRMSrcMemCC:
   case X86II::MRMSrcMemFSIB:
   case X86II::MRMSrcMem: {
     // MRMSrcMem instructions forms:
@@ -1180,6 +1182,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
     }
     break;
   }
+  case X86II::MRMSrcRegCC:
   case X86II::MRMSrcReg: {
     // MRMSrcReg instructions forms:
     //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
@@ -1242,6 +1245,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
     ++CurOp;
     break;
   }
+  case X86II::MRMDestRegCC:
   case X86II::MRMDestReg: {
     // MRMDestReg instructions forms:
     //  dst(ModR/M), src(ModR/M)
@@ -1638,6 +1642,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
     CurOp = SrcRegNum + 1;
     break;
   }
+  case X86II::MRMDestRegCC: {
+    unsigned FirstOp = CurOp++;
+    unsigned SecondOp = CurOp++;
+    unsigned CC = MI.getOperand(CurOp++).getImm();
+    emitByte(BaseOpcode + CC, CB);
+    emitRegModRMByte(MI.getOperand(FirstOp),
+                     getX86RegNum(MI.getOperand(SecondOp)), CB);
+    break;
+  }
   case X86II::MRMDestMem4VOp3CC: {
     unsigned CC = MI.getOperand(8).getImm();
     emitByte(BaseOpcode + CC, CB);
@@ -1667,6 +1680,16 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
     CurOp = SrcRegNum + 1;
     break;
   }
+  case X86II::MRMDestMemCC: {
+    unsigned MemOp = CurOp;
+    CurOp = MemOp + X86::AddrNumOperands;
+    unsigned RegOp = CurOp++;
+    unsigned CC = MI.getOperand(CurOp++).getImm();
+    emitByte(BaseOpcode + CC, CB);
+    emitMemModRMByte(MI, MemOp, getX86RegNum(MI.getOperand(RegOp)), TSFlags,
+                     Kind, StartByte, CB, Fixups, STI);
+    break;
+  }
   case X86II::MRMSrcReg: {
     emitByte(BaseOpcode, CB);
     unsigned SrcRegNum = CurOp + 1;
@@ -1717,6 +1740,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
     break;
   }
   case X86II::MRMSrcRegCC: {
+    if (IsND) // Skip new data destination
+      ++CurOp;
     unsigned FirstOp = CurOp++;
     unsigned SecondOp = CurOp++;
 
@@ -1778,6 +1803,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
     break;
   }
   case X86II::MRMSrcMemCC: {
+    if (IsND) // Skip new data destination
+      ++CurOp;
     unsigned RegOp = CurOp++;
     unsigned FirstMemOp = CurOp;
     CurOp = FirstMemOp + X86::AddrNumOperands;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8367f938c0ddf..78bc043911f2f 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -683,6 +683,12 @@ def TuningFastGather
     : SubtargetFeature<"fast-gather", "HasFastGather", "true",
                        "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
 
+// Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
+def TuningFastDPWSSD
+    : SubtargetFeature<
+          "fast-dpwssd", "HasFastDPWSSD", "true",
+          "Prefer vpdpwssd instruction over vpmaddwd+vpaddd instruction sequence">;
+
 def TuningPreferNoGather
     : SubtargetFeature<"prefer-no-gather", "PreferGather", "false",
                        "Prefer no gather instructions">;
@@ -1502,7 +1508,11 @@ def ProcessorFeatures {
     !listconcat(ZN2Tuning, ZN3AdditionalTuning);
   list<SubtargetFeature> ZN3Features =
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
-  list<SubtargetFeature> ZN4Tuning = ZN3Tuning;
+
+
+  list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD];
+  list<SubtargetFeature> ZN4Tuning =
+    !listconcat(ZN3Tuning, ZN4AdditionalTuning);
   list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
                                                   FeatureEVEX512,
                                                   FeatureCDI,
diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 04931afdec51c..a72eeb53915d6 100644
--- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -521,8 +521,8 @@ bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
     return true;
 
   int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
-  int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
-  int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
+  int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
+  int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;
 
   return !AA->isNoAlias(
       MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
@@ -688,7 +688,7 @@ bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
           !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
         continue;
       int64_t PBstDispImm = getDispOperand(PBInst).getImm();
-      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
+      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
       // This check doesn't cover all cases, but it will suffice for now.
       // TODO: take branch probability into consideration, if the blocking
       // store is in an unreached block, breaking the memcopy could lose
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index 6ef28b373cc93..453754381034e 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -48,7 +48,7 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &) const {
 
 Error X86TargetMachine::buildCodeGenPipeline(
     ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
-    CodeGenFileType FileType, CGPassBuilderOption Opt,
+    CodeGenFileType FileType, const CGPassBuilderOption &Opt,
     PassInstrumentationCallbacks *PIC) {
   auto CGPB = X86CodeGenPassBuilder(*this, Opt, PIC);
   return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 48d3b68b1823a..2eae155956368 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -2133,7 +2133,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
     return false;
 
   const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
-  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8);
+  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC) / 8, false,
+                                    Subtarget->hasNDD());
   Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
   updateValueMap(I, ResultReg);
   return true;
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index af25b34fbab99..d96613d7bb7ef 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -604,7 +604,8 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
         }
 
         // Otherwise we can just rewrite in-place.
-        if (X86::getCondFromCMov(MI) != X86::COND_INVALID) {
+        if (X86::getCondFromCMov(MI) != X86::COND_INVALID ||
+            X86::getCondFromCFCMov(MI) != X86::COND_INVALID) {
           rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
         } else if (getCondFromFCMOV(MI.getOpcode()) != X86::COND_INVALID) {
           rewriteFCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
@@ -829,7 +830,9 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
                                            MachineOperand &FlagUse,
                                            CondRegArray &CondRegs) {
   // First get the register containing this specific condition.
-  X86::CondCode Cond = X86::getCondFromCMov(CMovI);
+  X86::CondCode Cond = X86::getCondFromCMov(CMovI) == X86::COND_INVALID
+                           ? X86::getCondFromCFCMov(CMovI)
+                           : X86::getCondFromCMov(CMovI);
   unsigned CondReg;
   bool Inverted;
   std::tie(CondReg, Inverted) =
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 76c6c1645239a..4e4241efd63d6 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3090,13 +3090,19 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
                                         SDValue &Scale, SDValue &Index,
                                         SDValue &Disp, SDValue &Segment) {
-  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
-  auto *GA = cast<GlobalAddressSDNode>(N);
+  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
+         N.getOpcode() == ISD::TargetExternalSymbol);
 
   X86ISelAddressMode AM;
-  AM.GV = GA->getGlobal();
-  AM.Disp += GA->getOffset();
-  AM.SymbolFlags = GA->getTargetFlags();
+  if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
+    AM.GV = GA->getGlobal();
+    AM.Disp += GA->getOffset();
+    AM.SymbolFlags = GA->getTargetFlags();
+  } else {
+    auto *SA = cast<ExternalSymbolSDNode>(N);
+    AM.ES = SA->getSymbol();
+    AM.SymbolFlags = SA->getTargetFlags();
+  }
 
   if (Subtarget->is32Bit()) {
     AM.Scale = 1;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2b5e3c0379a13..8a6594230f0a2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18592,13 +18592,22 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   SDLoc dl(GA);
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                           GA->getValueType(0),
-                                           GA->getOffset(),
-                                           OperandFlags);
+  SDValue TGA;
+  bool UseTLSDESC = DAG.getTarget().useTLSDESC();
+  if (LocalDynamic && UseTLSDESC) {
+    TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
+    auto UI = TGA->use_begin();
+    // Reuse existing GetTLSADDR node if we can find it.
+    if (UI != TGA->use_end())
+      return SDValue(*UI->use_begin()->use_begin(), 0);
+  } else {
+    TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                     GA->getOffset(), OperandFlags);
+  }
 
-  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
-                                           : X86ISD::TLSADDR;
+  X86ISD::NodeType CallType = UseTLSDESC     ? X86ISD::TLSDESC
+                              : LocalDynamic ? X86ISD::TLSBASEADDR
+                                             : X86ISD::TLSADDR;
 
   if (InGlue) {
     SDValue Ops[] = { Chain,  TGA, *InGlue };
@@ -18613,7 +18622,19 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   MFI.setHasCalls(true);
 
   SDValue Glue = Chain.getValue(1);
-  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
+  SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
+
+  if (!UseTLSDESC)
+    return Ret;
+
+  const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
+  unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
+
+  Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
+  SDValue Offset =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
+                  MachinePointerInfo(Ptr));
+  return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
 }
 
 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
@@ -33426,6 +33447,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(TLSADDR)
   NODE_NAME_CASE(TLSBASEADDR)
   NODE_NAME_CASE(TLSCALL)
+  NODE_NAME_CASE(TLSDESC)
   NODE_NAME_CASE(EH_SJLJ_SETJMP)
   NODE_NAME_CASE(EH_SJLJ_LONGJMP)
   NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
@@ -35206,6 +35228,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
   MachineFunction &MF = *BB->getParent();
 
   // Emit CALLSEQ_START right before the instruction.
+  BB->getParent()->getFrameInfo().setAdjustsStack(true);
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   MachineInstrBuilder CallseqStart =
       BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
@@ -36206,6 +36229,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::TLS_base_addr32:
   case X86::TLS_base_addr64:
   case X86::TLS_base_addrX32:
+  case X86::TLS_desc32:
+  case X86::TLS_desc64:
     return EmitLoweredTLSAddr(MI, BB);
   case X86::INDIRECT_THUNK_CALL32:
   case X86::INDIRECT_THUNK_CALL64:
@@ -42641,7 +42666,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
     bool PoisonOnly, unsigned Depth) const {
-  unsigned EltsBits = Op.getScalarValueSizeInBits();
   unsigned NumElts = DemandedElts.getBitWidth();
 
   // TODO: Add more target shuffles.
@@ -42649,15 +42673,28 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
   case X86ISD::PSHUFD:
   case X86ISD::VPERMILPI: {
     SmallVector<int, 8> Mask;
-    DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
-
-    APInt DemandedSrcElts = APInt::getZero(NumElts);
-    for (unsigned I = 0; I != NumElts; ++I)
-      if (DemandedElts[I])
-        DemandedSrcElts.setBit(Mask[I]);
-
-    return DAG.isGuaranteedNotToBeUndefOrPoison(
-        Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
+    SmallVector<SDValue, 2> Ops;
+    if (getTargetShuffleMask(Op.getNode(), Op.getSimpleValueType(), true, Ops,
+                             Mask)) {
+      SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
+                                            APInt::getZero(NumElts));
+      for (auto M : enumerate(Mask)) {
+        if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
+          continue;
+        if (M.value() == SM_SentinelUndef)
+          return false;
+        assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
+               "Shuffle mask index out of range");
+        DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
+      }
+      for (auto Op : enumerate(Ops))
+        if (!DemandedSrcElts[Op.index()].isZero() &&
+            !DAG.isGuaranteedNotToBeUndefOrPoison(
+                Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
+          return false;
+      return true;
+    }
+    break;
   }
   }
   return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
@@ -47351,6 +47388,16 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
     return V;
 
+  APInt ShiftAmt;
+  if (supportedVectorVarShift(VT, Subtarget, ISD::SRA) &&
+      N1.getOpcode() == ISD::UMIN &&
+      ISD::isConstantSplatVector(N1.getOperand(1).getNode(), ShiftAmt) &&
+      ShiftAmt == VT.getScalarSizeInBits() - 1) {
+    SDValue ShrAmtVal = N1.getOperand(0);
+    SDLoc DL(N);
+    return DAG.getNode(X86ISD::VSRAV, DL, N->getVTList(), N0, ShrAmtVal);
+  }
+
   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index fe1943b576084..0a1e8ca442731 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -295,6 +295,10 @@ namespace llvm {
     // thunk at the address from an earlier relocation.
     TLSCALL,
 
+    // Thread Local Storage. A descriptor containing pointer to
+    // code and to argument to get the TLS offset for the symbol.
+    TLSDESC,
+
     // Exception Handling helpers.
     EH_RETURN,
 
diff --git a/llvm/lib/Target/X86/X86InstrAsmAlias.td b/llvm/lib/Target/X86/X86InstrAsmAlias.td
index fab058dc390bb..6b15213a2e683 100644
--- a/llvm/lib/Target/X86/X86InstrAsmAlias.td
+++ b/llvm/lib/Target/X86/X86InstrAsmAlias.td
@@ -402,6 +402,12 @@ defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
 // No size suffix for intel-style asm.
 defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
 
+// Aliases for cfcmov<CC>{w,l,q}
+defm : IntegerCondCodeMnemonicAlias<"cfcmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cfcmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cfcmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cfcmov", "", "intel">;
 //===----------------------------------------------------------------------===//
 // Assembler Instruction Aliases
 //===----------------------------------------------------------------------===//
@@ -768,6 +774,20 @@ multiclass CMOV_SETCC_Aliases<string Cond, int CC> {
                   (CMOV64rr GR64:$dst, GR64:$src, CC), 0>;
   def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
                   (CMOV64rm GR64:$dst, i64mem:$src, CC), 0>;
+let Predicates = [In64BitMode] in {
+  def : InstAlias<"cmov"#Cond#"{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV16rr_ND GR16:$dst, GR16:$src1, GR16:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV16rm_ND GR16:$dst, GR16:$src1, i16mem:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV32rr_ND GR32:$dst, GR32:$src1, GR32:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV32rm_ND GR32:$dst, GR32:$src1, i32mem:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV64rr_ND GR64:$dst, GR64:$src1, GR64:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV64rm_ND GR64:$dst, GR64:$src1, i64mem:$src2, CC), 0>;
+}
 
   def : InstAlias<"set"#Cond#"\t$dst", (SETCCr GR8:$dst, CC), 0>;
   def : InstAlias<"set"#Cond#"\t$dst", (SETCCm i8mem:$dst, CC), 0>;
@@ -790,6 +810,57 @@ defm : CMOV_SETCC_Aliases<"ge", 13>;
 defm : CMOV_SETCC_Aliases<"le", 14>;
 defm : CMOV_SETCC_Aliases<"g" , 15>;
 
+multiclass CFCMOV_Aliases<string Cond, int CC> {
+let Predicates = [In64BitMode] in {
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV16rr GR16:$dst, GR16:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV32rr GR32:$dst, GR32:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV64rr GR64:$dst, GR64:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV16rm GR16:$dst, i16mem:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV32rm GR32:$dst, i32mem:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV64rm GR64:$dst, i64mem:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV16mr i16mem:$dst, GR16:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV32mr i32mem:$dst, GR32:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV64mr i64mem:$dst, GR64:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV16rr_ND GR16:$dst, GR16:$src1, GR16:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV32rr_ND GR32:$dst, GR32:$src1, GR32:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV64rr_ND GR64:$dst, GR64:$src1, GR64:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV16rm_ND GR16:$dst, GR16:$src1, i16mem:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV32rm_ND GR32:$dst, GR32:$src1, i32mem:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV64rm_ND GR64:$dst, GR64:$src1, i64mem:$src2, CC), 0>;
+}
+}
+defm : CFCMOV_Aliases<"o" ,  0>;
+defm : CFCMOV_Aliases<"no",  1>;
+defm : CFCMOV_Aliases<"b" ,  2>;
+defm : CFCMOV_Aliases<"ae",  3>;
+defm : CFCMOV_Aliases<"e" ,  4>;
+defm : CFCMOV_Aliases<"ne",  5>;
+defm : CFCMOV_Aliases<"be",  6>;
+defm : CFCMOV_Aliases<"a" ,  7>;
+defm : CFCMOV_Aliases<"s" ,  8>;
+defm : CFCMOV_Aliases<"ns",  9>;
+defm : CFCMOV_Aliases<"p" , 10>;
+defm : CFCMOV_Aliases<"np", 11>;
+defm : CFCMOV_Aliases<"l" , 12>;
+defm : CFCMOV_Aliases<"ge", 13>;
+defm : CFCMOV_Aliases<"le", 14>;
+defm : CFCMOV_Aliases<"g" , 15>;
+
 // Condition dump instructions Alias
 def : InstAlias<"jo\t$dst",  (JCC_1 brtarget8:$dst,  0), 0>;
 def : InstAlias<"jno\t$dst", (JCC_1 brtarget8:$dst,  1), 0>;
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 2e31c05cd687d..d41591f68a605 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -13,46 +13,72 @@
 
 
 // CMOV instructions.
-let isCodeGenOnly = 1, ForceDisassemble = 1 in {
-let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst",
-    isCommutable = 1, SchedRW = [WriteCMOV] in {
-  def CMOV16rr
-    : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond),
-        "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
-        [(set GR16:$dst,
-              (X86cmov GR16:$src1, GR16:$src2, timm:$cond, EFLAGS))]>,
-              TB, OpSize16;
-  def CMOV32rr
-    : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond),
-        "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
-        [(set GR32:$dst,
-              (X86cmov GR32:$src1, GR32:$src2, timm:$cond, EFLAGS))]>,
-              TB, OpSize32;
-  def CMOV64rr
-    :RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond),
-        "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
-        [(set GR64:$dst,
-              (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB;
+multiclass Cmov<X86TypeInfo t, string args, bit ndd = 0, string suffix = ""> {
+let isCommutable = 1, SchedRW = [WriteCMOV] in
+  def rr#suffix : ITy<0x40, MRMSrcRegCC, t, (outs t.RegClass:$dst),
+                      (ins t.RegClass:$src1, t.RegClass:$src2, ccode:$cond),
+                      "cmov${cond}", args,
+                      [(set t.RegClass:$dst, (X86cmov t.RegClass:$src1,
+                                        t.RegClass:$src2, timm:$cond, EFLAGS))]>, UseEFLAGS, NDD<ndd>;
+let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in
+  def rm#suffix : ITy<0x40, MRMSrcMemCC, t, (outs t.RegClass:$dst),
+                      (ins t.RegClass:$src1, t.MemOperand:$src2, ccode:$cond),
+                      "cmov${cond}", args,
+                      [(set t.RegClass:$dst, (X86cmov t.RegClass:$src1,
+                                    (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>, UseEFLAGS, NDD<ndd>;
+}
+
+multiclass Cfcmov<X86TypeInfo t> {
+let isCommutable = 1, SchedRW = [WriteCMOV] in {
+let Predicates = [HasCMOV, HasCF, In64BitMode] in {
+  def rr : ITy<0x40, MRMDestRegCC, t, (outs t.RegClass:$dst),
+               (ins t.RegClass:$src1, ccode:$cond),
+               "cfcmov${cond}", unaryop_ndd_args,
+               [(set t.RegClass:$dst,
+                 (X86cmov 0, t.RegClass:$src1, timm:$cond, EFLAGS))]>, UseEFLAGS, NF;
+  def rr_REV : ITy<0x40, MRMSrcRegCC, t, (outs t.RegClass:$dst),
+                   (ins t.RegClass:$src1, ccode:$cond),
+                   "cfcmov${cond}", unaryop_ndd_args,
+                   []>, UseEFLAGS, EVEX, T_MAP4;
+}
+let Predicates = [HasCMOV, HasCF, HasNDD, In64BitMode] in
+  def rr_ND : ITy<0x40, MRMSrcRegCC, t, (outs t.RegClass:$dst),
+                  (ins t.RegClass:$src1, t.RegClass:$src2, ccode:$cond),
+                  "cfcmov${cond}", binop_ndd_args, []>, UseEFLAGS, NDD<1>, NF;
 }
+let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
+  let Predicates = [HasCMOV, HasCF, In64BitMode], mayLoad = 1 in
+    def rm : ITy<0x40, MRMSrcMemCC, t, (outs t.RegClass:$dst),
+                 (ins t.MemOperand:$src1, ccode:$cond),
+                 "cfcmov${cond}", unaryop_ndd_args, []>, UseEFLAGS, EVEX, T_MAP4;
+  let Predicates = [HasCMOV, HasCF, HasNDD, In64BitMode], mayLoad = 1 in
+    def rm_ND : ITy<0x40, MRMSrcMemCC, t, (outs t.RegClass:$dst),
+                    (ins t.RegClass:$src1, t.MemOperand:$src2, ccode:$cond),
+                    "cfcmov${cond}", binop_ndd_args, []>, UseEFLAGS, NDD<1>, NF;
+}
+let SchedRW = [WriteCMOV, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault],
+    Predicates = [HasCMOV, HasCF, In64BitMode], mayStore = 1 in
+  def mr : ITy<0x40, MRMDestMemCC, t, (outs t.MemOperand:$dst),
+                (ins t.RegClass:$src1, ccode:$cond),
+                "cfcmov${cond}", unaryop_ndd_args, []>, UseEFLAGS, NF;
+}
+
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+  let Predicates = [HasCMOV, NoNDD], Constraints = "$dst = $src1" in {
+    defm CMOV16 : Cmov<Xi16, binop_args>, OpSize16, TB;
+    defm CMOV32 : Cmov<Xi32, binop_args>, OpSize32, TB;
+    defm CMOV64 : Cmov<Xi64, binop_args>, TB;
+  }
 
-let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst",
-    SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
-  def CMOV16rm
-    : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond),
-        "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
-        [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
-                                  timm:$cond, EFLAGS))]>, TB, OpSize16;
-  def CMOV32rm
-    : I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond),
-        "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
-        [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
-                                  timm:$cond, EFLAGS))]>, TB, OpSize32;
-  def CMOV64rm
-    :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond),
-        "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
-        [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
-                                  timm:$cond, EFLAGS))]>, TB;
-} // Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst"
+  let Predicates = [HasCMOV, HasNDD, In64BitMode] in {
+    defm CMOV16 : Cmov<Xi16, binop_ndd_args, 1, "_ND">, PD;
+    defm CMOV32 : Cmov<Xi32, binop_ndd_args, 1, "_ND">;
+    defm CMOV64 : Cmov<Xi64, binop_ndd_args, 1, "_ND">;
+  }
+
+  defm CFCMOV16 : Cfcmov<Xi16>, PD;
+  defm CFCMOV32 : Cfcmov<Xi32>;
+  defm CFCMOV64 : Cfcmov<Xi64>;
 } // isCodeGenOnly = 1, ForceDisassemble = 1
 
 def inv_cond_XFORM : SDNodeXForm<imm, [{
@@ -63,7 +89,7 @@ def inv_cond_XFORM : SDNodeXForm<imm, [{
 
 // Conditional moves with folded loads with operands swapped and conditions
 // inverted.
-let Predicates = [HasCMOV] in {
+let Predicates = [HasCMOV, NoNDD] in {
   def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
             (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
   def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
@@ -72,6 +98,23 @@ let Predicates = [HasCMOV] in {
             (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
 }
 
+let Predicates = [HasCMOV, HasNDD] in {
+  def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
+            (CMOV16rm_ND GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+  def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
+            (CMOV32rm_ND GR32:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+  def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
+            (CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+}
+let Predicates = [HasCMOV, HasCF] in {
+  def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
+            (CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>;
+  def : Pat<(X86cmov GR32:$src1, 0, timm:$cond, EFLAGS),
+            (CFCMOV32rr GR32:$src1, (inv_cond_XFORM timm:$cond))>;
+  def : Pat<(X86cmov GR64:$src1, 0, timm:$cond, EFLAGS),
+            (CFCMOV64rr GR64:$src1, (inv_cond_XFORM timm:$cond))>;
+}
+
 // SetCC instructions.
 let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in {
   def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond),
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index f393f86e64aad..ce3b6af4cab47 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -507,6 +507,16 @@ def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
                   Requires<[In64BitMode, NotLP64]>;
 }
 
+// TLSDESC only clobbers EAX and EFLAGS. ESP is marked as a use to prevent
+// stack-pointer assignments that appear immediately before calls from
+// potentially appearing dead.
+let Defs = [EAX, EFLAGS], usesCustomInserter = 1, Uses = [RSP, SSP] in {
+  def TLS_desc32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                     "# TLS_desc32", [(X86tlsdesc tls32addr:$sym)]>;
+  def TLS_desc64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                     "# TLS_desc64", [(X86tlsdesc tls64addr:$sym)]>;
+}
+
 // Darwin TLS Support
 // For i386, the address of the thunk is passed on the stack, on return the
 // address of the variable is in %eax.  %ecx is trashed during the function
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 13014293267b3..31ee288c6f8bb 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -28,6 +28,8 @@ def RawFrmImm8    : Format<7>;
 def RawFrmImm16   : Format<8>;
 def AddCCFrm      : Format<9>;
 def PrefixByte    : Format<10>;
+def MRMDestRegCC  : Format<18>;
+def MRMDestMemCC  : Format<19>;
 def MRMDestMem4VOp3CC : Format<20>;
 def MRMr0          : Format<21>;
 def MRMSrcMemFSIB  : Format<22>;
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index adf527d72f5b4..f14c7200af968 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -223,6 +223,9 @@ def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
 def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
+def X86tlsdesc : SDNode<"X86ISD::TLSDESC", SDT_X86TLSADDR,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
 def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
                         [SDNPHasChain]>;
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b65f49527ae5d..eb42a4b2119d5 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2637,9 +2637,9 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     WorkingMI = CloneIfNew(MI);
     WorkingMI->setDesc(get(Opc));
     break;
-  case X86::CMOV16rr:
-  case X86::CMOV32rr:
-  case X86::CMOV64rr: {
+  CASE_ND(CMOV16rr)
+  CASE_ND(CMOV32rr)
+  CASE_ND(CMOV64rr) {
     WorkingMI = CloneIfNew(MI);
     unsigned OpNo = MI.getDesc().getNumOperands() - 1;
     X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
@@ -3120,7 +3120,8 @@ bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
 
 int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
   unsigned Opcode = MCID.getOpcode();
-  if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode)))
+  if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode) ||
+        X86::isCFCMOVCC(Opcode)))
     return -1;
   // Assume that condition code is always the last use operand.
   unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
@@ -3151,6 +3152,11 @@ X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
                                        : X86::COND_INVALID;
 }
 
+X86::CondCode X86::getCondFromCFCMov(const MachineInstr &MI) {
+  return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
+                                         : X86::COND_INVALID;
+}
+
 /// Return the inverse of the specified condition,
 /// e.g. turning COND_E to COND_NE.
 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
@@ -3312,16 +3318,21 @@ X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
 }
 
 /// Return a cmov opcode for the given register size in bytes, and operand type.
-unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
+unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
+                            bool HasNDD) {
   switch (RegBytes) {
   default:
     llvm_unreachable("Illegal register size!");
+#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
   case 2:
-    return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
+    return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
+                            : GET_ND_IF_ENABLED(X86::CMOV16rr);
   case 4:
-    return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
+    return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
+                            : GET_ND_IF_ENABLED(X86::CMOV32rr);
   case 8:
-    return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
+    return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
+                            : GET_ND_IF_ENABLED(X86::CMOV64rr);
   }
 }
 
@@ -4047,8 +4058,9 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
   const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
   const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
   assert(Cond.size() == 1 && "Invalid Cond array");
-  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
-                                    false /*HasMemoryOperand*/);
+  unsigned Opc =
+      X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
+                         false /*HasMemoryOperand*/, Subtarget.hasNDD());
   BuildMI(MBB, I, DL, get(Opc), DstReg)
       .addReg(FalseReg)
       .addReg(TrueReg)
@@ -10565,15 +10577,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
     bool DoRegPressureReduce) const {
   unsigned Opc = Root.getOpcode();
   switch (Opc) {
-  default:
-    return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
-                                                       DoRegPressureReduce);
   case X86::VPDPWSSDrr:
   case X86::VPDPWSSDrm:
   case X86::VPDPWSSDYrr:
   case X86::VPDPWSSDYrm: {
-    Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+    if (!Subtarget.hasFastDPWSSD()) {
+      Patterns.push_back(MachineCombinerPattern::DPWSSD);
+      return true;
+    }
+    break;
   }
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128m:
@@ -10581,11 +10593,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDZ256m:
   case X86::VPDPWSSDZr:
   case X86::VPDPWSSDZm: {
-    if (Subtarget.hasBWI())
+   if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
       Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+      return true;
+    }
+    break;
   }
   }
+  return TargetInstrInfo::getMachineCombinerPatterns(Root,
+                                                     Patterns, DoRegPressureReduce);
 }
 
 static void
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 0e5fcbeda08f7..e719be0caf3ee 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -42,7 +42,8 @@ enum AsmComments {
 std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
 
 /// Return a cmov opcode for the given register size in bytes, and operand type.
-unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false);
+unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false,
+                       bool HasNDD = false);
 
 /// Return the source operand # for condition code by \p MCID. If the
 /// instruction doesn't have a condition code, return -1.
@@ -61,6 +62,9 @@ CondCode getCondFromSETCC(const MachineInstr &MI);
 // Turn CMOV instruction into condition code.
 CondCode getCondFromCMov(const MachineInstr &MI);
 
+// Turn CFCMOV instruction into condition code.
+CondCode getCondFromCFCMov(const MachineInstr &MI);
+
 /// GetOppositeBranchCondition - Return the inverse of the specified cond,
 /// e.g. turning COND_E to COND_NE.
 CondCode GetOppositeBranchCondition(CondCode CC);
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7dd51ba6c027a..9f2709d6b1a20 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -45,6 +45,7 @@ def NoEGPR       : Predicate<"!Subtarget->hasEGPR()">;
 // entries, so that the NDD variant can be selected first to benefit RA.
 def HasNDD       : Predicate<"Subtarget->hasNDD()">;
 def NoNDD        : Predicate<"!Subtarget->hasNDD()">;
+def HasCF        : Predicate<"Subtarget->hasCF()">;
 def HasCMOV      : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV       : Predicate<"!Subtarget->canUseCMOV()">;
 def HasNOPL      : Predicate<"Subtarget->hasNOPL()">;
@@ -238,5 +239,6 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
 def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
 def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
 def HasMFence    : Predicate<"Subtarget->hasMFence()">;
+def HasFastDPWSSD: Predicate<"Subtarget->hasFastDPWSSD()">;
 def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
 def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index a57c0fe157884..b69058787a4e2 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -102,7 +102,7 @@ static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB,
   auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
   unsigned AllocaAS = DL.getAllocaAddrSpace();
   AllocaInst *AllocaRes =
-      new AllocaInst(Ty, AllocaAS, "", &F.getEntryBlock().front());
+      new AllocaInst(Ty, AllocaAS, "", F.getEntryBlock().begin());
   AllocaRes->setAlignment(AllocaAlignment);
   return AllocaRes;
 }
@@ -453,7 +453,7 @@ static Value *getAllocaPos(BasicBlock *BB) {
   unsigned AllocaAS = DL.getAllocaAddrSpace();
   Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
   AllocaInst *AllocaRes =
-      new AllocaInst(V256I32Ty, AllocaAS, "", &F->getEntryBlock().front());
+      new AllocaInst(V256I32Ty, AllocaAS, "", F->getEntryBlock().begin());
   BasicBlock::iterator Iter = AllocaRes->getIterator();
   ++Iter;
   Builder.SetInsertPoint(&*Iter);
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 64d4d411e7b43..e2330ff34c175 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -519,10 +519,8 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
 void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
                                  const MachineInstr &MI) {
   NoAutoPaddingScope NoPadScope(*OutStreamer);
-  bool Is64Bits = MI.getOpcode() != X86::TLS_addr32 &&
-                  MI.getOpcode() != X86::TLS_base_addr32;
-  bool Is64BitsLP64 = MI.getOpcode() == X86::TLS_addr64 ||
-                      MI.getOpcode() == X86::TLS_base_addr64;
+  bool Is64Bits = getSubtarget().is64Bit();
+  bool Is64BitsLP64 = getSubtarget().isTarget64BitLP64();
   MCContext &Ctx = OutStreamer->getContext();
 
   MCSymbolRefExpr::VariantKind SRVK;
@@ -539,6 +537,10 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
   case X86::TLS_base_addrX32:
     SRVK = MCSymbolRefExpr::VK_TLSLD;
     break;
+  case X86::TLS_desc32:
+  case X86::TLS_desc64:
+    SRVK = MCSymbolRefExpr::VK_TLSDESC;
+    break;
   default:
     llvm_unreachable("unexpected opcode");
   }
@@ -554,7 +556,26 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
   bool UseGot = MMI->getModule()->getRtLibUseGOT() &&
                 Ctx.getTargetOptions()->X86RelaxRelocations;
 
-  if (Is64Bits) {
+  if (SRVK == MCSymbolRefExpr::VK_TLSDESC) {
+    const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(
+        MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)),
+        MCSymbolRefExpr::VK_TLSCALL, Ctx);
+    EmitAndCountInstruction(
+        MCInstBuilder(Is64BitsLP64 ? X86::LEA64r : X86::LEA32r)
+            .addReg(Is64BitsLP64 ? X86::RAX : X86::EAX)
+            .addReg(Is64Bits ? X86::RIP : X86::EBX)
+            .addImm(1)
+            .addReg(0)
+            .addExpr(Sym)
+            .addReg(0));
+    EmitAndCountInstruction(
+        MCInstBuilder(Is64Bits ? X86::CALL64m : X86::CALL32m)
+            .addReg(Is64BitsLP64 ? X86::RAX : X86::EAX)
+            .addImm(1)
+            .addReg(0)
+            .addExpr(Expr)
+            .addReg(0));
+  } else if (Is64Bits) {
     bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD;
     if (NeedsPadding && Is64BitsLP64)
       EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
@@ -2164,6 +2185,8 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::TLS_base_addr32:
   case X86::TLS_base_addr64:
   case X86::TLS_base_addrX32:
+  case X86::TLS_desc32:
+  case X86::TLS_desc64:
     return LowerTlsAddr(MCInstLowering, *MI);
 
   case X86::MOVPC32r: {
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index 0fd3e47aaefe7..4e7ded16729d0 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -60,7 +60,7 @@ class X86TargetMachine final : public LLVMTargetMachine {
 
   Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &,
                              raw_pwrite_stream *, CodeGenFileType,
-                             CGPassBuilderOption,
+                             const CGPassBuilderOption &,
                              PassInstrumentationCallbacks *) override;
 
   bool isJIT() const { return IsJIT; }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 1a5e6bc886aa6..23035f655098a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::TuningNoDomainDelayBlend,
       X86::TuningPreferShiftShuffle,
       X86::TuningFastImmVectorShift,
+      X86::TuningFastDPWSSD,
 
       // Perf-tuning flags.
       X86::TuningFastGather,
diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index b5a683de33ab1..5e91cce1068b4 100644
--- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -88,12 +88,15 @@ static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
               BasicBlock *PredBB = PN->getIncomingBlock(I);
               if (PredBB->getTerminator()->getNumSuccessors() > 1)
                 PredBB = SplitEdge(PredBB, PN->getParent());
-              Instruction *InsertPos = PredBB->getTerminator();
-              Instruction *NewInst = CE->getAsInstruction(InsertPos);
+              BasicBlock::iterator InsertPos =
+                  PredBB->getTerminator()->getIterator();
+              Instruction *NewInst = CE->getAsInstruction();
+              NewInst->insertBefore(*PredBB, InsertPos);
               PN->setOperand(I, NewInst);
             }
         } else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
-          Instruction *NewInst = CE->getAsInstruction(Instr);
+          Instruction *NewInst = CE->getAsInstruction();
+          NewInst->insertBefore(*Instr->getParent(), Instr->getIterator());
           Instr->replaceUsesOfWith(CE, NewInst);
         } else {
           ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index f65ed259eade4..d2c9bae97364e 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -220,6 +220,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0xd05", "cortex-a55")
         .Case("0xd46", "cortex-a510")
         .Case("0xd80", "cortex-a520")
+        .Case("0xd88", "cortex-a520ae")
         .Case("0xd07", "cortex-a57")
         .Case("0xd06", "cortex-a65")
         .Case("0xd43", "cortex-a65ae")
@@ -235,6 +236,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0xd47", "cortex-a710")
         .Case("0xd4d", "cortex-a715")
         .Case("0xd81", "cortex-a720")
+        .Case("0xd89", "cortex-a720ae")
         .Case("0xd44", "cortex-x1")
         .Case("0xd4c", "cortex-x1c")
         .Case("0xd48", "cortex-x2")
diff --git a/llvm/lib/TextAPI/TextStub.cpp b/llvm/lib/TextAPI/TextStub.cpp
index 24a52607a981c..0f742523f8207 100644
--- a/llvm/lib/TextAPI/TextStub.cpp
+++ b/llvm/lib/TextAPI/TextStub.cpp
@@ -1079,16 +1079,16 @@ Expected<FileType> TextAPIReader::canRead(MemoryBufferRef InputBuffer) {
   if (!TAPIFile.ends_with("..."))
     return createStringError(std::errc::not_supported, "unsupported file type");
 
-  if (TAPIFile.starts_with("--- !tapi-tbd\n"))
+  if (TAPIFile.starts_with("--- !tapi-tbd"))
     return FileType::TBD_V4;
 
-  if (TAPIFile.starts_with("--- !tapi-tbd-v3\n"))
+  if (TAPIFile.starts_with("--- !tapi-tbd-v3"))
     return FileType::TBD_V3;
 
-  if (TAPIFile.starts_with("--- !tapi-tbd-v2\n"))
+  if (TAPIFile.starts_with("--- !tapi-tbd-v2"))
     return FileType::TBD_V2;
 
-  if (TAPIFile.starts_with("--- !tapi-tbd-v1\n") ||
+  if (TAPIFile.starts_with("--- !tapi-tbd-v1") ||
       TAPIFile.starts_with("---\narchs:"))
     return FileType::TBD_V1;
 
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 7c29d443df51b..08a4522e3fac6 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -971,7 +971,7 @@ static void cacheDIVar(FrameDataInfo &FrameData,
         DIVarCache.insert({V, (*I)->getVariable()});
     };
     CacheIt(findDbgDeclares(V));
-    CacheIt(findDPVDeclares(V));
+    CacheIt(findDVRDeclares(V));
   }
 }
 
@@ -1123,7 +1123,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
          "Coroutine with switch ABI should own Promise alloca");
 
   TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(PromiseAlloca);
-  TinyPtrVector<DPValue *> DPVs = findDPVDeclares(PromiseAlloca);
+  TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(PromiseAlloca);
 
   DILocalVariable *PromiseDIVariable = nullptr;
   DILocation *DILoc = nullptr;
@@ -1131,10 +1131,10 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
     DbgDeclareInst *PromiseDDI = DIs.front();
     PromiseDIVariable = PromiseDDI->getVariable();
     DILoc = PromiseDDI->getDebugLoc().get();
-  } else if (!DPVs.empty()) {
-    DPValue *PromiseDPV = DPVs.front();
-    PromiseDIVariable = PromiseDPV->getVariable();
-    DILoc = PromiseDPV->getDebugLoc().get();
+  } else if (!DVRs.empty()) {
+    DbgVariableRecord *PromiseDVR = DVRs.front();
+    PromiseDIVariable = PromiseDVR->getVariable();
+    DILoc = PromiseDVR->getDebugLoc().get();
   } else {
     return;
   }
@@ -1273,11 +1273,12 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
   }
 
   if (UseNewDbgInfoFormat) {
-    DPValue *NewDPV = new DPValue(ValueAsMetadata::get(Shape.FramePtr),
-                                  FrameDIVar, DBuilder.createExpression(),
-                                  DILoc, DPValue::LocationType::Declare);
+    DbgVariableRecord *NewDVR =
+        new DbgVariableRecord(ValueAsMetadata::get(Shape.FramePtr), FrameDIVar,
+                              DBuilder.createExpression(), DILoc,
+                              DbgVariableRecord::LocationType::Declare);
     BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr();
-    It->getParent()->insertDbgRecordBefore(NewDPV, It);
+    It->getParent()->insertDbgRecordBefore(NewDVR, It);
   } else {
     DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
                            DBuilder.createExpression(), DILoc,
@@ -1862,13 +1863,13 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
               SpillAlignment, E.first->getName() + Twine(".reload"));
 
         TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(Def);
-        TinyPtrVector<DPValue *> DPVs = findDPVDeclares(Def);
+        TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(Def);
         // Try best to find dbg.declare. If the spill is a temp, there may not
         // be a direct dbg.declare. Walk up the load chain to find one from an
         // alias.
         if (F->getSubprogram()) {
           auto *CurDef = Def;
-          while (DIs.empty() && DPVs.empty() && isa<LoadInst>(CurDef)) {
+          while (DIs.empty() && DVRs.empty() && isa<LoadInst>(CurDef)) {
             auto *LdInst = cast<LoadInst>(CurDef);
             // Only consider ptr to ptr same type load.
             if (LdInst->getPointerOperandType() != LdInst->getType())
@@ -1877,7 +1878,7 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
             if (!isa<AllocaInst, LoadInst>(CurDef))
               break;
             DIs = findDbgDeclares(CurDef);
-            DPVs = findDPVDeclares(CurDef);
+            DVRs = findDVRDeclares(CurDef);
           }
         }
 
@@ -1887,12 +1888,12 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
           // fragments. It will be unreachable in the main function, and
           // processed by coro::salvageDebugInfo() by CoroCloner.
           if (UseNewDbgInfoFormat) {
-            DPValue *NewDPV =
-                new DPValue(ValueAsMetadata::get(CurrentReload),
-                            DDI->getVariable(), DDI->getExpression(),
-                            DDI->getDebugLoc(), DPValue::LocationType::Declare);
+            DbgVariableRecord *NewDVR = new DbgVariableRecord(
+                ValueAsMetadata::get(CurrentReload), DDI->getVariable(),
+                DDI->getExpression(), DDI->getDebugLoc(),
+                DbgVariableRecord::LocationType::Declare);
             Builder.GetInsertPoint()->getParent()->insertDbgRecordBefore(
-                NewDPV, Builder.GetInsertPoint());
+                NewDVR, Builder.GetInsertPoint());
           } else {
             DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved)
                 .insertDeclare(CurrentReload, DDI->getVariable(),
@@ -1905,7 +1906,7 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
                                  false /*UseEntryValue*/);
         };
         for_each(DIs, SalvageOne);
-        for_each(DPVs, SalvageOne);
+        for_each(DVRs, SalvageOne);
       }
 
       // If we have a single edge PHINode, remove it and replace it with a
@@ -1925,8 +1926,8 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
       U->replaceUsesOfWith(Def, CurrentReload);
       // Instructions are added to Def's user list if the attached
       // debug records use Def. Update those now.
-      for (DPValue &DPV : DPValue::filter(U->getDbgRecordRange()))
-        DPV.replaceVariableLocationOp(Def, CurrentReload, true);
+      for (DbgVariableRecord &DVR : filterDbgVars(U->getDbgRecordRange()))
+        DVR.replaceVariableLocationOp(Def, CurrentReload, true);
     }
   }
 
@@ -1977,12 +1978,12 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
     G->setName(Alloca->getName() + Twine(".reload.addr"));
 
     SmallVector<DbgVariableIntrinsic *, 4> DIs;
-    SmallVector<DPValue *> DPValues;
-    findDbgUsers(DIs, Alloca, &DPValues);
+    SmallVector<DbgVariableRecord *> DbgVariableRecords;
+    findDbgUsers(DIs, Alloca, &DbgVariableRecords);
     for (auto *DVI : DIs)
       DVI->replaceUsesOfWith(Alloca, G);
-    for (auto *DPV : DPValues)
-      DPV->replaceVariableLocationOp(Alloca, G);
+    for (auto *DVR : DbgVariableRecords)
+      DVR->replaceVariableLocationOp(Alloca, G);
 
     for (Instruction *I : UsersToUpdate) {
       // It is meaningless to retain the lifetime intrinsics refer for the
@@ -2960,30 +2961,30 @@ void coro::salvageDebugInfo(
 }
 
 void coro::salvageDebugInfo(
-    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, DPValue &DPV,
-    bool OptimizeFrame, bool UseEntryValue) {
+    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
+    DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue) {
 
-  Function *F = DPV.getFunction();
+  Function *F = DVR.getFunction();
   // Follow the pointer arithmetic all the way to the incoming
   // function argument and convert into a DIExpression.
-  bool SkipOutermostLoad = DPV.isDbgDeclare();
-  Value *OriginalStorage = DPV.getVariableLocationOp(0);
+  bool SkipOutermostLoad = DVR.isDbgDeclare();
+  Value *OriginalStorage = DVR.getVariableLocationOp(0);
 
   auto SalvagedInfo = ::salvageDebugInfoImpl(
       ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage,
-      DPV.getExpression(), SkipOutermostLoad);
+      DVR.getExpression(), SkipOutermostLoad);
   if (!SalvagedInfo)
     return;
 
   Value *Storage = &SalvagedInfo->first;
   DIExpression *Expr = &SalvagedInfo->second;
 
-  DPV.replaceVariableLocationOp(OriginalStorage, Storage);
-  DPV.setExpression(Expr);
+  DVR.replaceVariableLocationOp(OriginalStorage, Storage);
+  DVR.setExpression(Expr);
   // We only hoist dbg.declare today since it doesn't make sense to hoist
   // dbg.value since it does not have the same function wide guarantees that
   // dbg.declare does.
-  if (DPV.getType() == DPValue::LocationType::Declare) {
+  if (DVR.getType() == DbgVariableRecord::LocationType::Declare) {
     std::optional<BasicBlock::iterator> InsertPt;
     if (auto *I = dyn_cast<Instruction>(Storage)) {
       InsertPt = I->getInsertionPointAfterDef();
@@ -2991,12 +2992,12 @@ void coro::salvageDebugInfo(
       // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
       // an example.
       if (!OptimizeFrame && I->getDebugLoc())
-        DPV.setDebugLoc(I->getDebugLoc());
+        DVR.setDebugLoc(I->getDebugLoc());
     } else if (isa<Argument>(Storage))
       InsertPt = F->getEntryBlock().begin();
     if (InsertPt) {
-      DPV.removeFromParent();
-      (*InsertPt)->getParent()->insertDbgRecordBefore(&DPV, *InsertPt);
+      DVR.removeFromParent();
+      (*InsertPt)->getParent()->insertDbgRecordBefore(&DVR, *InsertPt);
     }
   }
 }
@@ -3188,15 +3189,15 @@ void coro::buildCoroutineFrame(
   for (auto &Iter : FrameData.Spills) {
     auto *V = Iter.first;
     SmallVector<DbgValueInst *, 16> DVIs;
-    SmallVector<DPValue *, 16> DPVs;
-    findDbgValues(DVIs, V, &DPVs);
+    SmallVector<DbgVariableRecord *, 16> DVRs;
+    findDbgValues(DVIs, V, &DVRs);
     for (DbgValueInst *DVI : DVIs)
       if (Checker.isDefinitionAcrossSuspend(*V, DVI))
         FrameData.Spills[V].push_back(DVI);
     // Add the instructions which carry debug info that is in the frame.
-    for (DPValue *DPV : DPVs)
-      if (Checker.isDefinitionAcrossSuspend(*V, DPV->Marker->MarkedInstr))
-        FrameData.Spills[V].push_back(DPV->Marker->MarkedInstr);
+    for (DbgVariableRecord *DVR : DVRs)
+      if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr))
+        FrameData.Spills[V].push_back(DVR->Marker->MarkedInstr);
   }
 
   LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills));
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 09d1430b4c559..84fd88806154e 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -35,8 +35,8 @@ void salvageDebugInfo(
     SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
     DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool IsEntryPoint);
 void salvageDebugInfo(
-    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, DPValue &DPV,
-    bool OptimizeFrame, bool UseEntryValue);
+    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
+    DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue);
 
 // Keeps data and helper functions for lowering coroutine intrinsics.
 struct LowererBase {
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 086971a1f2137..3f3d81474fafe 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -679,17 +679,18 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape,
 }
 
 /// Returns all DbgVariableIntrinsic in F.
-static std::pair<SmallVector<DbgVariableIntrinsic *, 8>, SmallVector<DPValue *>>
+static std::pair<SmallVector<DbgVariableIntrinsic *, 8>,
+                 SmallVector<DbgVariableRecord *>>
 collectDbgVariableIntrinsics(Function &F) {
   SmallVector<DbgVariableIntrinsic *, 8> Intrinsics;
-  SmallVector<DPValue *> DPValues;
+  SmallVector<DbgVariableRecord *> DbgVariableRecords;
   for (auto &I : instructions(F)) {
-    for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange()))
-      DPValues.push_back(&DPV);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      DbgVariableRecords.push_back(&DVR);
     if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
       Intrinsics.push_back(DVI);
   }
-  return {Intrinsics, DPValues};
+  return {Intrinsics, DbgVariableRecords};
 }
 
 void CoroCloner::replaceSwiftErrorOps() {
@@ -697,7 +698,7 @@ void CoroCloner::replaceSwiftErrorOps() {
 }
 
 void CoroCloner::salvageDebugInfo() {
-  auto [Worklist, DPValues] = collectDbgVariableIntrinsics(*NewF);
+  auto [Worklist, DbgVariableRecords] = collectDbgVariableIntrinsics(*NewF);
   SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;
 
   // Only 64-bit ABIs have a register we can refer to with the entry value.
@@ -706,8 +707,8 @@ void CoroCloner::salvageDebugInfo() {
   for (DbgVariableIntrinsic *DVI : Worklist)
     coro::salvageDebugInfo(ArgToAllocaMap, *DVI, Shape.OptimizeFrame,
                            UseEntryValue);
-  for (DPValue *DPV : DPValues)
-    coro::salvageDebugInfo(ArgToAllocaMap, *DPV, Shape.OptimizeFrame,
+  for (DbgVariableRecord *DVR : DbgVariableRecords)
+    coro::salvageDebugInfo(ArgToAllocaMap, *DVR, Shape.OptimizeFrame,
                            UseEntryValue);
 
   // Remove all salvaged dbg.declare intrinsics that became
@@ -732,7 +733,7 @@ void CoroCloner::salvageDebugInfo() {
     }
   };
   for_each(Worklist, RemoveOne);
-  for_each(DPValues, RemoveOne);
+  for_each(DbgVariableRecords, RemoveOne);
 }
 
 void CoroCloner::replaceEntryBlock() {
@@ -2107,12 +2108,12 @@ splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones,
   // original function. The Cloner has already salvaged debug info in the new
   // coroutine funclets.
   SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;
-  auto [DbgInsts, DPValues] = collectDbgVariableIntrinsics(F);
+  auto [DbgInsts, DbgVariableRecords] = collectDbgVariableIntrinsics(F);
   for (auto *DDI : DbgInsts)
     coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame,
                            false /*UseEntryValue*/);
-  for (DPValue *DPV : DPValues)
-    coro::salvageDebugInfo(ArgToAllocaMap, *DPV, Shape.OptimizeFrame,
+  for (DbgVariableRecord *DVR : DbgVariableRecords)
+    coro::salvageDebugInfo(ArgToAllocaMap, *DVR, Shape.OptimizeFrame,
                            false /*UseEntryValue*/);
   return Shape;
 }
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index 37f4a8749fd8e..29f9264f2fc24 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -723,8 +723,8 @@ static void moveFunctionData(Function &Old, Function &New,
     for (Instruction &Val : CurrBB) {
       // Since debug-info originates from many different locations in the
       // program, it will cause incorrect reporting from a debugger if we keep
-      // the same debug instructions. Drop non-intrinsic DPValues here,
-      // collect intrinsics for removal later.
+      // the same debug instructions. Drop non-intrinsic DbgVariableRecords
+      // here, collect intrinsics for removal later.
       Val.dropDbgRecords();
 
       // We must handle the scoping of called functions differently than
diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index ed5352e7d04a9..05a3b169aaaf4 100644
--- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -256,20 +256,22 @@ class MergeFunctions {
 
   /// Fill PDIUnrelatedWL with instructions from the entry block that are
   /// unrelated to parameter related debug info.
-  /// \param PDPVUnrelatedWL The equivalent non-intrinsic debug records.
-  void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
-                                 std::vector<Instruction *> &PDIUnrelatedWL,
-                                 std::vector<DPValue *> &PDPVUnrelatedWL);
+  /// \param PDVRUnrelatedWL The equivalent non-intrinsic debug records.
+  void
+  filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
+                            std::vector<Instruction *> &PDIUnrelatedWL,
+                            std::vector<DbgVariableRecord *> &PDVRUnrelatedWL);
 
   /// Erase the rest of the CFG (i.e. barring the entry block).
   void eraseTail(Function *G);
 
   /// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
   /// parameter debug info, from the entry block.
-  /// \param PDPVUnrelatedWL contains the equivalent set of non-instruction
+  /// \param PDVRUnrelatedWL contains the equivalent set of non-instruction
   /// debug-info records.
-  void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL,
-                                std::vector<DPValue *> &PDPVUnrelatedWL);
+  void
+  eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL,
+                           std::vector<DbgVariableRecord *> &PDVRUnrelatedWL);
 
   /// Replace G with a simple tail call to bitcast(F). Also (unless
   /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
@@ -512,7 +514,7 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
 // parameter debug info, from the entry block.
 void MergeFunctions::eraseInstsUnrelatedToPDI(
     std::vector<Instruction *> &PDIUnrelatedWL,
-    std::vector<DPValue *> &PDPVUnrelatedWL) {
+    std::vector<DbgVariableRecord *> &PDVRUnrelatedWL) {
   LLVM_DEBUG(
       dbgs() << " Erasing instructions (in reverse order of appearance in "
                 "entry block) unrelated to parameter debug info from entry "
@@ -526,13 +528,13 @@ void MergeFunctions::eraseInstsUnrelatedToPDI(
     PDIUnrelatedWL.pop_back();
   }
 
-  while (!PDPVUnrelatedWL.empty()) {
-    DPValue *DPV = PDPVUnrelatedWL.back();
-    LLVM_DEBUG(dbgs() << "  Deleting DPValue ");
-    LLVM_DEBUG(DPV->print(dbgs()));
+  while (!PDVRUnrelatedWL.empty()) {
+    DbgVariableRecord *DVR = PDVRUnrelatedWL.back();
+    LLVM_DEBUG(dbgs() << "  Deleting DbgVariableRecord ");
+    LLVM_DEBUG(DVR->print(dbgs()));
     LLVM_DEBUG(dbgs() << "\n");
-    DPV->eraseFromParent();
-    PDPVUnrelatedWL.pop_back();
+    DVR->eraseFromParent();
+    PDVRUnrelatedWL.pop_back();
   }
 
   LLVM_DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
@@ -564,12 +566,12 @@ void MergeFunctions::eraseTail(Function *G) {
 // PDIUnrelatedWL with such instructions.
 void MergeFunctions::filterInstsUnrelatedToPDI(
     BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL,
-    std::vector<DPValue *> &PDPVUnrelatedWL) {
+    std::vector<DbgVariableRecord *> &PDVRUnrelatedWL) {
   std::set<Instruction *> PDIRelated;
-  std::set<DPValue *> PDPVRelated;
+  std::set<DbgVariableRecord *> PDVRRelated;
 
-  // Work out whether a dbg.value intrinsic or an equivalent DPValue is a
-  // parameter to be preserved.
+  // Work out whether a dbg.value intrinsic or an equivalent DbgVariableRecord
+  // is a parameter to be preserved.
   auto ExamineDbgValue = [](auto *DbgVal, auto &Container) {
     LLVM_DEBUG(dbgs() << " Deciding: ");
     LLVM_DEBUG(DbgVal->print(dbgs()));
@@ -641,14 +643,14 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
 
   for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
        BI != BIE; ++BI) {
-    // Examine DPValues as they happen "before" the instruction. Are they
-    // connected to parameters?
-    for (DPValue &DPV : DPValue::filter(BI->getDbgRecordRange())) {
-      if (DPV.isDbgValue() || DPV.isDbgAssign()) {
-        ExamineDbgValue(&DPV, PDPVRelated);
+    // Examine DbgVariableRecords as they happen "before" the instruction. Are
+    // they connected to parameters?
+    for (DbgVariableRecord &DVR : filterDbgVars(BI->getDbgRecordRange())) {
+      if (DVR.isDbgValue() || DVR.isDbgAssign()) {
+        ExamineDbgValue(&DVR, PDVRRelated);
       } else {
-        assert(DPV.isDbgDeclare());
-        ExamineDbgDeclare(&DPV, PDPVRelated);
+        assert(DVR.isDbgDeclare());
+        ExamineDbgDeclare(&DVR, PDVRRelated);
       }
     }
 
@@ -686,8 +688,8 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
 
   // Collect the set of unrelated instructions and debug records.
   for (Instruction &I : *GEntryBlock) {
-    for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange()))
-      IsPDIRelated(&DPV, PDPVRelated, PDPVUnrelatedWL);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      IsPDIRelated(&DVR, PDVRRelated, PDVRUnrelatedWL);
     IsPDIRelated(&I, PDIRelated, PDIUnrelatedWL);
   }
   LLVM_DEBUG(dbgs() << " }\n");
@@ -728,7 +730,7 @@ static void copyMetadataIfPresent(Function *From, Function *To, StringRef Key) {
 void MergeFunctions::writeThunk(Function *F, Function *G) {
   BasicBlock *GEntryBlock = nullptr;
   std::vector<Instruction *> PDIUnrelatedWL;
-  std::vector<DPValue *> PDPVUnrelatedWL;
+  std::vector<DbgVariableRecord *> PDVRUnrelatedWL;
   BasicBlock *BB = nullptr;
   Function *NewG = nullptr;
   if (MergeFunctionsPDI) {
@@ -740,7 +742,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
         dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
                   "debug info for "
                << G->getName() << "() {\n");
-    filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL, PDPVUnrelatedWL);
+    filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL, PDVRUnrelatedWL);
     GEntryBlock->getTerminator()->eraseFromParent();
     BB = GEntryBlock;
   } else {
@@ -790,7 +792,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
                  << G->getName() << "()\n");
     }
     eraseTail(G);
-    eraseInstsUnrelatedToPDI(PDIUnrelatedWL, PDPVUnrelatedWL);
+    eraseInstsUnrelatedToPDI(PDIUnrelatedWL, PDVRUnrelatedWL);
     LLVM_DEBUG(
         dbgs() << "} // End of parameter related debug info filtering for: "
                << G->getName() << "()\n");
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index ffc26f1a0a972..9a8040bc4b064 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1167,6 +1167,9 @@ void SampleProfileLoader::findExternalInlineCandidate(
   // For AutoFDO profile, retrieve candidate profiles by walking over
   // the nested inlinee profiles.
   if (!FunctionSamples::ProfileIsCS) {
+    // Set threshold to zero to honor pre-inliner decision.
+    if (UsePreInlinerDecision)
+      Threshold = 0;
     Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
     return;
   }
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index dd6062d303d42..3986359b6a5a3 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -575,16 +575,17 @@ bool writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
 }
 
 } // anonymous namespace
-
+extern bool WriteNewDbgInfoFormatToBitcode;
 PreservedAnalyses
 llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
   FunctionAnalysisManager &FAM =
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
 
-  // RemoveDIs: there's no bitcode representation of the DPValue debug-info,
-  // convert to dbg.values before writing out.
-  bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
-  if (IsNewDbgInfoFormat)
+  // RemoveDIs: there's no bitcode representation of the DbgVariableRecord
+  // debug-info, convert to dbg.values before writing out.
+  bool ConvertToOldDbgFormatForWrite =
+      M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
+  if (ConvertToOldDbgFormatForWrite)
     M.convertFromNewDbgValues();
 
   bool Changed = writeThinLTOBitcode(
@@ -594,7 +595,7 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
       },
       M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
 
-  if (IsNewDbgInfoFormat)
+  if (ConvertToOldDbgFormatForWrite)
     M.convertToNewDbgValues();
 
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f536b22054c7d..1248e5f34c27f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -274,7 +274,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
         DbgAssign->replaceVariableLocationOp(FillC, FillVal);
     };
     for_each(at::getAssignmentMarkers(S), replaceOpForAssignmentMarkers);
-    for_each(at::getDPVAssignmentMarkers(S), replaceOpForAssignmentMarkers);
+    for_each(at::getDVRAssignmentMarkers(S), replaceOpForAssignmentMarkers);
 
     S->setAlignment(Alignment);
     if (isa<AtomicMemSetInst>(MI))
@@ -2289,13 +2289,14 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         default:
           llvm_unreachable("unexpected intrinsic ID");
         }
-        Instruction *NewCall = Builder.CreateBinaryIntrinsic(
+        Value *V = Builder.CreateBinaryIntrinsic(
             IID, X, ConstantFP::get(Arg0->getType(), Res), II);
         // TODO: Conservatively intersecting FMF. If Res == C2, the transform
         //       was a simplification (so Arg0 and its original flags could
         //       propagate?)
-        NewCall->andIRFlags(M);
-        return replaceInstUsesWith(*II, NewCall);
+        if (auto *CI = dyn_cast<CallInst>(V))
+          CI->andIRFlags(M);
+        return replaceInstUsesWith(*II, V);
       }
     }
 
@@ -2472,6 +2473,16 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X))))
       return replaceOperand(*II, 1, X);
 
+    // Clear sign-bit of constant magnitude:
+    // copysign -MagC, X --> copysign MagC, X
+    // TODO: Support constant folding for fabs
+    const APFloat *MagC;
+    if (match(Mag, m_APFloat(MagC)) && MagC->isNegative()) {
+      APFloat PosMagC = *MagC;
+      PosMagC.clearSign();
+      return replaceOperand(*II, 0, ConstantFP::get(Mag->getType(), PosMagC));
+    }
+
     // Peek through changes of magnitude's sign-bit. This call rewrites those:
     // copysign (fabs X), Sign --> copysign X, Sign
     // copysign (fneg X), Sign --> copysign X, Sign
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 57ec9bf4336ff..bc592f6d89396 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1937,10 +1937,26 @@ Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) {
   return replaceInstUsesWith(FI, X);
 }
 
+static Instruction *foldFPtoI(Instruction &FI, InstCombiner &IC) {
+  // fpto{u/s}i non-norm --> 0
+  FPClassTest Mask =
+      FI.getOpcode() == Instruction::FPToUI ? fcPosNormal : fcNormal;
+  KnownFPClass FPClass =
+      computeKnownFPClass(FI.getOperand(0), Mask, /*Depth=*/0,
+                          IC.getSimplifyQuery().getWithInstruction(&FI));
+  if (FPClass.isKnownNever(Mask))
+    return IC.replaceInstUsesWith(FI, ConstantInt::getNullValue(FI.getType()));
+
+  return nullptr;
+}
+
 Instruction *InstCombinerImpl::visitFPToUI(FPToUIInst &FI) {
   if (Instruction *I = foldItoFPtoI(FI))
     return I;
 
+  if (Instruction *I = foldFPtoI(FI, *this))
+    return I;
+
   return commonCastTransforms(FI);
 }
 
@@ -1948,6 +1964,9 @@ Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) {
   if (Instruction *I = foldItoFPtoI(FI))
     return I;
 
+  if (Instruction *I = foldFPtoI(FI, *this))
+    return I;
+
   return commonCastTransforms(FI);
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 0dce0077bf158..db302d7e52684 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4177,7 +4177,9 @@ static bool isMaskOrZero(const Value *V, bool Not, const SimplifyQuery &Q,
 /// a check for a lossy truncation.
 /// Folds:
 ///   icmp SrcPred (x & Mask), x    to    icmp DstPred x, Mask
+///   icmp SrcPred (x & ~Mask), ~Mask    to    icmp DstPred x, ~Mask
 ///   icmp eq/ne (x & ~Mask), 0     to    icmp DstPred x, Mask
+///   icmp eq/ne (~x | Mask), -1     to    icmp DstPred x, Mask
 /// Where Mask is some pattern that produces all-ones in low bits:
 ///    (-1 >> y)
 ///    ((-1 << y) >> y)     <- non-canonical, has extra uses
@@ -4189,82 +4191,126 @@ static bool isMaskOrZero(const Value *V, bool Not, const SimplifyQuery &Q,
 static Value *foldICmpWithLowBitMaskedVal(ICmpInst::Predicate Pred, Value *Op0,
                                           Value *Op1, const SimplifyQuery &Q,
                                           InstCombiner &IC) {
-  Value *X, *M;
-  bool NeedsNot = false;
-
-  auto CheckMask = [&](Value *V, bool Not) {
-    if (ICmpInst::isSigned(Pred) && !match(V, m_ImmConstant()))
-      return false;
-    return isMaskOrZero(V, Not, Q);
-  };
-
-  if (match(Op0, m_c_And(m_Specific(Op1), m_Value(M))) &&
-      CheckMask(M, /*Not*/ false)) {
-    X = Op1;
-  } else if (match(Op1, m_Zero()) && ICmpInst::isEquality(Pred) &&
-             match(Op0, m_OneUse(m_And(m_Value(X), m_Value(M))))) {
-    NeedsNot = true;
-    if (IC.isFreeToInvert(X, X->hasOneUse()) && CheckMask(X, /*Not*/ true))
-      std::swap(X, M);
-    else if (!IC.isFreeToInvert(M, M->hasOneUse()) ||
-             !CheckMask(M, /*Not*/ true))
-      return nullptr;
-  } else {
-    return nullptr;
-  }
 
   ICmpInst::Predicate DstPred;
   switch (Pred) {
   case ICmpInst::Predicate::ICMP_EQ:
-    //  x & (-1 >> y) == x    ->    x u<= (-1 >> y)
+    //  x & Mask == x
+    //  x & ~Mask == 0
+    //  ~x | Mask == -1
+    //    ->    x u<= Mask
+    //  x & ~Mask == ~Mask
+    //    ->    ~Mask u<= x
     DstPred = ICmpInst::Predicate::ICMP_ULE;
     break;
   case ICmpInst::Predicate::ICMP_NE:
-    //  x & (-1 >> y) != x    ->    x u> (-1 >> y)
+    //  x & Mask != x
+    //  x & ~Mask != 0
+    //  ~x | Mask != -1
+    //    ->    x u> Mask
+    //  x & ~Mask != ~Mask
+    //    ->    ~Mask u> x
     DstPred = ICmpInst::Predicate::ICMP_UGT;
     break;
   case ICmpInst::Predicate::ICMP_ULT:
-    //  x & (-1 >> y) u< x    ->    x u> (-1 >> y)
-    //  x u> x & (-1 >> y)    ->    x u> (-1 >> y)
+    //  x & Mask u< x
+    //    -> x u> Mask
+    //  x & ~Mask u< ~Mask
+    //    -> ~Mask u> x
     DstPred = ICmpInst::Predicate::ICMP_UGT;
     break;
   case ICmpInst::Predicate::ICMP_UGE:
-    //  x & (-1 >> y) u>= x    ->    x u<= (-1 >> y)
-    //  x u<= x & (-1 >> y)    ->    x u<= (-1 >> y)
+    //  x & Mask u>= x
+    //    -> x u<= Mask
+    //  x & ~Mask u>= ~Mask
+    //    -> ~Mask u<= x
     DstPred = ICmpInst::Predicate::ICMP_ULE;
     break;
   case ICmpInst::Predicate::ICMP_SLT:
-    //  x & (-1 >> y) s< x    ->    x s> (-1 >> y)
-    //  x s> x & (-1 >> y)    ->    x s> (-1 >> y)
-    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
-      return nullptr;
-    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
-      return nullptr;
+    //  x & Mask s< x [iff Mask s>= 0]
+    //    -> x s> Mask
+    //  x & ~Mask s< ~Mask [iff ~Mask != 0]
+    //    -> ~Mask s> x
     DstPred = ICmpInst::Predicate::ICMP_SGT;
     break;
   case ICmpInst::Predicate::ICMP_SGE:
-    //  x & (-1 >> y) s>= x    ->    x s<= (-1 >> y)
-    //  x s<= x & (-1 >> y)    ->    x s<= (-1 >> y)
-    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
-      return nullptr;
-    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
-      return nullptr;
+    //  x & Mask s>= x [iff Mask s>= 0]
+    //    -> x s<= Mask
+    //  x & ~Mask s>= ~Mask [iff ~Mask != 0]
+    //    -> ~Mask s<= x
     DstPred = ICmpInst::Predicate::ICMP_SLE;
     break;
-  case ICmpInst::Predicate::ICMP_SGT:
-  case ICmpInst::Predicate::ICMP_SLE:
-    return nullptr;
-  case ICmpInst::Predicate::ICMP_UGT:
-  case ICmpInst::Predicate::ICMP_ULE:
-    llvm_unreachable("Instsimplify took care of commut. variant");
-    break;
   default:
-    llvm_unreachable("All possible folds are handled.");
+    // We don't support sgt,sle
+    // ult/ugt are simplified to true/false respectively.
+    return nullptr;
   }
 
-  // The mask value may be a vector constant that has undefined elements. But it
-  // may not be safe to propagate those undefs into the new compare, so replace
-  // those elements by copying an existing, defined, and safe scalar constant.
+  Value *X, *M;
+  // Put search code in lambda for early positive returns.
+  auto IsLowBitMask = [&]() {
+    if (match(Op0, m_c_And(m_Specific(Op1), m_Value(M)))) {
+      X = Op1;
+      // Look for: x & Mask pred x
+      if (isMaskOrZero(M, /*Not=*/false, Q)) {
+        return !ICmpInst::isSigned(Pred) ||
+               (match(M, m_NonNegative()) || isKnownNonNegative(M, Q));
+      }
+
+      // Look for: x & ~Mask pred ~Mask
+      if (isMaskOrZero(X, /*Not=*/true, Q)) {
+        return !ICmpInst::isSigned(Pred) ||
+               isKnownNonZero(X, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT);
+      }
+      return false;
+    }
+    if (ICmpInst::isEquality(Pred) && match(Op1, m_AllOnes()) &&
+        match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(M))))) {
+
+      auto Check = [&]() {
+        // Look for: ~x | Mask == -1
+        if (isMaskOrZero(M, /*Not=*/false, Q)) {
+          if (Value *NotX =
+                  IC.getFreelyInverted(X, X->hasOneUse(), &IC.Builder)) {
+            X = NotX;
+            return true;
+          }
+        }
+        return false;
+      };
+      if (Check())
+        return true;
+      std::swap(X, M);
+      return Check();
+    }
+    if (ICmpInst::isEquality(Pred) && match(Op1, m_Zero()) &&
+        match(Op0, m_OneUse(m_And(m_Value(X), m_Value(M))))) {
+      auto Check = [&]() {
+        // Look for: x & ~Mask == 0
+        if (isMaskOrZero(M, /*Not=*/true, Q)) {
+          if (Value *NotM =
+                  IC.getFreelyInverted(M, M->hasOneUse(), &IC.Builder)) {
+            M = NotM;
+            return true;
+          }
+        }
+        return false;
+      };
+      if (Check())
+        return true;
+      std::swap(X, M);
+      return Check();
+    }
+    return false;
+  };
+
+  if (!IsLowBitMask())
+    return nullptr;
+
+  // The mask value may be a vector constant that has undefined elements. But
+  // it may not be safe to propagate those undefs into the new compare, so
+  // replace those elements by copying an existing, defined, and safe scalar
+  // constant.
   Type *OpTy = M->getType();
   auto *VecC = dyn_cast<Constant>(M);
   auto *OpVTy = dyn_cast<FixedVectorType>(OpTy);
@@ -4280,8 +4326,6 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst::Predicate Pred, Value *Op0,
     M = Constant::replaceUndefsWith(VecC, SafeReplacementConstant);
   }
 
-  if (NeedsNot)
-    M = IC.Builder.CreateNot(M);
   return IC.Builder.CreateICmp(DstPred, X, M);
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 6a1ef6edeb407..b9ad3a7400792 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -98,6 +98,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   Instruction *visitSub(BinaryOperator &I);
   Instruction *visitFSub(BinaryOperator &I);
   Instruction *visitMul(BinaryOperator &I);
+  Instruction *foldPowiReassoc(BinaryOperator &I);
   Instruction *foldFMulReassoc(BinaryOperator &I);
   Instruction *visitFMul(BinaryOperator &I);
   Instruction *visitURem(BinaryOperator &I);
@@ -757,10 +758,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   void tryToSinkInstructionDbgValues(
       Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
       BasicBlock *DestBlock, SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers);
-  void tryToSinkInstructionDPValues(Instruction *I,
-                                    BasicBlock::iterator InsertPos,
-                                    BasicBlock *SrcBlock, BasicBlock *DestBlock,
-                                    SmallVectorImpl<DPValue *> &DPUsers);
+  void tryToSinkInstructionDbgVariableRecords(
+      Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
+      BasicBlock *DestBlock, SmallVectorImpl<DbgVariableRecord *> &DPUsers);
 
   bool removeInstructionsBeforeUnreachable(Instruction &I);
   void addDeadEdge(BasicBlock *From, BasicBlock *To,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 278be6233f4b8..9d4c271f990d1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -448,6 +448,14 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
   if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
     return SelectInst::Create(X, Op0, ConstantInt::getNullValue(Ty));
 
+  // mul (sext X), Y -> select X, -Y, 0
+  // mul Y, (sext X) -> select X, -Y, 0
+  if (match(&I, m_c_Mul(m_OneUse(m_SExt(m_Value(X))), m_Value(Y))) &&
+      X->getType()->isIntOrIntVectorTy(1))
+    return SelectInst::Create(
+        X, Builder.CreateNeg(Y, "", /*HasNUW=*/false, I.hasNoSignedWrap()),
+        ConstantInt::getNullValue(Op0->getType()));
+
   Constant *ImmC;
   if (match(Op1, m_ImmConstant(ImmC))) {
     // (sext bool X) * C --> X ? -C : 0
@@ -571,6 +579,42 @@ Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) {
   return nullptr;
 }
 
+Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) {
+  auto createPowiExpr = [](BinaryOperator &I, InstCombinerImpl &IC, Value *X,
+                           Value *Y, Value *Z) {
+    InstCombiner::BuilderTy &Builder = IC.Builder;
+    Value *YZ = Builder.CreateAdd(Y, Z);
+    auto *NewPow = Builder.CreateIntrinsic(
+        Intrinsic::powi, {X->getType(), YZ->getType()}, {X, YZ}, &I);
+    return IC.replaceInstUsesWith(I, NewPow);
+  };
+
+  Value *X, *Y, *Z;
+
+  // powi(X, Y) * X --> powi(X, Y+1)
+  // X * powi(X, Y) --> powi(X, Y+1)
+  if (match(&I, m_c_FMul(m_OneUse(m_AllowReassoc(m_Intrinsic<Intrinsic::powi>(
+                             m_Value(X), m_Value(Y)))),
+                         m_Deferred(X)))) {
+    Constant *One = ConstantInt::get(Y->getType(), 1);
+    if (willNotOverflowSignedAdd(Y, One, I))
+      return createPowiExpr(I, *this, X, Y, One);
+  }
+
+  // powi(x, y) * powi(x, z) -> powi(x, y + z)
+  Value *Op0 = I.getOperand(0);
+  Value *Op1 = I.getOperand(1);
+  if (I.isOnlyUserOfAnyOperand() &&
+      match(Op0, m_AllowReassoc(
+                     m_Intrinsic<Intrinsic::powi>(m_Value(X), m_Value(Y)))) &&
+      match(Op1, m_AllowReassoc(m_Intrinsic<Intrinsic::powi>(m_Specific(X),
+                                                             m_Value(Z)))) &&
+      Y->getType() == Z->getType())
+    return createPowiExpr(I, *this, X, Y, Z);
+
+  return nullptr;
+}
+
 Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0);
   Value *Op1 = I.getOperand(1);
@@ -683,6 +727,9 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
     return replaceInstUsesWith(I, Pow);
   }
 
+  if (Instruction *FoldedPowi = foldPowiReassoc(I))
+    return FoldedPowi;
+
   if (I.isOnlyUserOfAnyOperand()) {
     // pow(X, Y) * pow(X, Z) -> pow(X, Y + Z)
     if (match(Op0, m_Intrinsic<Intrinsic::pow>(m_Value(X), m_Value(Y))) &&
@@ -699,16 +746,6 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
       return replaceInstUsesWith(I, NewPow);
     }
 
-    // powi(x, y) * powi(x, z) -> powi(x, y + z)
-    if (match(Op0, m_Intrinsic<Intrinsic::powi>(m_Value(X), m_Value(Y))) &&
-        match(Op1, m_Intrinsic<Intrinsic::powi>(m_Specific(X), m_Value(Z))) &&
-        Y->getType() == Z->getType()) {
-      auto *YZ = Builder.CreateAdd(Y, Z);
-      auto *NewPow = Builder.CreateIntrinsic(
-          Intrinsic::powi, {X->getType(), YZ->getType()}, {X, YZ}, &I);
-      return replaceInstUsesWith(I, NewPow);
-    }
-
     // exp(X) * exp(Y) -> exp(X + Y)
     if (match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X))) &&
         match(Op1, m_Intrinsic<Intrinsic::exp>(m_Value(Y)))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index aee18d770f729..9ab2bd8f70aa1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -536,19 +536,29 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
     // between 0, 1 and -1.
     const APInt *OOpC;
     bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
-    if (!isa<Constant>(OOp) ||
-        (OOpIsAPInt && isSelect01(C->getUniqueInteger(), *OOpC))) {
-      Value *NewSel = Builder.CreateSelect(SI.getCondition(), Swapped ? C : OOp,
-                                           Swapped ? OOp : C, "", &SI);
-      if (isa<FPMathOperator>(&SI))
-        cast<Instruction>(NewSel)->setFastMathFlags(FMF);
-      NewSel->takeName(TVI);
-      BinaryOperator *BO =
-          BinaryOperator::Create(TVI->getOpcode(), FalseVal, NewSel);
-      BO->copyIRFlags(TVI);
-      return BO;
-    }
-    return nullptr;
+    if (isa<Constant>(OOp) &&
+        (!OOpIsAPInt || !isSelect01(C->getUniqueInteger(), *OOpC)))
+      return nullptr;
+
+    // If the false value is a NaN then we have that the floating point math
+    // operation in the transformed code may not preserve the exact NaN
+    // bit-pattern -- e.g. `fadd sNaN, 0.0 -> qNaN`.
+    // This makes the transformation incorrect since the original program would
+    // have preserved the exact NaN bit-pattern.
+    // Avoid the folding if the false value might be a NaN.
+    if (isa<FPMathOperator>(&SI) &&
+        !computeKnownFPClass(FalseVal, FMF, fcNan, &SI).isKnownNeverNaN())
+      return nullptr;
+
+    Value *NewSel = Builder.CreateSelect(SI.getCondition(), Swapped ? C : OOp,
+                                         Swapped ? OOp : C, "", &SI);
+    if (isa<FPMathOperator>(&SI))
+      cast<Instruction>(NewSel)->setFastMathFlags(FMF);
+    NewSel->takeName(TVI);
+    BinaryOperator *BO =
+        BinaryOperator::Create(TVI->getOpcode(), FalseVal, NewSel);
+    BO->copyIRFlags(TVI);
+    return BO;
   };
 
   if (Instruction *R = TryFoldSelectIntoOp(SI, TrueVal, FalseVal, false))
@@ -1191,7 +1201,7 @@ static Value *canonicalizeSPF(ICmpInst &Cmp, Value *TrueVal, Value *FalseVal,
                           match(RHS, m_NSWNeg(m_Specific(LHS)));
     Constant *IntMinIsPoisonC =
         ConstantInt::get(Type::getInt1Ty(Cmp.getContext()), IntMinIsPoison);
-    Instruction *Abs =
+    Value *Abs =
         IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC);
 
     if (SPF == SelectPatternFlavor::SPF_NABS)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 17210b59c5c43..a2669f15c1d7d 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1481,6 +1481,11 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign(
 
   // If we have a constant rhs, see if we can losslessly convert it to an int.
   if (Op1FpC != nullptr) {
+    // Signed + Mul req non-zero
+    if (OpsFromSigned && BO.getOpcode() == Instruction::FMul &&
+        !match(Op1FpC, m_NonZeroFP()))
+      return nullptr;
+
     Constant *Op1IntC = ConstantFoldCastOperand(
         OpsFromSigned ? Instruction::FPToSI : Instruction::FPToUI, Op1FpC,
         IntTy, DL);
@@ -1645,6 +1650,7 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
                                              Value *NewOp, InstCombiner &IC) {
   Instruction *Clone = I.clone();
   Clone->replaceUsesOfWith(SI, NewOp);
+  Clone->dropUBImplyingAttrsAndMetadata();
   IC.InsertNewInstBefore(Clone, SI->getIterator());
   return Clone;
 }
@@ -2597,6 +2603,45 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
     return nullptr;
   }
 
+  // De Morgan's Laws:
+  // (~(A | B)) -> (~A & ~B)
+  // (~(A & B)) -> (~A | ~B)
+  auto TryInvertAndOrUsingDeMorgan = [&](Instruction::BinaryOps Opcode,
+                                         bool IsLogical, Value *A,
+                                         Value *B) -> Value * {
+    bool LocalDoesConsume = DoesConsume;
+    if (!getFreelyInvertedImpl(B, B->hasOneUse(), /*Builder=*/nullptr,
+                               LocalDoesConsume, Depth))
+      return nullptr;
+    if (auto *NotA = getFreelyInvertedImpl(A, A->hasOneUse(), Builder,
+                                           LocalDoesConsume, Depth)) {
+      auto *NotB = getFreelyInvertedImpl(B, B->hasOneUse(), Builder,
+                                         LocalDoesConsume, Depth);
+      DoesConsume = LocalDoesConsume;
+      if (IsLogical)
+        return Builder ? Builder->CreateLogicalOp(Opcode, NotA, NotB) : NonNull;
+      return Builder ? Builder->CreateBinOp(Opcode, NotA, NotB) : NonNull;
+    }
+
+    return nullptr;
+  };
+
+  if (match(V, m_Or(m_Value(A), m_Value(B))))
+    return TryInvertAndOrUsingDeMorgan(Instruction::And, /*IsLogical=*/false, A,
+                                       B);
+
+  if (match(V, m_And(m_Value(A), m_Value(B))))
+    return TryInvertAndOrUsingDeMorgan(Instruction::Or, /*IsLogical=*/false, A,
+                                       B);
+
+  if (match(V, m_LogicalOr(m_Value(A), m_Value(B))))
+    return TryInvertAndOrUsingDeMorgan(Instruction::And, /*IsLogical=*/true, A,
+                                       B);
+
+  if (match(V, m_LogicalAnd(m_Value(A), m_Value(B))))
+    return TryInvertAndOrUsingDeMorgan(Instruction::Or, /*IsLogical=*/true, A,
+                                       B);
+
   return nullptr;
 }
 
@@ -3076,10 +3121,10 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
   // If we are removing an alloca with a dbg.declare, insert dbg.value calls
   // before each store.
   SmallVector<DbgVariableIntrinsic *, 8> DVIs;
-  SmallVector<DPValue *, 8> DPVs;
+  SmallVector<DbgVariableRecord *, 8> DVRs;
   std::unique_ptr<DIBuilder> DIB;
   if (isa<AllocaInst>(MI)) {
-    findDbgUsers(DVIs, &MI, &DPVs);
+    findDbgUsers(DVIs, &MI, &DVRs);
     DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
   }
 
@@ -3119,9 +3164,9 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
         for (auto *DVI : DVIs)
           if (DVI->isAddressOfVariable())
             ConvertDebugDeclareToDebugValue(DVI, SI, *DIB);
-        for (auto *DPV : DPVs)
-          if (DPV->isAddressOfVariable())
-            ConvertDebugDeclareToDebugValue(DPV, SI, *DIB);
+        for (auto *DVR : DVRs)
+          if (DVR->isAddressOfVariable())
+            ConvertDebugDeclareToDebugValue(DVR, SI, *DIB);
       } else {
         // Casts, GEP, or anything else: we're about to delete this instruction,
         // so it can not have any valid uses.
@@ -3166,9 +3211,9 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
     for (auto *DVI : DVIs)
       if (DVI->isAddressOfVariable() || DVI->getExpression()->startsWithDeref())
         DVI->eraseFromParent();
-    for (auto *DPV : DPVs)
-      if (DPV->isAddressOfVariable() || DPV->getExpression()->startsWithDeref())
-        DPV->eraseFromParent();
+    for (auto *DVR : DVRs)
+      if (DVR->isAddressOfVariable() || DVR->getExpression()->startsWithDeref())
+        DVR->eraseFromParent();
 
     return eraseInstFromFunction(MI);
   }
@@ -3835,6 +3880,12 @@ Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) {
     if (Instruction *Res = foldOpIntoPhi(EV, PN))
       return Res;
 
+  // Canonicalize extract (select Cond, TV, FV)
+  // -> select cond, (extract TV), (extract FV)
+  if (auto *SI = dyn_cast<SelectInst>(Agg))
+    if (Instruction *R = FoldOpIntoSelect(EV, SI, /*FoldWithMultiUse=*/true))
+      return R;
+
   // We could simplify extracts from other values. Note that nested extracts may
   // already be simplified implicitly by the above: extract (extract (insert) )
   // will be translated into extract ( insert ( extract ) ) first and then just
@@ -4563,12 +4614,13 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I,
   // mark the location undef: we know it was supposed to receive a new location
   // here, but that computation has been sunk.
   SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
-  SmallVector<DPValue *, 2> DPValues;
-  findDbgUsers(DbgUsers, I, &DPValues);
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
+  findDbgUsers(DbgUsers, I, &DbgVariableRecords);
   if (!DbgUsers.empty())
     tryToSinkInstructionDbgValues(I, InsertPos, SrcBlock, DestBlock, DbgUsers);
-  if (!DPValues.empty())
-    tryToSinkInstructionDPValues(I, InsertPos, SrcBlock, DestBlock, DPValues);
+  if (!DbgVariableRecords.empty())
+    tryToSinkInstructionDbgVariableRecords(I, InsertPos, SrcBlock, DestBlock,
+                                           DbgVariableRecords);
 
   // PS: there are numerous flaws with this behaviour, not least that right now
   // assignments can be re-ordered past other assignments to the same variable
@@ -4641,47 +4693,48 @@ void InstCombinerImpl::tryToSinkInstructionDbgValues(
   }
 }
 
-void InstCombinerImpl::tryToSinkInstructionDPValues(
+void InstCombinerImpl::tryToSinkInstructionDbgVariableRecords(
     Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
-    BasicBlock *DestBlock, SmallVectorImpl<DPValue *> &DPValues) {
-  // Implementation of tryToSinkInstructionDbgValues, but for the DPValue of
-  // variable assignments rather than dbg.values.
-
-  // Fetch all DPValues not already in the destination.
-  SmallVector<DPValue *, 2> DPValuesToSalvage;
-  for (auto &DPV : DPValues)
-    if (DPV->getParent() != DestBlock)
-      DPValuesToSalvage.push_back(DPV);
-
-  // Fetch a second collection, of DPValues in the source block that we're going
-  // to sink.
-  SmallVector<DPValue *> DPValuesToSink;
-  for (DPValue *DPV : DPValuesToSalvage)
-    if (DPV->getParent() == SrcBlock)
-      DPValuesToSink.push_back(DPV);
-
-  // Sort DPValues according to their position in the block. This is a partial
-  // order: DPValues attached to different instructions will be ordered by the
-  // instruction order, but DPValues attached to the same instruction won't
-  // have an order.
-  auto Order = [](DPValue *A, DPValue *B) -> bool {
+    BasicBlock *DestBlock,
+    SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
+  // Implementation of tryToSinkInstructionDbgValues, but for the
+  // DbgVariableRecord of variable assignments rather than dbg.values.
+
+  // Fetch all DbgVariableRecords not already in the destination.
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecordsToSalvage;
+  for (auto &DVR : DbgVariableRecords)
+    if (DVR->getParent() != DestBlock)
+      DbgVariableRecordsToSalvage.push_back(DVR);
+
+  // Fetch a second collection, of DbgVariableRecords in the source block that
+  // we're going to sink.
+  SmallVector<DbgVariableRecord *> DbgVariableRecordsToSink;
+  for (DbgVariableRecord *DVR : DbgVariableRecordsToSalvage)
+    if (DVR->getParent() == SrcBlock)
+      DbgVariableRecordsToSink.push_back(DVR);
+
+  // Sort DbgVariableRecords according to their position in the block. This is a
+  // partial order: DbgVariableRecords attached to different instructions will
+  // be ordered by the instruction order, but DbgVariableRecords attached to the
+  // same instruction won't have an order.
+  auto Order = [](DbgVariableRecord *A, DbgVariableRecord *B) -> bool {
     return B->getInstruction()->comesBefore(A->getInstruction());
   };
-  llvm::stable_sort(DPValuesToSink, Order);
+  llvm::stable_sort(DbgVariableRecordsToSink, Order);
 
   // If there are two assignments to the same variable attached to the same
   // instruction, the ordering between the two assignments is important. Scan
   // for this (rare) case and establish which is the last assignment.
   using InstVarPair = std::pair<const Instruction *, DebugVariable>;
-  SmallDenseMap<InstVarPair, DPValue *> FilterOutMap;
-  if (DPValuesToSink.size() > 1) {
+  SmallDenseMap<InstVarPair, DbgVariableRecord *> FilterOutMap;
+  if (DbgVariableRecordsToSink.size() > 1) {
     SmallDenseMap<InstVarPair, unsigned> CountMap;
     // Count how many assignments to each variable there is per instruction.
-    for (DPValue *DPV : DPValuesToSink) {
+    for (DbgVariableRecord *DVR : DbgVariableRecordsToSink) {
       DebugVariable DbgUserVariable =
-          DebugVariable(DPV->getVariable(), DPV->getExpression(),
-                        DPV->getDebugLoc()->getInlinedAt());
-      CountMap[std::make_pair(DPV->getInstruction(), DbgUserVariable)] += 1;
+          DebugVariable(DVR->getVariable(), DVR->getExpression(),
+                        DVR->getDebugLoc()->getInlinedAt());
+      CountMap[std::make_pair(DVR->getInstruction(), DbgUserVariable)] += 1;
     }
 
     // If there are any instructions with two assignments, add them to the
@@ -4697,74 +4750,74 @@ void InstCombinerImpl::tryToSinkInstructionDPValues(
     // For all instruction/variable pairs needing extra filtering, find the
     // latest assignment.
     for (const Instruction *Inst : DupSet) {
-      for (DPValue &DPV :
-           llvm::reverse(DPValue::filter(Inst->getDbgRecordRange()))) {
+      for (DbgVariableRecord &DVR :
+           llvm::reverse(filterDbgVars(Inst->getDbgRecordRange()))) {
         DebugVariable DbgUserVariable =
-            DebugVariable(DPV.getVariable(), DPV.getExpression(),
-                          DPV.getDebugLoc()->getInlinedAt());
+            DebugVariable(DVR.getVariable(), DVR.getExpression(),
+                          DVR.getDebugLoc()->getInlinedAt());
         auto FilterIt =
             FilterOutMap.find(std::make_pair(Inst, DbgUserVariable));
         if (FilterIt == FilterOutMap.end())
           continue;
         if (FilterIt->second != nullptr)
           continue;
-        FilterIt->second = &DPV;
+        FilterIt->second = &DVR;
       }
     }
   }
 
-  // Perform cloning of the DPValues that we plan on sinking, filter out any
-  // duplicate assignments identified above.
-  SmallVector<DPValue *, 2> DPVClones;
+  // Perform cloning of the DbgVariableRecords that we plan on sinking, filter
+  // out any duplicate assignments identified above.
+  SmallVector<DbgVariableRecord *, 2> DVRClones;
   SmallSet<DebugVariable, 4> SunkVariables;
-  for (DPValue *DPV : DPValuesToSink) {
-    if (DPV->Type == DPValue::LocationType::Declare)
+  for (DbgVariableRecord *DVR : DbgVariableRecordsToSink) {
+    if (DVR->Type == DbgVariableRecord::LocationType::Declare)
       continue;
 
     DebugVariable DbgUserVariable =
-        DebugVariable(DPV->getVariable(), DPV->getExpression(),
-                      DPV->getDebugLoc()->getInlinedAt());
+        DebugVariable(DVR->getVariable(), DVR->getExpression(),
+                      DVR->getDebugLoc()->getInlinedAt());
 
     // For any variable where there were multiple assignments in the same place,
     // ignore all but the last assignment.
     if (!FilterOutMap.empty()) {
-      InstVarPair IVP = std::make_pair(DPV->getInstruction(), DbgUserVariable);
+      InstVarPair IVP = std::make_pair(DVR->getInstruction(), DbgUserVariable);
       auto It = FilterOutMap.find(IVP);
 
       // Filter out.
-      if (It != FilterOutMap.end() && It->second != DPV)
+      if (It != FilterOutMap.end() && It->second != DVR)
         continue;
     }
 
     if (!SunkVariables.insert(DbgUserVariable).second)
       continue;
 
-    if (DPV->isDbgAssign())
+    if (DVR->isDbgAssign())
       continue;
 
-    DPVClones.emplace_back(DPV->clone());
-    LLVM_DEBUG(dbgs() << "CLONE: " << *DPVClones.back() << '\n');
+    DVRClones.emplace_back(DVR->clone());
+    LLVM_DEBUG(dbgs() << "CLONE: " << *DVRClones.back() << '\n');
   }
 
   // Perform salvaging without the clones, then sink the clones.
-  if (DPVClones.empty())
+  if (DVRClones.empty())
     return;
 
-  salvageDebugInfoForDbgValues(*I, {}, DPValuesToSalvage);
+  salvageDebugInfoForDbgValues(*I, {}, DbgVariableRecordsToSalvage);
 
   // The clones are in reverse order of original appearance. Assert that the
   // head bit is set on the iterator as we _should_ have received it via
   // getFirstInsertionPt. Inserting like this will reverse the clone order as
   // we'll repeatedly insert at the head, such as:
-  //   DPV-3 (third insertion goes here)
-  //   DPV-2 (second insertion goes here)
-  //   DPV-1 (first insertion goes here)
-  //   Any-Prior-DPVs
+  //   DVR-3 (third insertion goes here)
+  //   DVR-2 (second insertion goes here)
+  //   DVR-1 (first insertion goes here)
+  //   Any-Prior-DVRs
   //   InsertPtInst
   assert(InsertPos.getHeadBit());
-  for (DPValue *DPVClone : DPVClones) {
-    InsertPos->getParent()->insertDbgRecordBefore(DPVClone, InsertPos);
-    LLVM_DEBUG(dbgs() << "SINK: " << *DPVClone << '\n');
+  for (DbgVariableRecord *DVRClone : DVRClones) {
+    InsertPos->getParent()->insertDbgRecordBefore(DVRClone, InsertPos);
+    LLVM_DEBUG(dbgs() << "SINK: " << *DVRClone << '\n');
   }
 }
 
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 11a5c29c35f70..4bdeb6bbab85a 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -260,6 +260,10 @@ static cl::opt<bool> ClUsePageAliases("hwasan-experimental-use-page-aliases",
 
 namespace {
 
+template <typename T> T optOr(cl::opt<T> &Opt, T Other) {
+  return Opt.getNumOccurrences() ? Opt : Other;
+}
+
 bool shouldUsePageAliases(const Triple &TargetTriple) {
   return ClUsePageAliases && TargetTriple.getArch() == Triple::x86_64;
 }
@@ -269,14 +273,11 @@ bool shouldInstrumentStack(const Triple &TargetTriple) {
 }
 
 bool shouldInstrumentWithCalls(const Triple &TargetTriple) {
-  return ClInstrumentWithCalls.getNumOccurrences()
-             ? ClInstrumentWithCalls
-             : TargetTriple.getArch() == Triple::x86_64;
+  return optOr(ClInstrumentWithCalls, TargetTriple.getArch() == Triple::x86_64);
 }
 
 bool mightUseStackSafetyAnalysis(bool DisableOptimization) {
-  return ClUseStackSafety.getNumOccurrences() ? ClUseStackSafety
-                                              : !DisableOptimization;
+  return optOr(ClUseStackSafety, !DisableOptimization);
 }
 
 bool shouldUseStackSafetyAnalysis(const Triple &TargetTriple,
@@ -296,10 +297,8 @@ class HWAddressSanitizer {
   HWAddressSanitizer(Module &M, bool CompileKernel, bool Recover,
                      const StackSafetyGlobalInfo *SSI)
       : M(M), SSI(SSI) {
-    this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
-    this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0
-                              ? ClEnableKhwasan
-                              : CompileKernel;
+    this->Recover = optOr(ClRecover, Recover);
+    this->CompileKernel = optOr(ClEnableKhwasan, CompileKernel);
     this->Rng =
         ClRandomSkipRate.getNumOccurrences() ? M.createRNG("hwasan") : nullptr;
 
@@ -358,7 +357,6 @@ class HWAddressSanitizer {
   bool instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag,
                        const DominatorTree &DT, const PostDominatorTree &PDT,
                        const LoopInfo &LI);
-  Value *readRegister(IRBuilder<> &IRB, StringRef Name);
   bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
   Value *getNextTagWithCall(IRBuilder<> &IRB);
   Value *getStackBaseTag(IRBuilder<> &IRB);
@@ -374,8 +372,7 @@ class HWAddressSanitizer {
   void instrumentGlobal(GlobalVariable *GV, uint8_t Tag);
   void instrumentGlobals();
 
-  Value *getPC(IRBuilder<> &IRB);
-  Value *getFP(IRBuilder<> &IRB);
+  Value *getCachedFP(IRBuilder<> &IRB);
   Value *getFrameRecordInfo(IRBuilder<> &IRB);
 
   void instrumentPersonalityFunctions();
@@ -411,8 +408,8 @@ class HWAddressSanitizer {
   ShadowMapping Mapping;
 
   Type *VoidTy = Type::getVoidTy(M.getContext());
-  Type *IntptrTy;
-  PointerType *PtrTy;
+  Type *IntptrTy = M.getDataLayout().getIntPtrType(M.getContext());
+  PointerType *PtrTy = PointerType::getUnqual(M.getContext());
   Type *Int8Ty = Type::getInt8Ty(M.getContext());
   Type *Int32Ty = Type::getInt32Ty(M.getContext());
   Type *Int64Ty = Type::getInt64Ty(M.getContext());
@@ -450,7 +447,7 @@ class HWAddressSanitizer {
 
   Value *ShadowBase = nullptr;
   Value *StackBaseTag = nullptr;
-  Value *CachedSP = nullptr;
+  Value *CachedFP = nullptr;
   GlobalValue *ThreadPtrGlobal = nullptr;
 };
 
@@ -595,8 +592,6 @@ void HWAddressSanitizer::createHwasanCtorComdat() {
 /// inserts a call to __hwasan_init to the module's constructor list.
 void HWAddressSanitizer::initializeModule() {
   LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
-  auto &DL = M.getDataLayout();
-
   TargetTriple = Triple(M.getTargetTriple());
 
   // x86_64 currently has two modes:
@@ -614,8 +609,6 @@ void HWAddressSanitizer::initializeModule() {
 
   C = &(M.getContext());
   IRBuilder<> IRB(*C);
-  IntptrTy = IRB.getIntPtrTy(DL);
-  PtrTy = IRB.getPtrTy();
 
   HwasanCtorFunction = nullptr;
 
@@ -625,19 +618,14 @@ void HWAddressSanitizer::initializeModule() {
   bool NewRuntime =
       !TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30);
 
-  UseShortGranules =
-      ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime;
-  OutlinedChecks =
-      (TargetTriple.isAArch64() || TargetTriple.isRISCV64()) &&
-      TargetTriple.isOSBinFormatELF() &&
-      (ClInlineAllChecks.getNumOccurrences() ? !ClInlineAllChecks : !Recover);
+  UseShortGranules = optOr(ClUseShortGranules, NewRuntime);
+  OutlinedChecks = (TargetTriple.isAArch64() || TargetTriple.isRISCV64()) &&
+                   TargetTriple.isOSBinFormatELF() &&
+                   !optOr(ClInlineAllChecks, Recover);
 
-  InlineFastPath =
-      (ClInlineFastPathChecks.getNumOccurrences()
-           ? ClInlineFastPathChecks
-           : !(TargetTriple.isAndroid() ||
-               TargetTriple.isOSFuchsia())); // These platforms may prefer less
-                                             // inlining to reduce binary size.
+  // These platforms may prefer less inlining to reduce binary size.
+  InlineFastPath = optOr(ClInlineFastPathChecks, !(TargetTriple.isAndroid() ||
+                                                   TargetTriple.isOSFuchsia()));
 
   if (ClMatchAllTag.getNumOccurrences()) {
     if (ClMatchAllTag != -1) {
@@ -649,22 +637,17 @@ void HWAddressSanitizer::initializeModule() {
   UseMatchAllCallback = !CompileKernel && MatchAllTag.has_value();
 
   // If we don't have personality function support, fall back to landing pads.
-  InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences()
-                              ? ClInstrumentLandingPads
-                              : !NewRuntime;
+  InstrumentLandingPads = optOr(ClInstrumentLandingPads, !NewRuntime);
 
   if (!CompileKernel) {
     createHwasanCtorComdat();
-    bool InstrumentGlobals =
-        ClGlobals.getNumOccurrences() ? ClGlobals : NewRuntime;
+    bool InstrumentGlobals = optOr(ClGlobals, NewRuntime);
 
     if (InstrumentGlobals && !UsePageAliases)
       instrumentGlobals();
 
     bool InstrumentPersonalityFunctions =
-        ClInstrumentPersonalityFunctions.getNumOccurrences()
-            ? ClInstrumentPersonalityFunctions
-            : NewRuntime;
+        optOr(ClInstrumentPersonalityFunctions, NewRuntime);
     if (InstrumentPersonalityFunctions)
       instrumentPersonalityFunctions();
   }
@@ -1174,10 +1157,10 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
   // Extract some entropy from the stack pointer for the tags.
   // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
   // between functions).
-  Value *StackPointerLong = getFP(IRB);
+  Value *FramePointerLong = getCachedFP(IRB);
   Value *StackTag =
-      applyTagMask(IRB, IRB.CreateXor(StackPointerLong,
-                                      IRB.CreateLShr(StackPointerLong, 20)));
+      applyTagMask(IRB, IRB.CreateXor(FramePointerLong,
+                                      IRB.CreateLShr(FramePointerLong, 20)));
   StackTag->setName("hwasan.stack.base.tag");
   return StackTag;
 }
@@ -1191,9 +1174,9 @@ Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
 }
 
 Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB) {
-  Value *StackPointerLong = getFP(IRB);
+  Value *FramePointerLong = getCachedFP(IRB);
   Value *UARTag =
-      applyTagMask(IRB, IRB.CreateLShr(StackPointerLong, PointerTagShift));
+      applyTagMask(IRB, IRB.CreateLShr(FramePointerLong, PointerTagShift));
 
   UARTag->setName("hwasan.uar.tag");
   return UARTag;
@@ -1252,41 +1235,25 @@ Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
   return nullptr;
 }
 
-Value *HWAddressSanitizer::getPC(IRBuilder<> &IRB) {
-  if (TargetTriple.getArch() == Triple::aarch64)
-    return readRegister(IRB, "pc");
-  return IRB.CreatePtrToInt(IRB.GetInsertBlock()->getParent(), IntptrTy);
-}
-
-Value *HWAddressSanitizer::getFP(IRBuilder<> &IRB) {
-  if (!CachedSP) {
-    // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
-    // first).
-    Function *F = IRB.GetInsertBlock()->getParent();
-    Module *M = F->getParent();
-    auto *GetStackPointerFn = Intrinsic::getDeclaration(
-        M, Intrinsic::frameaddress,
-        IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()));
-    CachedSP = IRB.CreatePtrToInt(
-        IRB.CreateCall(GetStackPointerFn, {Constant::getNullValue(Int32Ty)}),
-        IntptrTy);
-  }
-  return CachedSP;
+Value *HWAddressSanitizer::getCachedFP(IRBuilder<> &IRB) {
+  if (!CachedFP)
+    CachedFP = memtag::getFP(IRB);
+  return CachedFP;
 }
 
 Value *HWAddressSanitizer::getFrameRecordInfo(IRBuilder<> &IRB) {
   // Prepare ring buffer data.
-  Value *PC = getPC(IRB);
-  Value *SP = getFP(IRB);
+  Value *PC = memtag::getPC(TargetTriple, IRB);
+  Value *FP = getCachedFP(IRB);
 
-  // Mix SP and PC.
+  // Mix FP and PC.
   // Assumptions:
   // PC is 0x0000PPPPPPPPPPPP  (48 bits are meaningful, others are zero)
-  // SP is 0xsssssssssssSSSS0  (4 lower bits are zero)
-  // We only really need ~20 lower non-zero bits (SSSS), so we mix like this:
-  //       0xSSSSPPPPPPPPPPPP
-  SP = IRB.CreateShl(SP, 44);
-  return IRB.CreateOr(PC, SP);
+  // FP is 0xfffffffffffFFFF0  (4 lower bits are zero)
+  // We only really need ~20 lower non-zero bits (FFFF), so we mix like this:
+  //       0xFFFFPPPPPPPPPPPP
+  FP = IRB.CreateShl(FP, 44);
+  return IRB.CreateOr(PC, FP);
 }
 
 void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
@@ -1371,38 +1338,24 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
   }
 }
 
-Value *HWAddressSanitizer::readRegister(IRBuilder<> &IRB, StringRef Name) {
-  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
-  Function *ReadRegister =
-      Intrinsic::getDeclaration(M, Intrinsic::read_register, IntptrTy);
-  MDNode *MD = MDNode::get(*C, {MDString::get(*C, Name)});
-  Value *Args[] = {MetadataAsValue::get(*C, MD)};
-  return IRB.CreateCall(ReadRegister, Args);
-}
-
 bool HWAddressSanitizer::instrumentLandingPads(
     SmallVectorImpl<Instruction *> &LandingPadVec) {
   for (auto *LP : LandingPadVec) {
     IRBuilder<> IRB(LP->getNextNonDebugInstruction());
     IRB.CreateCall(
         HwasanHandleVfork,
-        {readRegister(IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp"
-                                                                      : "sp")});
+        {memtag::readRegister(
+            IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp" : "sp")});
   }
   return true;
 }
 
-static bool isLifetimeIntrinsic(Value *V) {
-  auto *II = dyn_cast<IntrinsicInst>(V);
-  return II && II->isLifetimeStartOrEnd();
-}
-
 static DbgAssignIntrinsic *DynCastToDbgAssign(DbgVariableIntrinsic *DVI) {
   return dyn_cast<DbgAssignIntrinsic>(DVI);
 }
 
-static DPValue *DynCastToDbgAssign(DPValue *DPV) {
-  return DPV->isDbgAssign() ? DPV : nullptr;
+static DbgVariableRecord *DynCastToDbgAssign(DbgVariableRecord *DVR) {
+  return DVR->isDbgAssign() ? DVR : nullptr;
 }
 
 bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
@@ -1456,11 +1409,13 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
 
     AI->replaceUsesWithIf(Replacement, [AICast, AILong](const Use &U) {
       auto *User = U.getUser();
-      return User != AILong && User != AICast && !isLifetimeIntrinsic(User);
+      return User != AILong && User != AICast &&
+             !memtag::isLifetimeIntrinsic(User);
     });
 
     // Helper utility for adding DW_OP_LLVM_tag_offset to debug-info records,
-    // abstracted over whether they're intrinsic-stored or DPValue stored.
+    // abstracted over whether they're intrinsic-stored or DbgVariableRecord
+    // stored.
     auto AnnotateDbgRecord = [&](auto *DPtr) {
       // Prepend "tag_offset, N" to the dwarf expression.
       // Tag offset logically applies to the alloca pointer, and it makes sense
@@ -1655,7 +1610,7 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
 
   ShadowBase = nullptr;
   StackBaseTag = nullptr;
-  CachedSP = nullptr;
+  CachedFP = nullptr;
 }
 
 void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) {
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 8ee0bca7e354f..956ebe8fc8b9a 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -43,7 +43,6 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/EscapeEnumerator.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -738,8 +737,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
     Value *Args[] = {Addr,
                      IRB.CreateBitOrPointerCast(SI->getValueOperand(), Ty),
                      createOrdering(&IRB, SI->getOrdering())};
-    CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
-    ReplaceInstWithInst(I, C);
+    IRB.CreateCall(TsanAtomicStore[Idx], Args);
+    SI->eraseFromParent();
   } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
     Value *Addr = RMWI->getPointerOperand();
     int Idx =
@@ -752,11 +751,12 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
     const unsigned ByteSize = 1U << Idx;
     const unsigned BitSize = ByteSize * 8;
     Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
-    Value *Args[] = {Addr,
-                     IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+    Value *Val = RMWI->getValOperand();
+    Value *Args[] = {Addr, IRB.CreateBitOrPointerCast(Val, Ty),
                      createOrdering(&IRB, RMWI->getOrdering())};
-    CallInst *C = CallInst::Create(F, Args);
-    ReplaceInstWithInst(I, C);
+    Value *C = IRB.CreateCall(F, Args);
+    I->replaceAllUsesWith(IRB.CreateBitOrPointerCast(C, Val->getType()));
+    I->eraseFromParent();
   } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
     Value *Addr = CASI->getPointerOperand();
     Type *OrigOldValTy = CASI->getNewValOperand()->getType();
@@ -794,8 +794,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
     FunctionCallee F = FI->getSyncScopeID() == SyncScope::SingleThread
                            ? TsanAtomicSignalFence
                            : TsanAtomicThreadFence;
-    CallInst *C = CallInst::Create(F, Args);
-    ReplaceInstWithInst(I, C);
+    IRB.CreateCall(F, Args);
+    FI->eraseFromParent();
   }
   return true;
 }
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index 4d901310efe52..96ecd7f368a00 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -544,15 +544,16 @@ ADCEChanged AggressiveDeadCodeElimination::removeDeadInstructions() {
   // value of the function, and may therefore be deleted safely.
   // NOTE: We reuse the Worklist vector here for memory efficiency.
   for (Instruction &I : llvm::reverse(instructions(F))) {
-    // With "RemoveDIs" debug-info stored in DPValue objects, debug-info
-    // attached to this instruction, and drop any for scopes that aren't alive,
-    // like the rest of this loop does. Extending support to assignment tracking
-    // is future work.
+    // With "RemoveDIs" debug-info stored in DbgVariableRecord objects,
+    // debug-info attached to this instruction, and drop any for scopes that
+    // aren't alive, like the rest of this loop does. Extending support to
+    // assignment tracking is future work.
     for (DbgRecord &DR : make_early_inc_range(I.getDbgRecordRange())) {
-      // Avoid removing a DPV that is linked to instructions because it holds
+      // Avoid removing a DVR that is linked to instructions because it holds
       // information about an existing store.
-      if (DPValue *DPV = dyn_cast<DPValue>(&DR); DPV && DPV->isDbgAssign())
-        if (!at::getAssignmentInsts(DPV).empty())
+      if (DbgVariableRecord *DVR = dyn_cast<DbgVariableRecord>(&DR);
+          DVR && DVR->isDbgAssign())
+        if (!at::getAssignmentInsts(DVR).empty())
           continue;
       if (AliveScopes.count(DR.getDebugLoc()->getScope()))
         continue;
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 49f8761a13923..68774cf31ce9b 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -162,27 +162,27 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
 
 void ConstantHoistingPass::collectMatInsertPts(
     const RebasedConstantListType &RebasedConstants,
-    SmallVectorImpl<Instruction *> &MatInsertPts) const {
+    SmallVectorImpl<BasicBlock::iterator> &MatInsertPts) const {
   for (const RebasedConstantInfo &RCI : RebasedConstants)
     for (const ConstantUser &U : RCI.Uses)
       MatInsertPts.emplace_back(findMatInsertPt(U.Inst, U.OpndIdx));
 }
 
 /// Find the constant materialization insertion point.
-Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
-                                                   unsigned Idx) const {
+BasicBlock::iterator ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
+                                                           unsigned Idx) const {
   // If the operand is a cast instruction, then we have to materialize the
   // constant before the cast instruction.
   if (Idx != ~0U) {
     Value *Opnd = Inst->getOperand(Idx);
     if (auto CastInst = dyn_cast<Instruction>(Opnd))
       if (CastInst->isCast())
-        return CastInst;
+        return CastInst->getIterator();
   }
 
   // The simple and common case. This also includes constant expressions.
   if (!isa<PHINode>(Inst) && !Inst->isEHPad())
-    return Inst;
+    return Inst->getIterator();
 
   // We can't insert directly before a phi node or an eh pad. Insert before
   // the terminator of the incoming or dominating block.
@@ -191,7 +191,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
   if (Idx != ~0U && isa<PHINode>(Inst)) {
     InsertionBlock = cast<PHINode>(Inst)->getIncomingBlock(Idx);
     if (!InsertionBlock->isEHPad()) {
-      return InsertionBlock->getTerminator();
+      return InsertionBlock->getTerminator()->getIterator();
     }
   } else {
     InsertionBlock = Inst->getParent();
@@ -206,7 +206,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
     IDom = IDom->getIDom();
   }
 
-  return IDom->getBlock()->getTerminator();
+  return IDom->getBlock()->getTerminator()->getIterator();
 }
 
 /// Given \p BBs as input, find another set of BBs which collectively
@@ -314,26 +314,27 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
 }
 
 /// Find an insertion point that dominates all uses.
-SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
+SetVector<BasicBlock::iterator>
+ConstantHoistingPass::findConstantInsertionPoint(
     const ConstantInfo &ConstInfo,
-    const ArrayRef<Instruction *> MatInsertPts) const {
+    const ArrayRef<BasicBlock::iterator> MatInsertPts) const {
   assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
   // Collect all basic blocks.
   SetVector<BasicBlock *> BBs;
-  SetVector<Instruction *> InsertPts;
+  SetVector<BasicBlock::iterator> InsertPts;
 
-  for (Instruction *MatInsertPt : MatInsertPts)
+  for (BasicBlock::iterator MatInsertPt : MatInsertPts)
     BBs.insert(MatInsertPt->getParent());
 
   if (BBs.count(Entry)) {
-    InsertPts.insert(&Entry->front());
+    InsertPts.insert(Entry->begin());
     return InsertPts;
   }
 
   if (BFI) {
     findBestInsertionSet(*DT, *BFI, Entry, BBs);
     for (BasicBlock *BB : BBs)
-      InsertPts.insert(&*BB->getFirstInsertionPt());
+      InsertPts.insert(BB->getFirstInsertionPt());
     return InsertPts;
   }
 
@@ -343,7 +344,7 @@ SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
     BB2 = BBs.pop_back_val();
     BB = DT->findNearestCommonDominator(BB1, BB2);
     if (BB == Entry) {
-      InsertPts.insert(&Entry->front());
+      InsertPts.insert(Entry->begin());
       return InsertPts;
     }
     BBs.insert(BB);
@@ -363,6 +364,9 @@ SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
 void ConstantHoistingPass::collectConstantCandidates(
     ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
     ConstantInt *ConstInt) {
+  if (ConstInt->getType()->isVectorTy())
+    return;
+
   InstructionCost Cost;
   // Ask the target about the cost of materializing the constant for the given
   // instruction and operand index.
@@ -761,11 +765,13 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
       Mat = GetElementPtrInst::Create(Type::getInt8Ty(*Ctx), Base, Adj->Offset,
                                       "mat_gep", Adj->MatInsertPt);
       // Hide it behind a bitcast.
-      Mat = new BitCastInst(Mat, Adj->Ty, "mat_bitcast", Adj->MatInsertPt);
+      Mat = new BitCastInst(Mat, Adj->Ty, "mat_bitcast",
+                            Adj->MatInsertPt->getIterator());
     } else
       // Constant being rebased is a ConstantInt.
-      Mat = BinaryOperator::Create(Instruction::Add, Base, Adj->Offset,
-                                   "const_mat", Adj->MatInsertPt);
+      Mat =
+          BinaryOperator::Create(Instruction::Add, Base, Adj->Offset,
+                                 "const_mat", Adj->MatInsertPt->getIterator());
 
     LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
                       << " + " << *Adj->Offset << ") in BB "
@@ -816,7 +822,8 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
 
     // Aside from constant GEPs, only constant cast expressions are collected.
     assert(ConstExpr->isCast() && "ConstExpr should be a cast");
-    Instruction *ConstExprInst = ConstExpr->getAsInstruction(Adj->MatInsertPt);
+    Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+    ConstExprInst->insertBefore(Adj->MatInsertPt);
     ConstExprInst->setOperand(0, Mat);
 
     // Use the same debug location as the instruction we are about to update.
@@ -842,9 +849,9 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
   SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec =
       BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
   for (const consthoist::ConstantInfo &ConstInfo : ConstInfoVec) {
-    SmallVector<Instruction *, 4> MatInsertPts;
+    SmallVector<BasicBlock::iterator, 4> MatInsertPts;
     collectMatInsertPts(ConstInfo.RebasedConstants, MatInsertPts);
-    SetVector<Instruction *> IPSet =
+    SetVector<BasicBlock::iterator> IPSet =
         findConstantInsertionPoint(ConstInfo, MatInsertPts);
     // We can have an empty set if the function contains unreachable blocks.
     if (IPSet.empty())
@@ -853,7 +860,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
     unsigned UsesNum = 0;
     unsigned ReBasesNum = 0;
     unsigned NotRebasedNum = 0;
-    for (Instruction *IP : IPSet) {
+    for (const BasicBlock::iterator &IP : IPSet) {
       // First, collect constants depending on this IP of the base.
       UsesNum = 0;
       SmallVector<UserAdjustment, 4> ToBeRebased;
@@ -861,7 +868,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
       for (auto const &RCI : ConstInfo.RebasedConstants) {
         UsesNum += RCI.Uses.size();
         for (auto const &U : RCI.Uses) {
-          Instruction *MatInsertPt = MatInsertPts[MatCtr++];
+          const BasicBlock::iterator &MatInsertPt = MatInsertPts[MatCtr++];
           BasicBlock *OrigMatInsertBB = MatInsertPt->getParent();
           // If Base constant is to be inserted in multiple places,
           // generate rebase for U using the Base dominating U.
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 85d4065286e41..1caed93b1b668 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -65,6 +65,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CFG.h"
@@ -95,6 +96,11 @@ static cl::opt<bool>
                     cl::desc("View the CFG before DFA Jump Threading"),
                     cl::Hidden, cl::init(false));
 
+static cl::opt<bool> EarlyExitHeuristic(
+    "dfa-early-exit-heuristic",
+    cl::desc("Exit early if an unpredictable value come from the same loop"),
+    cl::Hidden, cl::init(true));
+
 static cl::opt<unsigned> MaxPathLength(
     "dfa-max-path-length",
     cl::desc("Max number of blocks searched to find a threading path"),
@@ -131,9 +137,9 @@ void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
 
 class DFAJumpThreading {
 public:
-  DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT,
+  DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
                    TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE)
-      : AC(AC), DT(DT), TTI(TTI), ORE(ORE) {}
+      : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {}
 
   bool run(Function &F);
 
@@ -161,6 +167,7 @@ class DFAJumpThreading {
 
   AssumptionCache *AC;
   DominatorTree *DT;
+  LoopInfo *LI;
   TargetTransformInfo *TTI;
   OptimizationRemarkEmitter *ORE;
 };
@@ -378,7 +385,8 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) {
 #endif
 
 struct MainSwitch {
-  MainSwitch(SwitchInst *SI, OptimizationRemarkEmitter *ORE) {
+  MainSwitch(SwitchInst *SI, LoopInfo *LI, OptimizationRemarkEmitter *ORE)
+      : LI(LI) {
     if (isCandidate(SI)) {
       Instr = SI;
     } else {
@@ -402,7 +410,7 @@ struct MainSwitch {
   ///
   /// Also, collect select instructions to unfold.
   bool isCandidate(const SwitchInst *SI) {
-    std::deque<Value *> Q;
+    std::deque<std::pair<Value *, BasicBlock *>> Q;
     SmallSet<Value *, 16> SeenValues;
     SelectInsts.clear();
 
@@ -411,22 +419,29 @@ struct MainSwitch {
     if (!isa<PHINode>(SICond))
       return false;
 
-    addToQueue(SICond, Q, SeenValues);
+    // The switch must be in a loop.
+    const Loop *L = LI->getLoopFor(SI->getParent());
+    if (!L)
+      return false;
+
+    addToQueue(SICond, nullptr, Q, SeenValues);
 
     while (!Q.empty()) {
-      Value *Current = Q.front();
+      Value *Current = Q.front().first;
+      BasicBlock *CurrentIncomingBB = Q.front().second;
       Q.pop_front();
 
       if (auto *Phi = dyn_cast<PHINode>(Current)) {
-        for (Value *Incoming : Phi->incoming_values()) {
-          addToQueue(Incoming, Q, SeenValues);
+        for (BasicBlock *IncomingBB : Phi->blocks()) {
+          Value *Incoming = Phi->getIncomingValueForBlock(IncomingBB);
+          addToQueue(Incoming, IncomingBB, Q, SeenValues);
         }
         LLVM_DEBUG(dbgs() << "\tphi: " << *Phi << "\n");
       } else if (SelectInst *SelI = dyn_cast<SelectInst>(Current)) {
         if (!isValidSelectInst(SelI))
           return false;
-        addToQueue(SelI->getTrueValue(), Q, SeenValues);
-        addToQueue(SelI->getFalseValue(), Q, SeenValues);
+        addToQueue(SelI->getTrueValue(), CurrentIncomingBB, Q, SeenValues);
+        addToQueue(SelI->getFalseValue(), CurrentIncomingBB, Q, SeenValues);
         LLVM_DEBUG(dbgs() << "\tselect: " << *SelI << "\n");
         if (auto *SelIUse = dyn_cast<PHINode>(SelI->user_back()))
           SelectInsts.push_back(SelectInstToUnfold(SelI, SelIUse));
@@ -439,6 +454,18 @@ struct MainSwitch {
         // initial switch values that can be ignored (they will hit the
         // unthreaded switch) but this assumption will get checked later after
         // paths have been enumerated (in function getStateDefMap).
+
+        // If the unpredictable value comes from the same inner loop it is
+        // likely that it will also be on the enumerated paths, causing us to
+        // exit after we have enumerated all the paths. This heuristic save
+        // compile time because a search for all the paths can become expensive.
+        if (EarlyExitHeuristic &&
+            L->contains(LI->getLoopFor(CurrentIncomingBB))) {
+          LLVM_DEBUG(dbgs()
+                     << "\tExiting early due to unpredictability heuristic.\n");
+          return false;
+        }
+
         continue;
       }
     }
@@ -446,11 +473,12 @@ struct MainSwitch {
     return true;
   }
 
-  void addToQueue(Value *Val, std::deque<Value *> &Q,
+  void addToQueue(Value *Val, BasicBlock *BB,
+                  std::deque<std::pair<Value *, BasicBlock *>> &Q,
                   SmallSet<Value *, 16> &SeenValues) {
     if (SeenValues.contains(Val))
       return;
-    Q.push_back(Val);
+    Q.push_back({Val, BB});
     SeenValues.insert(Val);
   }
 
@@ -488,6 +516,7 @@ struct MainSwitch {
     return true;
   }
 
+  LoopInfo *LI;
   SwitchInst *Instr = nullptr;
   SmallVector<SelectInstToUnfold, 4> SelectInsts;
 };
@@ -1262,7 +1291,7 @@ bool DFAJumpThreading::run(Function &F) {
 
     LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName()
                       << " is a candidate\n");
-    MainSwitch Switch(SI, ORE);
+    MainSwitch Switch(SI, LI, ORE);
 
     if (!Switch.getInstr())
       continue;
@@ -1315,10 +1344,11 @@ PreservedAnalyses DFAJumpThreadingPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   OptimizationRemarkEmitter ORE(&F);
 
-  if (!DFAJumpThreading(&AC, &DT, &TTI, &ORE).run(F))
+  if (!DFAJumpThreading(&AC, &DT, &LI, &TTI, &ORE).run(F))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 392e6ad5a66bb..bfc8bd5970bf2 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -526,7 +526,8 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
   // returned by getAssignmentMarkers so save a copy of the markers to iterate
   // over.
   auto LinkedRange = at::getAssignmentMarkers(Inst);
-  SmallVector<DPValue *> LinkedDPVAssigns = at::getDPVAssignmentMarkers(Inst);
+  SmallVector<DbgVariableRecord *> LinkedDVRAssigns =
+      at::getDVRAssignmentMarkers(Inst);
   SmallVector<DbgAssignIntrinsic *> Linked(LinkedRange.begin(),
                                            LinkedRange.end());
   auto InsertAssignForOverlap = [&](auto *Assign) {
@@ -554,7 +555,7 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
     NewAssign->setKillAddress();
   };
   for_each(Linked, InsertAssignForOverlap);
-  for_each(LinkedDPVAssigns, InsertAssignForOverlap);
+  for_each(LinkedDVRAssigns, InsertAssignForOverlap);
 }
 
 static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 6ad4be169b589..ccca8bcc1a56a 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -311,7 +311,7 @@ void Float2IntPass::walkForwards() {
 }
 
 // If there is a valid transform to be done, do it.
-bool Float2IntPass::validateAndTransform(const DataLayout &DL) {
+bool Float2IntPass::validateAndTransform() {
   bool MadeChange = false;
 
   // Iterate over every disjoint partition of the def-use graph.
@@ -376,24 +376,16 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) {
       LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
       continue;
     }
-
-    // OK, R is known to be representable.
-    // Pick the smallest legal type that will fit.
-    Type *Ty = DL.getSmallestLegalIntType(*Ctx, MinBW);
-    if (!Ty) {
-      // Every supported target supports 64-bit and 32-bit integers,
-      // so fallback to a 32 or 64-bit integer if the value fits.
-      if (MinBW <= 32) {
-        Ty = Type::getInt32Ty(*Ctx);
-      } else if (MinBW <= 64) {
-        Ty = Type::getInt64Ty(*Ctx);
-      } else {
-        LLVM_DEBUG(dbgs() << "F2I: Value requires more than bits to represent "
-                             "than the target supports!\n");
-        continue;
-      }
+    if (MinBW > 64) {
+      LLVM_DEBUG(
+          dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+      continue;
     }
 
+    // OK, R is known to be representable. Now pick a type for it.
+    // FIXME: Pick the smallest legal type that will fit.
+    Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+
     for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
          MI != ME; ++MI)
       convert(*MI, Ty);
@@ -499,8 +491,7 @@ bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
   walkBackwards();
   walkForwards();
 
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  bool Modified = validateAndTransform(DL);
+  bool Modified = validateAndTransform();
   if (Modified)
     cleanup();
   return Modified;
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 1058a015017e4..fd68359525707 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -401,8 +401,8 @@ static bool replaceFoldableUses(Instruction *Cond, Value *ToVal,
     Changed |= replaceNonLocalUsesWith(Cond, ToVal);
   for (Instruction &I : reverse(*KnownAtEndOfBB)) {
     // Replace any debug-info record users of Cond with ToVal.
-    for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange()))
-      DPV.replaceVariableLocationOp(Cond, ToVal, true);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      DVR.replaceVariableLocationOp(Cond, ToVal, true);
 
     // Reached the Cond whose uses we are trying to replace, so there are no
     // more uses.
@@ -1955,7 +1955,7 @@ void JumpThreadingPass::updateSSA(
   SSAUpdater SSAUpdate;
   SmallVector<Use *, 16> UsesToRename;
   SmallVector<DbgValueInst *, 4> DbgValues;
-  SmallVector<DPValue *, 4> DPValues;
+  SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
 
   for (Instruction &I : *BB) {
     // Scan all uses of this instruction to see if it is used outside of its
@@ -1972,16 +1972,16 @@ void JumpThreadingPass::updateSSA(
     }
 
     // Find debug values outside of the block
-    findDbgValues(DbgValues, &I, &DPValues);
+    findDbgValues(DbgValues, &I, &DbgVariableRecords);
     llvm::erase_if(DbgValues, [&](const DbgValueInst *DbgVal) {
       return DbgVal->getParent() == BB;
     });
-    llvm::erase_if(DPValues, [&](const DPValue *DPVal) {
-      return DPVal->getParent() == BB;
+    llvm::erase_if(DbgVariableRecords, [&](const DbgVariableRecord *DbgVarRec) {
+      return DbgVarRec->getParent() == BB;
     });
 
     // If there are no uses outside the block, we're done with this instruction.
-    if (UsesToRename.empty() && DbgValues.empty() && DPValues.empty())
+    if (UsesToRename.empty() && DbgValues.empty() && DbgVariableRecords.empty())
       continue;
     LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
 
@@ -1994,11 +1994,11 @@ void JumpThreadingPass::updateSSA(
 
     while (!UsesToRename.empty())
       SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
-    if (!DbgValues.empty() || !DPValues.empty()) {
+    if (!DbgValues.empty() || !DbgVariableRecords.empty()) {
       SSAUpdate.UpdateDebugValues(&I, DbgValues);
-      SSAUpdate.UpdateDebugValues(&I, DPValues);
+      SSAUpdate.UpdateDebugValues(&I, DbgVariableRecords);
       DbgValues.clear();
-      DPValues.clear();
+      DbgVariableRecords.clear();
     }
 
     LLVM_DEBUG(dbgs() << "\n");
@@ -2041,11 +2041,11 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
     return true;
   };
 
-  // Duplicate implementation of the above dbg.value code, using DPValues
-  // instead.
-  auto RetargetDPValueIfPossible = [&](DPValue *DPV) {
+  // Duplicate implementation of the above dbg.value code, using
+  // DbgVariableRecords instead.
+  auto RetargetDbgVariableRecordIfPossible = [&](DbgVariableRecord *DVR) {
     SmallSet<std::pair<Value *, Value *>, 16> OperandsToRemap;
-    for (auto *Op : DPV->location_ops()) {
+    for (auto *Op : DVR->location_ops()) {
       Instruction *OpInst = dyn_cast<Instruction>(Op);
       if (!OpInst)
         continue;
@@ -2056,7 +2056,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
     }
 
     for (auto &[OldOp, MappedOp] : OperandsToRemap)
-      DPV->replaceVariableLocationOp(OldOp, MappedOp);
+      DVR->replaceVariableLocationOp(OldOp, MappedOp);
   };
 
   BasicBlock *RangeBB = BI->getParent();
@@ -2080,9 +2080,9 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
   cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context);
 
   auto CloneAndRemapDbgInfo = [&](Instruction *NewInst, Instruction *From) {
-    auto DPVRange = NewInst->cloneDebugInfoFrom(From);
-    for (DPValue &DPV : DPValue::filter(DPVRange))
-      RetargetDPValueIfPossible(&DPV);
+    auto DVRRange = NewInst->cloneDebugInfoFrom(From);
+    for (DbgVariableRecord &DVR : filterDbgVars(DVRRange))
+      RetargetDbgVariableRecordIfPossible(&DVR);
   };
 
   // Clone the non-phi instructions of the source basic block into NewBB,
@@ -2109,15 +2109,15 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
       }
   }
 
-  // There may be DPValues on the terminator, clone directly from marker
-  // to marker as there isn't an instruction there.
+  // There may be DbgVariableRecords on the terminator, clone directly from
+  // marker to marker as there isn't an instruction there.
   if (BE != RangeBB->end() && BE->hasDbgRecords()) {
     // Dump them at the end.
     DPMarker *Marker = RangeBB->getMarker(BE);
     DPMarker *EndMarker = NewBB->createMarker(NewBB->end());
-    auto DPVRange = EndMarker->cloneDebugInfoFrom(Marker, std::nullopt);
-    for (DPValue &DPV : DPValue::filter(DPVRange))
-      RetargetDPValueIfPossible(&DPV);
+    auto DVRRange = EndMarker->cloneDebugInfoFrom(Marker, std::nullopt);
+    for (DbgVariableRecord &DVR : filterDbgVars(DVRRange))
+      RetargetDbgVariableRecordIfPossible(&DVR);
   }
 
   return ValueMapping;
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 40bc16f9b5751..e50413de46b1b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2701,6 +2701,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
 
   // First, we need to make sure we should do the transformation.
   SmallVector<Use *> Changes;
+  SmallVector<BinaryOperator *> Adds;
   SmallVector<BinaryOperator *> Worklist;
   if (BinaryOperator *VariantBinOp = dyn_cast<BinaryOperator>(VariantOp))
     Worklist.push_back(VariantBinOp);
@@ -2713,6 +2714,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
         isa<BinaryOperator>(BO->getOperand(1))) {
       Worklist.push_back(cast<BinaryOperator>(BO->getOperand(0)));
       Worklist.push_back(cast<BinaryOperator>(BO->getOperand(1)));
+      Adds.push_back(BO);
       continue;
     }
     if (!isReassociableOp(BO, Instruction::Mul, Instruction::FMul) ||
@@ -2735,6 +2737,12 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
   if (Changes.empty())
     return false;
 
+  // Drop the poison flags for any adds we looked through.
+  if (I.getType()->isIntOrIntVectorTy()) {
+    for (auto *Add : Adds)
+      Add->dropPoisonGeneratingFlags();
+  }
+
   // We know we should do it so let's do the transformation.
   auto *Preheader = L.getLoopPreheader();
   assert(Preheader && "Loop is not in simplify form?");
@@ -2743,9 +2751,11 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
     assert(L.isLoopInvariant(U->get()));
     Instruction *Ins = cast<Instruction>(U->getUser());
     Value *Mul;
-    if (I.getType()->isIntOrIntVectorTy())
+    if (I.getType()->isIntOrIntVectorTy()) {
       Mul = Builder.CreateMul(U->get(), Factor, "factor.op.mul");
-    else
+      // Drop the poison flags on the original multiply.
+      Ins->dropPoisonGeneratingFlags();
+    } else
       Mul = Builder.CreateFMulFMF(U->get(), Factor, Ins, "factor.op.fmul");
     U->set(Mul);
   }
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index c4e1a0db8b323..ec42e2d6e193a 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -6367,10 +6367,10 @@ struct DVIRecoveryRec {
   DVIRecoveryRec(DbgValueInst *DbgValue)
       : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
         HadLocationArgList(false) {}
-  DVIRecoveryRec(DPValue *DPV)
-      : DbgRef(DPV), Expr(DPV->getExpression()), HadLocationArgList(false) {}
+  DVIRecoveryRec(DbgVariableRecord *DVR)
+      : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
 
-  PointerUnion<DbgValueInst *, DPValue *> DbgRef;
+  PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef;
   DIExpression *Expr;
   bool HadLocationArgList;
   SmallVector<WeakVH, 2> LocationOps;
@@ -6466,7 +6466,7 @@ static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
   if (isa<DbgValueInst *>(DVIRec.DbgRef))
     UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
   else
-    UpdateDbgValueInstImpl(cast<DPValue *>(DVIRec.DbgRef));
+    UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
 }
 
 /// Cached location ops may be erased during LSR, in which case a poison is
@@ -6512,7 +6512,7 @@ static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
   if (isa<DbgValueInst *>(DVIRec.DbgRef))
     RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
   else
-    RestorePreTransformStateImpl(cast<DPValue *>(DVIRec.DbgRef));
+    RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
 }
 
 static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
@@ -6522,7 +6522,7 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
 
   if (isa<DbgValueInst *>(DVIRec.DbgRef)
           ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
-          : !cast<DPValue *>(DVIRec.DbgRef)->isKillLocation())
+          : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
     return false;
 
   // LSR may have caused several changes to the dbg.value in the failed salvage
@@ -6620,7 +6620,7 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
                       << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
   else
     LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
-                      << *cast<DPValue *>(DVIRec.DbgRef) << "\n");
+                      << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
   return true;
 }
 
@@ -6711,9 +6711,9 @@ static void DbgGatherSalvagableDVI(
         SalvageableDVISCEVs.push_back(std::move(NewRec));
         return true;
       };
-      for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange())) {
-        if (DPV.isDbgValue() || DPV.isDbgAssign())
-          ProcessDbgValue(&DPV);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        if (DVR.isDbgValue() || DVR.isDbgAssign())
+          ProcessDbgValue(&DVR);
       }
       auto DVI = dyn_cast<DbgValueInst>(&I);
       if (!DVI)
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 67c011b747acf..e991296bd2fb0 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -19,6 +19,7 @@
 
 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -990,12 +991,15 @@ class LowerMatrixIntrinsics {
     bool Changed = false;
     SmallVector<CallInst *, 16> MaybeFusableInsts;
     SmallVector<Instruction *, 16> MatrixInsts;
+    SmallVector<IntrinsicInst *, 16> LifetimeEnds;
 
     // First, collect all instructions with shape information and candidates for
     // fusion (currently only matrix multiplies).
     ReversePostOrderTraversal<Function *> RPOT(&Func);
     for (auto *BB : RPOT)
       for (Instruction &I : *BB) {
+        if (match(&I, m_Intrinsic<Intrinsic::lifetime_end>()))
+          LifetimeEnds.push_back(cast<IntrinsicInst>(&I));
         if (ShapeMap.find(&I) == ShapeMap.end())
           continue;
         if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
@@ -1010,7 +1014,7 @@ class LowerMatrixIntrinsics {
 
     // Third, try to fuse candidates.
     for (CallInst *CI : MaybeFusableInsts)
-      LowerMatrixMultiplyFused(CI, FusedInsts);
+      LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds);
 
     Changed = !FusedInsts.empty();
 
@@ -1856,8 +1860,10 @@ class LowerMatrixIntrinsics {
   ///
   /// Call finalizeLowering on lowered instructions.  Instructions that are
   /// completely eliminated by fusion are added to \p FusedInsts.
-  void LowerMatrixMultiplyFused(CallInst *MatMul,
-                                SmallPtrSetImpl<Instruction *> &FusedInsts) {
+  void
+  LowerMatrixMultiplyFused(CallInst *MatMul,
+                           SmallPtrSetImpl<Instruction *> &FusedInsts,
+                           SmallVector<IntrinsicInst *, 16> &LifetimeEnds) {
     if (!FuseMatrix || !DT)
       return;
 
@@ -1946,6 +1952,55 @@ class LowerMatrixIntrinsics {
       for (Instruction *I : ToHoist)
         I->moveBefore(MatMul);
 
+      // Deal with lifetime.end calls that might be between Load0/Load1 and the
+      // store. To avoid introducing loads to dead objects (i.e. after the
+      // lifetime has been termined by @llvm.lifetime.end), either sink them
+      // after the store if in the same block, or remove the lifetime.end marker
+      // otherwise. This might pessimize further optimizations, by extending the
+      // lifetime of the object until the function returns, but should be
+      // conservatively correct.
+      MemoryLocation Load0Loc = MemoryLocation::get(LoadOp0);
+      MemoryLocation Load1Loc = MemoryLocation::get(LoadOp1);
+      BasicBlock *StoreParent = Store->getParent();
+      bool FusableOpsInSameBlock = LoadOp0->getParent() == StoreParent &&
+                                   LoadOp1->getParent() == StoreParent;
+      for (unsigned Idx = 0; Idx != LifetimeEnds.size();) {
+        IntrinsicInst *End = LifetimeEnds[Idx];
+        auto Inc = make_scope_exit([&Idx]() { Idx++; });
+        // If the lifetime.end is guaranteed to be before the loads or after the
+        // store, it won't interfere with fusion.
+        if (DT->dominates(End, LoadOp0) && DT->dominates(End, LoadOp1))
+          continue;
+        if (DT->dominates(Store, End))
+          continue;
+        // If all fusable ops are in the same block and the lifetime.end is in a
+        // different block, it won't interfere with fusion.
+        if (FusableOpsInSameBlock && End->getParent() != StoreParent)
+          continue;
+
+        // If the loads don't alias the lifetime.end, it won't interfere with
+        // fusion.
+        MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 1, nullptr);
+        if (!EndLoc.Ptr)
+          continue;
+        if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc))
+          continue;
+
+        // If both lifetime.end and the store are in the same block, extend the
+        // lifetime until after the store, so the new lifetime covers the loads
+        // we introduce later.
+        if (End->getParent() == StoreParent) {
+          End->moveAfter(Store);
+          continue;
+        }
+
+        // Otherwise remove the conflicting lifetime.end marker.
+        ToRemove.push_back(End);
+        std::swap(LifetimeEnds[Idx], LifetimeEnds.back());
+        LifetimeEnds.pop_back();
+        Inc.release();
+      }
+
       emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
       return;
     }
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index e238f311a15bc..096c6d1b1fad2 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -319,17 +319,17 @@ static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) {
   return DebugVariable(DVI->getVariable(), std::nullopt,
                        DVI->getDebugLoc().getInlinedAt());
 }
-static DebugVariable getAggregateVariable(DPValue *DPV) {
-  return DebugVariable(DPV->getVariable(), std::nullopt,
-                       DPV->getDebugLoc().getInlinedAt());
+static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
+  return DebugVariable(DVR->getVariable(), std::nullopt,
+                       DVR->getDebugLoc().getInlinedAt());
 }
 
 /// Helpers for handling new and old debug info modes in migrateDebugInfo.
 /// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based
 /// on the \p Unused parameter type.
-DPValue *UnwrapDbgInstPtr(DbgInstPtr P, DPValue *Unused) {
+DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) {
   (void)Unused;
-  return static_cast<DPValue *>(cast<DbgRecord *>(P));
+  return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(P));
 }
 DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) {
   (void)Unused;
@@ -356,9 +356,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
                              Instruction *Inst, Value *Dest, Value *Value,
                              const DataLayout &DL) {
   auto MarkerRange = at::getAssignmentMarkers(OldInst);
-  auto DPVAssignMarkerRange = at::getDPVAssignmentMarkers(OldInst);
+  auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
   // Nothing to do if OldInst has no linked dbg.assign intrinsics.
-  if (MarkerRange.empty() && DPVAssignMarkerRange.empty())
+  if (MarkerRange.empty() && DVRAssignMarkerRange.empty())
     return;
 
   LLVM_DEBUG(dbgs() << "  migrateDebugInfo\n");
@@ -379,9 +379,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
   for (auto *DAI : at::getAssignmentMarkers(OldAlloca))
     BaseFragments[getAggregateVariable(DAI)] =
         DAI->getExpression()->getFragmentInfo();
-  for (auto *DPV : at::getDPVAssignmentMarkers(OldAlloca))
-    BaseFragments[getAggregateVariable(DPV)] =
-        DPV->getExpression()->getFragmentInfo();
+  for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
+    BaseFragments[getAggregateVariable(DVR)] =
+        DVR->getExpression()->getFragmentInfo();
 
   // The new inst needs a DIAssignID unique metadata tag (if OldInst has
   // one). It shouldn't already have one: assert this assumption.
@@ -488,7 +488,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
   };
 
   for_each(MarkerRange, MigrateDbgAssign);
-  for_each(DPVAssignMarkerRange, MigrateDbgAssign);
+  for_each(DVRAssignMarkerRange, MigrateDbgAssign);
 }
 
 namespace {
@@ -3195,7 +3195,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
       // emit dbg.assign intrinsics for mem intrinsics storing through non-
       // constant geps, or storing a variable number of bytes.
       assert(at::getAssignmentMarkers(&II).empty() &&
-             at::getDPVAssignmentMarkers(&II).empty() &&
+             at::getDVRAssignmentMarkers(&II).empty() &&
              "AT: Unexpected link to non-const GEP");
       deleteIfTriviallyDead(OldPtr);
       return false;
@@ -3350,7 +3350,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
             DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
         };
         for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress);
-        for_each(at::getDPVAssignmentMarkers(&II), UpdateAssignAddress);
+        for_each(at::getDVRAssignmentMarkers(&II), UpdateAssignAddress);
         II.setDest(AdjustedPtr);
         II.setDestAlignment(SliceAlign);
       } else {
@@ -3939,7 +3939,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
                          DL);
       } else {
         assert(at::getAssignmentMarkers(Store).empty() &&
-               at::getDPVAssignmentMarkers(Store).empty() &&
+               at::getDVRAssignmentMarkers(Store).empty() &&
                "AT: unexpected debug.assign linked to store through "
                "unbounded GEP");
       }
@@ -5034,14 +5034,14 @@ static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig,
   LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
   (void)NewAssign;
 }
-static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr,
-                             DIExpression *NewFragmentExpr,
+static void insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig,
+                             AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
                              Instruction *BeforeInst) {
   (void)DIB;
   if (Orig->isDbgDeclare()) {
-    DPValue *DPV = DPValue::createDPVDeclare(
+    DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
-    BeforeInst->getParent()->insertDbgRecordBefore(DPV,
+    BeforeInst->getParent()->insertDbgRecordBefore(DVR,
                                                    BeforeInst->getIterator());
     return;
   }
@@ -5049,10 +5049,10 @@ static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr,
     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                          DIAssignID::getDistinct(NewAddr->getContext()));
   }
-  DPValue *NewAssign = DPValue::createLinkedDPVAssign(
+  DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
       Orig->getAddressExpression(), Orig->getDebugLoc());
-  LLVM_DEBUG(dbgs() << "Created new DPVAssign: " << *NewAssign << "\n");
+  LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
   (void)NewAssign;
 }
 
@@ -5220,7 +5220,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
           OldDII->eraseFromParent();
       };
       for_each(findDbgDeclares(Fragment.Alloca), RemoveOne);
-      for_each(findDPVDeclares(Fragment.Alloca), RemoveOne);
+      for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
 
       insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, FragmentExpr, &AI);
     }
@@ -5229,9 +5229,9 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   // Migrate debug information from the old alloca to the new alloca(s)
   // and the individual partitions.
   for_each(findDbgDeclares(&AI), MigrateOne);
-  for_each(findDPVDeclares(&AI), MigrateOne);
+  for_each(findDVRDeclares(&AI), MigrateOne);
   for_each(at::getAssignmentMarkers(&AI), MigrateOne);
-  for_each(at::getDPVAssignmentMarkers(&AI), MigrateOne);
+  for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
 
   return Changed;
 }
@@ -5355,7 +5355,7 @@ bool SROA::deleteDeadInstructions(
       DeletedAllocas.insert(AI);
       for (DbgDeclareInst *OldDII : findDbgDeclares(AI))
         OldDII->eraseFromParent();
-      for (DPValue *OldDII : findDPVDeclares(AI))
+      for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
         OldDII->eraseFromParent();
     }
 
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 9fcaf2a89e563..64b850a6a5c2f 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1260,8 +1260,9 @@ static BasicBlock *buildClonedLoopBlocks(
   Module *M = ClonedPH->getParent()->getParent();
   for (auto *ClonedBB : NewBlocks)
     for (Instruction &I : *ClonedBB) {
-      RemapDPValueRange(M, I.getDbgRecordRange(), VMap,
-                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      RemapDbgVariableRecordRange(M, I.getDbgRecordRange(), VMap,
+                                  RF_NoModuleLevelChanges |
+                                      RF_IgnoreMissingLocals);
       RemapInstruction(&I, VMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
       if (auto *II = dyn_cast<AssumeInst>(&I))
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index 8686570dfd2f4..400b56894174d 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -263,7 +263,8 @@ static InstructionCost ComputeSpeculationCost(const Instruction *I,
 bool SpeculativeExecutionPass::considerHoistingFromTo(
     BasicBlock &FromBlock, BasicBlock &ToBlock) {
   SmallPtrSet<const Instruction *, 8> NotHoisted;
-  SmallDenseMap<const Instruction *, SmallVector<DPValue *>> DPValuesToHoist;
+  SmallDenseMap<const Instruction *, SmallVector<DbgVariableRecord *>>
+      DbgVariableRecordsToHoist;
   auto HasNoUnhoistedInstr = [&NotHoisted](auto Values) {
     for (const Value *V : Values) {
       if (const auto *I = dyn_cast_or_null<Instruction>(V))
@@ -291,11 +292,11 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
   InstructionCost TotalSpeculationCost = 0;
   unsigned NotHoistedInstCount = 0;
   for (const auto &I : FromBlock) {
-    // Make note of any DPValues that need hoisting. DPLabels
+    // Make note of any DbgVariableRecords that need hoisting. DPLabels
     // get left behind just like llvm.dbg.labels.
-    for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange())) {
-      if (HasNoUnhoistedInstr(DPV.location_ops()))
-        DPValuesToHoist[DPV.getInstruction()].push_back(&DPV);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      if (HasNoUnhoistedInstr(DVR.location_ops()))
+        DbgVariableRecordsToHoist[DVR.getInstruction()].push_back(&DVR);
     }
     const InstructionCost Cost = ComputeSpeculationCost(&I, *TTI);
     if (Cost.isValid() && isSafeToSpeculativelyExecute(&I) &&
@@ -314,13 +315,13 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
   }
 
   for (auto I = FromBlock.begin(); I != FromBlock.end();) {
-    // If any DPValues attached to this instruction should be hoisted, hoist
-    // them now - they will end up attached to either the next hoisted
+    // If any DbgVariableRecords attached to this instruction should be hoisted,
+    // hoist them now - they will end up attached to either the next hoisted
     // instruction or the ToBlock terminator.
-    if (DPValuesToHoist.contains(&*I)) {
-      for (auto *DPV : DPValuesToHoist[&*I]) {
-        DPV->removeFromParent();
-        ToBlock.insertDbgRecordBefore(DPV,
+    if (DbgVariableRecordsToHoist.contains(&*I)) {
+      for (auto *DVR : DbgVariableRecordsToHoist[&*I]) {
+        DVR->removeFromParent();
+        ToBlock.insertDbgRecordBefore(DVR,
                                       ToBlock.getTerminator()->getIterator());
       }
     }
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 2006b40e26d05..bf1de05a647dd 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -382,8 +382,9 @@ bool llvm::MergeBlockSuccessorsIntoGivenBlocks(
 /// - Check fully overlapping fragments and not only identical fragments.
 /// - Support dbg.declare. dbg.label, and possibly other meta instructions being
 ///   part of the sequence of consecutive instructions.
-static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
-  SmallVector<DPValue *, 8> ToBeRemoved;
+static bool
+DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
+  SmallVector<DbgVariableRecord *, 8> ToBeRemoved;
   SmallDenseSet<DebugVariable> VariableSet;
   for (auto &I : reverse(*BB)) {
     for (DbgRecord &DR : reverse(I.getDbgRecordRange())) {
@@ -394,10 +395,10 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
         continue;
       }
 
-      DPValue &DPV = cast<DPValue>(DR);
+      DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
       // Skip declare-type records, as the debug intrinsic method only works
       // on dbg.value intrinsics.
-      if (DPV.getType() == DPValue::LocationType::Declare) {
+      if (DVR.getType() == DbgVariableRecord::LocationType::Declare) {
         // The debug intrinsic method treats dbg.declares are "non-debug"
         // instructions (i.e., a break in a consecutive range of debug
         // intrinsics). Emulate that to create identical outputs. See
@@ -407,8 +408,8 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
         continue;
       }
 
-      DebugVariable Key(DPV.getVariable(), DPV.getExpression(),
-                        DPV.getDebugLoc()->getInlinedAt());
+      DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
+                        DVR.getDebugLoc()->getInlinedAt());
       auto R = VariableSet.insert(Key);
       // If the same variable fragment is described more than once it is enough
       // to keep the last one (i.e. the first found since we for reverse
@@ -416,14 +417,14 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
       if (R.second)
         continue;
 
-      if (DPV.isDbgAssign()) {
+      if (DVR.isDbgAssign()) {
         // Don't delete dbg.assign intrinsics that are linked to instructions.
-        if (!at::getAssignmentInsts(&DPV).empty())
+        if (!at::getAssignmentInsts(&DVR).empty())
           continue;
         // Unlinked dbg.assign intrinsics can be treated like dbg.values.
       }
 
-      ToBeRemoved.push_back(&DPV);
+      ToBeRemoved.push_back(&DVR);
       continue;
     }
     // Sequence with consecutive dbg.value instrs ended. Clear the map to
@@ -432,15 +433,15 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
     VariableSet.clear();
   }
 
-  for (auto &DPV : ToBeRemoved)
-    DPV->eraseFromParent();
+  for (auto &DVR : ToBeRemoved)
+    DVR->eraseFromParent();
 
   return !ToBeRemoved.empty();
 }
 
 static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
   if (BB->IsNewDbgInfoFormat)
-    return DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BB);
+    return DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BB);
 
   SmallVector<DbgValueInst *, 8> ToBeRemoved;
   SmallDenseSet<DebugVariable> VariableSet;
@@ -499,29 +500,30 @@ static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
 ///
 /// Possible improvements:
 /// - Keep track of non-overlapping fragments.
-static bool DPValuesRemoveRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
-  SmallVector<DPValue *, 8> ToBeRemoved;
+static bool
+DbgVariableRecordsRemoveRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
+  SmallVector<DbgVariableRecord *, 8> ToBeRemoved;
   DenseMap<DebugVariable, std::pair<SmallVector<Value *, 4>, DIExpression *>>
       VariableMap;
   for (auto &I : *BB) {
-    for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange())) {
-      if (DPV.getType() == DPValue::LocationType::Declare)
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      if (DVR.getType() == DbgVariableRecord::LocationType::Declare)
         continue;
-      DebugVariable Key(DPV.getVariable(), std::nullopt,
-                        DPV.getDebugLoc()->getInlinedAt());
+      DebugVariable Key(DVR.getVariable(), std::nullopt,
+                        DVR.getDebugLoc()->getInlinedAt());
       auto VMI = VariableMap.find(Key);
       // A dbg.assign with no linked instructions can be treated like a
       // dbg.value (i.e. can be deleted).
       bool IsDbgValueKind =
-          (!DPV.isDbgAssign() || at::getAssignmentInsts(&DPV).empty());
+          (!DVR.isDbgAssign() || at::getAssignmentInsts(&DVR).empty());
 
       // Update the map if we found a new value/expression describing the
       // variable, or if the variable wasn't mapped already.
-      SmallVector<Value *, 4> Values(DPV.location_ops());
+      SmallVector<Value *, 4> Values(DVR.location_ops());
       if (VMI == VariableMap.end() || VMI->second.first != Values ||
-          VMI->second.second != DPV.getExpression()) {
+          VMI->second.second != DVR.getExpression()) {
         if (IsDbgValueKind)
-          VariableMap[Key] = {Values, DPV.getExpression()};
+          VariableMap[Key] = {Values, DVR.getExpression()};
         else
           VariableMap[Key] = {Values, nullptr};
         continue;
@@ -530,55 +532,56 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
       if (!IsDbgValueKind)
         continue;
       // Found an identical mapping. Remember the instruction for later removal.
-      ToBeRemoved.push_back(&DPV);
+      ToBeRemoved.push_back(&DVR);
     }
   }
 
-  for (auto *DPV : ToBeRemoved)
-    DPV->eraseFromParent();
+  for (auto *DVR : ToBeRemoved)
+    DVR->eraseFromParent();
 
   return !ToBeRemoved.empty();
 }
 
-static bool DPValuesRemoveUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
+static bool
+DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
   assert(BB->isEntryBlock() && "expected entry block");
-  SmallVector<DPValue *, 8> ToBeRemoved;
+  SmallVector<DbgVariableRecord *, 8> ToBeRemoved;
   DenseSet<DebugVariable> SeenDefForAggregate;
   // Returns the DebugVariable for DVI with no fragment info.
-  auto GetAggregateVariable = [](const DPValue &DPV) {
-    return DebugVariable(DPV.getVariable(), std::nullopt,
-                         DPV.getDebugLoc().getInlinedAt());
+  auto GetAggregateVariable = [](const DbgVariableRecord &DVR) {
+    return DebugVariable(DVR.getVariable(), std::nullopt,
+                         DVR.getDebugLoc().getInlinedAt());
   };
 
   // Remove undef dbg.assign intrinsics that are encountered before
   // any non-undef intrinsics from the entry block.
   for (auto &I : *BB) {
-    for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange())) {
-      if (!DPV.isDbgValue() && !DPV.isDbgAssign())
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      if (!DVR.isDbgValue() && !DVR.isDbgAssign())
         continue;
       bool IsDbgValueKind =
-          (DPV.isDbgValue() || at::getAssignmentInsts(&DPV).empty());
-      DebugVariable Aggregate = GetAggregateVariable(DPV);
+          (DVR.isDbgValue() || at::getAssignmentInsts(&DVR).empty());
+      DebugVariable Aggregate = GetAggregateVariable(DVR);
       if (!SeenDefForAggregate.contains(Aggregate)) {
-        bool IsKill = DPV.isKillLocation() && IsDbgValueKind;
+        bool IsKill = DVR.isKillLocation() && IsDbgValueKind;
         if (!IsKill) {
           SeenDefForAggregate.insert(Aggregate);
-        } else if (DPV.isDbgAssign()) {
-          ToBeRemoved.push_back(&DPV);
+        } else if (DVR.isDbgAssign()) {
+          ToBeRemoved.push_back(&DVR);
         }
       }
     }
   }
 
-  for (DPValue *DPV : ToBeRemoved)
-    DPV->eraseFromParent();
+  for (DbgVariableRecord *DVR : ToBeRemoved)
+    DVR->eraseFromParent();
 
   return !ToBeRemoved.empty();
 }
 
 static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
   if (BB->IsNewDbgInfoFormat)
-    return DPValuesRemoveRedundantDbgInstrsUsingForwardScan(BB);
+    return DbgVariableRecordsRemoveRedundantDbgInstrsUsingForwardScan(BB);
 
   SmallVector<DbgValueInst *, 8> ToBeRemoved;
   DenseMap<DebugVariable, std::pair<SmallVector<Value *, 4>, DIExpression *>>
@@ -642,7 +645,7 @@ static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
 /// - Keep track of non-overlapping fragments.
 static bool removeUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
   if (BB->IsNewDbgInfoFormat)
-    return DPValuesRemoveUndefDbgAssignsFromEntryBlock(BB);
+    return DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BB);
 
   assert(BB->isEntryBlock() && "expected entry block");
   SmallVector<DbgAssignIntrinsic *, 8> ToBeRemoved;
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 6931d1997aa6b..3eac726994ae1 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -276,8 +276,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
     // attached debug-info records.
     for (Instruction &II : *BB) {
       RemapInstruction(&II, VMap, RemapFlag, TypeMapper, Materializer);
-      RemapDPValueRange(II.getModule(), II.getDbgRecordRange(), VMap, RemapFlag,
-                        TypeMapper, Materializer);
+      RemapDbgVariableRecordRange(II.getModule(), II.getDbgRecordRange(), VMap,
+                                  RemapFlag, TypeMapper, Materializer);
     }
 
   // Only update !llvm.dbg.cu for DifferentModule (not CloneModule). In the
@@ -641,8 +641,8 @@ void PruningFunctionCloner::CloneBlock(
     // Recursively clone any reachable successor blocks.
     append_range(ToClone, successors(BB->getTerminator()));
   } else {
-    // If we didn't create a new terminator, clone DPValues from the old
-    // terminator onto the new terminator.
+    // If we didn't create a new terminator, clone DbgVariableRecords from the
+    // old terminator onto the new terminator.
     Instruction *NewInst = NewBB->getTerminator();
     assert(NewInst);
 
@@ -884,14 +884,15 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
                        TypeMapper, Materializer);
   }
 
-  // Do the same for DPValues, touching all the instructions in the cloned
-  // range of blocks.
+  // Do the same for DbgVariableRecords, touching all the instructions in the
+  // cloned range of blocks.
   Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();
   for (BasicBlock &BB : make_range(Begin, NewFunc->end())) {
     for (Instruction &I : BB) {
-      RemapDPValueRange(I.getModule(), I.getDbgRecordRange(), VMap,
-                        ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
-                        TypeMapper, Materializer);
+      RemapDbgVariableRecordRange(I.getModule(), I.getDbgRecordRange(), VMap,
+                                  ModuleLevelChanges ? RF_None
+                                                     : RF_NoModuleLevelChanges,
+                                  TypeMapper, Materializer);
     }
   }
 
@@ -990,8 +991,9 @@ void llvm::remapInstructionsInBlocks(ArrayRef<BasicBlock *> Blocks,
   // Rewrite the code to refer to itself.
   for (auto *BB : Blocks) {
     for (auto &Inst : *BB) {
-      RemapDPValueRange(Inst.getModule(), Inst.getDbgRecordRange(), VMap,
-                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      RemapDbgVariableRecordRange(
+          Inst.getModule(), Inst.getDbgRecordRange(), VMap,
+          RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
       RemapInstruction(&Inst, VMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
     }
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 0fac5fc02e3f0..122b7a9747b67 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1010,6 +1010,18 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
 
     newFunction->addFnAttr(Attr);
   }
+
+  if (NumExitBlocks == 0) {
+    // Mark the new function `noreturn` if applicable. Terminators which resume
+    // exception propagation are treated as returning instructions. This is to
+    // avoid inserting traps after calls to outlined functions which unwind.
+    if (none_of(Blocks, [](const BasicBlock *BB) {
+          const Instruction *Term = BB->getTerminator();
+          return isa<ReturnInst>(Term) || isa<ResumeInst>(Term);
+        }))
+      newFunction->setDoesNotReturn();
+  }
+
   newFunction->insert(newFunction->end(), newRootNode);
 
   // Create scalar and aggregate iterators to name all of the arguments we
@@ -1392,19 +1404,23 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
   case 0:
     // There are no successors (the block containing the switch itself), which
     // means that previously this was the last part of the function, and hence
-    // this should be rewritten as a `ret'
-
-    // Check if the function should return a value
-    if (OldFnRetTy->isVoidTy()) {
-      ReturnInst::Create(Context, nullptr, TheSwitch->getIterator());  // Return void
+    // this should be rewritten as a `ret` or `unreachable`.
+    if (newFunction->doesNotReturn()) {
+      // If fn is no return, end with an unreachable terminator.
+      (void)new UnreachableInst(Context, TheSwitch->getIterator());
+    } else if (OldFnRetTy->isVoidTy()) {
+      // We have no return value.
+      ReturnInst::Create(Context, nullptr,
+                         TheSwitch->getIterator()); // Return void
     } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
       // return what we have
-      ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch->getIterator());
+      ReturnInst::Create(Context, TheSwitch->getCondition(),
+                         TheSwitch->getIterator());
     } else {
       // Otherwise we must have code extracted an unwind or something, just
       // return whatever we want.
-      ReturnInst::Create(Context,
-                         Constant::getNullValue(OldFnRetTy), TheSwitch->getIterator());
+      ReturnInst::Create(Context, Constant::getNullValue(OldFnRetTy),
+                         TheSwitch->getIterator());
     }
 
     TheSwitch->eraseFromParent();
@@ -1508,14 +1524,14 @@ void CodeExtractor::calculateNewCallTerminatorWeights(
 static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) {
   for (Instruction &I : instructions(F)) {
     SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
-    SmallVector<DPValue *, 4> DPValues;
-    findDbgUsers(DbgUsers, &I, &DPValues);
+    SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
+    findDbgUsers(DbgUsers, &I, &DbgVariableRecords);
     for (DbgVariableIntrinsic *DVI : DbgUsers)
       if (DVI->getFunction() != &F)
         DVI->eraseFromParent();
-    for (DPValue *DPV : DPValues)
-      if (DPV->getFunction() != &F)
-        DPV->eraseFromParent();
+    for (DbgVariableRecord *DVR : DbgVariableRecords)
+      if (DVR->getFunction() != &F)
+        DVR->eraseFromParent();
   }
 }
 
@@ -1569,7 +1585,7 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
   //     point to a variable in the wrong scope.
   SmallDenseMap<DINode *, DINode *> RemappedMetadata;
   SmallVector<Instruction *, 4> DebugIntrinsicsToDelete;
-  SmallVector<DPValue *, 4> DPVsToDelete;
+  SmallVector<DbgVariableRecord *, 4> DVRsToDelete;
   DenseMap<const MDNode *, MDNode *> Cache;
 
   auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar) {
@@ -1608,19 +1624,19 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
         continue;
       }
 
-      DPValue &DPV = cast<DPValue>(DR);
+      DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
       // Apply the two updates that dbg.values get: invalid operands, and
       // variable metadata fixup.
-      if (any_of(DPV.location_ops(), IsInvalidLocation)) {
-        DPVsToDelete.push_back(&DPV);
+      if (any_of(DVR.location_ops(), IsInvalidLocation)) {
+        DVRsToDelete.push_back(&DVR);
         continue;
       }
-      if (DPV.isDbgAssign() && IsInvalidLocation(DPV.getAddress())) {
-        DPVsToDelete.push_back(&DPV);
+      if (DVR.isDbgAssign() && IsInvalidLocation(DVR.getAddress())) {
+        DVRsToDelete.push_back(&DVR);
         continue;
       }
-      if (!DPV.getDebugLoc().getInlinedAt())
-        DPV.setVariable(GetUpdatedDIVariable(DPV.getVariable()));
+      if (!DVR.getDebugLoc().getInlinedAt())
+        DVR.setVariable(GetUpdatedDIVariable(DVR.getVariable()));
     }
   };
 
@@ -1658,8 +1674,8 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
 
   for (auto *DII : DebugIntrinsicsToDelete)
     DII->eraseFromParent();
-  for (auto *DPV : DPVsToDelete)
-    DPV->getMarker()->MarkedInstr->dropOneDbgRecord(DPV);
+  for (auto *DVR : DVRsToDelete)
+    DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
   DIB.finalizeSubprogram(NewSP);
 
   // Fix up the scope information attached to the line locations in the new
@@ -1895,16 +1911,6 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC,
 
   fixupDebugInfoPostExtraction(*oldFunction, *newFunction, *TheCall);
 
-  // Mark the new function `noreturn` if applicable. Terminators which resume
-  // exception propagation are treated as returning instructions. This is to
-  // avoid inserting traps after calls to outlined functions which unwind.
-  bool doesNotReturn = none_of(*newFunction, [](const BasicBlock &BB) {
-    const Instruction *Term = BB.getTerminator();
-    return isa<ReturnInst>(Term) || isa<ResumeInst>(Term);
-  });
-  if (doesNotReturn)
-    newFunction->setDoesNotReturn();
-
   LLVM_DEBUG(if (verifyFunction(*newFunction, &errs())) {
     newFunction->dump();
     report_fatal_error("verification of newFunction failed!");
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 1bbe76a92187c..833dcbec228b8 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1710,17 +1710,17 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
   };
 
   // Helper-util for updating debug-info records attached to instructions.
-  auto UpdateDPV = [&](DbgRecord *DPV) {
-    assert(DPV->getDebugLoc() && "Debug Value must have debug loc");
+  auto UpdateDVR = [&](DbgRecord *DVR) {
+    assert(DVR->getDebugLoc() && "Debug Value must have debug loc");
     if (NoInlineLineTables) {
-      DPV->setDebugLoc(TheCallDL);
+      DVR->setDebugLoc(TheCallDL);
       return;
     }
-    DebugLoc DL = DPV->getDebugLoc();
+    DebugLoc DL = DVR->getDebugLoc();
     DebugLoc IDL =
         inlineDebugLoc(DL, InlinedAtNode,
-                       DPV->getMarker()->getParent()->getContext(), IANodes);
-    DPV->setDebugLoc(IDL);
+                       DVR->getMarker()->getParent()->getContext(), IANodes);
+    DVR->setDebugLoc(IDL);
   };
 
   // Iterate over all instructions, updating metadata and debug-info records.
@@ -1728,8 +1728,8 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
          ++BI) {
       UpdateInst(*BI);
-      for (DbgRecord &DPV : BI->getDbgRecordRange()) {
-        UpdateDPV(&DPV);
+      for (DbgRecord &DVR : BI->getDbgRecordRange()) {
+        UpdateDVR(&DVR);
       }
     }
 
@@ -1797,7 +1797,7 @@ static at::StorageToVarsMap collectEscapedLocals(const DataLayout &DL,
       EscapedLocals[Base].insert(at::VarRecord(DbgAssign));
     };
     for_each(at::getAssignmentMarkers(Base), CollectAssignsForStorage);
-    for_each(at::getDPVAssignmentMarkers(Base), CollectAssignsForStorage);
+    for_each(at::getDVRAssignmentMarkers(Base), CollectAssignsForStorage);
   }
   return EscapedLocals;
 }
@@ -1829,9 +1829,9 @@ static void fixupAssignments(Function::iterator Start, Function::iterator End) {
   // attachment or use, replace it with a new version.
   for (auto BBI = Start; BBI != End; ++BBI) {
     for (Instruction &I : *BBI) {
-      for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange())) {
-        if (DPV.isDbgAssign())
-          DPV.setAssignId(GetNewID(DPV.getAssignID()));
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        if (DVR.isDbgAssign())
+          DVR.setAssignId(GetNewID(DVR.getAssignID()));
       }
       if (auto *ID = I.getMetadata(LLVMContext::MD_DIAssignID))
         I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID));
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index 5e0c312fe149e..ab1edf47d8db0 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -242,8 +242,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
     }
 
     SmallVector<DbgValueInst *, 4> DbgValues;
-    SmallVector<DPValue *, 4> DPValues;
-    llvm::findDbgValues(DbgValues, I, &DPValues);
+    SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
+    llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
 
     // Update pre-existing debug value uses that reside outside the loop.
     for (auto *DVI : DbgValues) {
@@ -261,8 +261,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
 
     // RemoveDIs: copy-paste of block above, using non-instruction debug-info
     // records.
-    for (DPValue *DPV : DPValues) {
-      BasicBlock *UserBB = DPV->getMarker()->getParent();
+    for (DbgVariableRecord *DVR : DbgVariableRecords) {
+      BasicBlock *UserBB = DVR->getMarker()->getParent();
       if (InstBB == UserBB || L->contains(UserBB))
         continue;
       // We currently only handle debug values residing in blocks that were
@@ -271,7 +271,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
       Value *V = AddedPHIs.size() == 1 ? AddedPHIs[0]
                                        : SSAUpdate.FindValueForBlock(UserBB);
       if (V)
-        DPV->replaceVariableLocationOp(I, V);
+        DVR->replaceVariableLocationOp(I, V);
     }
 
     // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 7b74caac9e084..3d3b97eca92b3 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -609,12 +609,12 @@ void llvm::RecursivelyDeleteTriviallyDeadInstructions(
 
 bool llvm::replaceDbgUsesWithUndef(Instruction *I) {
   SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgUsers(DbgUsers, I, &DPUsers);
   for (auto *DII : DbgUsers)
     DII->setKillLocation();
-  for (auto *DPV : DPUsers)
-    DPV->setKillLocation();
+  for (auto *DVR : DPUsers)
+    DVR->setKillLocation();
   return !DbgUsers.empty() || !DPUsers.empty();
 }
 
@@ -1570,16 +1570,16 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
   // is removed by LowerDbgDeclare(), we need to make sure that we are
   // not inserting the same dbg.value intrinsic over and over.
   SmallVector<DbgValueInst *, 1> DbgValues;
-  SmallVector<DPValue *, 1> DPValues;
-  findDbgValues(DbgValues, APN, &DPValues);
+  SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
+  findDbgValues(DbgValues, APN, &DbgVariableRecords);
   for (auto *DVI : DbgValues) {
     assert(is_contained(DVI->getValues(), APN));
     if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
       return true;
   }
-  for (auto *DPV : DPValues) {
-    assert(is_contained(DPV->location_ops(), APN));
-    if ((DPV->getVariable() == DIVar) && (DPV->getExpression() == DIExpr))
+  for (auto *DVR : DbgVariableRecords) {
+    assert(is_contained(DVR->location_ops(), APN));
+    if ((DVR->getVariable() == DIVar) && (DVR->getExpression() == DIExpr))
       return true;
   }
   return false;
@@ -1617,23 +1617,23 @@ static bool valueCoversEntireFragment(Type *ValTy, DbgVariableIntrinsic *DII) {
   // Could not determine size of variable. Conservatively return false.
   return false;
 }
-// RemoveDIs: duplicate implementation of the above, using DPValues, the
-// replacement for dbg.values.
-static bool valueCoversEntireFragment(Type *ValTy, DPValue *DPV) {
-  const DataLayout &DL = DPV->getModule()->getDataLayout();
+// RemoveDIs: duplicate implementation of the above, using DbgVariableRecords,
+// the replacement for dbg.values.
+static bool valueCoversEntireFragment(Type *ValTy, DbgVariableRecord *DVR) {
+  const DataLayout &DL = DVR->getModule()->getDataLayout();
   TypeSize ValueSize = DL.getTypeAllocSizeInBits(ValTy);
-  if (std::optional<uint64_t> FragmentSize = DPV->getFragmentSizeInBits())
+  if (std::optional<uint64_t> FragmentSize = DVR->getFragmentSizeInBits())
     return TypeSize::isKnownGE(ValueSize, TypeSize::getFixed(*FragmentSize));
 
   // We can't always calculate the size of the DI variable (e.g. if it is a
   // VLA). Try to use the size of the alloca that the dbg intrinsic describes
   // intead.
-  if (DPV->isAddressOfVariable()) {
-    // DPV should have exactly 1 location when it is an address.
-    assert(DPV->getNumVariableLocationOps() == 1 &&
+  if (DVR->isAddressOfVariable()) {
+    // DVR should have exactly 1 location when it is an address.
+    assert(DVR->getNumVariableLocationOps() == 1 &&
            "address of variable must have exactly 1 location operand.");
     if (auto *AI =
-            dyn_cast_or_null<AllocaInst>(DPV->getVariableLocationOp(0))) {
+            dyn_cast_or_null<AllocaInst>(DVR->getVariableLocationOp(0))) {
       if (std::optional<TypeSize> FragmentSize = AI->getAllocationSizeInBits(DL)) {
         return TypeSize::isKnownGE(ValueSize, *FragmentSize);
       }
@@ -1643,38 +1643,38 @@ static bool valueCoversEntireFragment(Type *ValTy, DPValue *DPV) {
   return false;
 }
 
-static void insertDbgValueOrDPValue(DIBuilder &Builder, Value *DV,
-                                    DILocalVariable *DIVar,
-                                    DIExpression *DIExpr,
-                                    const DebugLoc &NewLoc,
-                                    BasicBlock::iterator Instr) {
+static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV,
+                                              DILocalVariable *DIVar,
+                                              DIExpression *DIExpr,
+                                              const DebugLoc &NewLoc,
+                                              BasicBlock::iterator Instr) {
   if (!UseNewDbgInfoFormat) {
     auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
                                                   (Instruction *)nullptr);
     DbgVal.get<Instruction *>()->insertBefore(Instr);
   } else {
     // RemoveDIs: if we're using the new debug-info format, allocate a
-    // DPValue directly instead of a dbg.value intrinsic.
+    // DbgVariableRecord directly instead of a dbg.value intrinsic.
     ValueAsMetadata *DVAM = ValueAsMetadata::get(DV);
-    DPValue *DV = new DPValue(DVAM, DIVar, DIExpr, NewLoc.get());
+    DbgVariableRecord *DV =
+        new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get());
     Instr->getParent()->insertDbgRecordBefore(DV, Instr);
   }
 }
 
-static void insertDbgValueOrDPValueAfter(DIBuilder &Builder, Value *DV,
-                                         DILocalVariable *DIVar,
-                                         DIExpression *DIExpr,
-                                         const DebugLoc &NewLoc,
-                                         BasicBlock::iterator Instr) {
+static void insertDbgValueOrDbgVariableRecordAfter(
+    DIBuilder &Builder, Value *DV, DILocalVariable *DIVar, DIExpression *DIExpr,
+    const DebugLoc &NewLoc, BasicBlock::iterator Instr) {
   if (!UseNewDbgInfoFormat) {
     auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
                                                   (Instruction *)nullptr);
     DbgVal.get<Instruction *>()->insertAfter(&*Instr);
   } else {
     // RemoveDIs: if we're using the new debug-info format, allocate a
-    // DPValue directly instead of a dbg.value intrinsic.
+    // DbgVariableRecord directly instead of a dbg.value intrinsic.
     ValueAsMetadata *DVAM = ValueAsMetadata::get(DV);
-    DPValue *DV = new DPValue(DVAM, DIVar, DIExpr, NewLoc.get());
+    DbgVariableRecord *DV =
+        new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get());
     Instr->getParent()->insertDbgRecordAfter(DV, &*Instr);
   }
 }
@@ -1707,8 +1707,8 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
       DIExpr->isDeref() || (!DIExpr->startsWithDeref() &&
                             valueCoversEntireFragment(DV->getType(), DII));
   if (CanConvert) {
-    insertDbgValueOrDPValue(Builder, DV, DIVar, DIExpr, NewLoc,
-                            SI->getIterator());
+    insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc,
+                                      SI->getIterator());
     return;
   }
 
@@ -1720,8 +1720,8 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
   // know which part) we insert an dbg.value intrinsic to indicate that we
   // know nothing about the variable's content.
   DV = UndefValue::get(DV->getType());
-  insertDbgValueOrDPValue(Builder, DV, DIVar, DIExpr, NewLoc,
-                          SI->getIterator());
+  insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc,
+                                    SI->getIterator());
 }
 
 /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
@@ -1747,19 +1747,19 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
   // future if multi-location support is added to the IR, it might be
   // preferable to keep tracking both the loaded value and the original
   // address in case the alloca can not be elided.
-  insertDbgValueOrDPValueAfter(Builder, LI, DIVar, DIExpr, NewLoc,
-                               LI->getIterator());
+  insertDbgValueOrDbgVariableRecordAfter(Builder, LI, DIVar, DIExpr, NewLoc,
+                                         LI->getIterator());
 }
 
-void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, StoreInst *SI,
-                                           DIBuilder &Builder) {
-  assert(DPV->isAddressOfVariable() || DPV->isDbgAssign());
-  auto *DIVar = DPV->getVariable();
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR,
+                                           StoreInst *SI, DIBuilder &Builder) {
+  assert(DVR->isAddressOfVariable() || DVR->isDbgAssign());
+  auto *DIVar = DVR->getVariable();
   assert(DIVar && "Missing variable");
-  auto *DIExpr = DPV->getExpression();
+  auto *DIExpr = DVR->getExpression();
   Value *DV = SI->getValueOperand();
 
-  DebugLoc NewLoc = getDebugValueLoc(DPV);
+  DebugLoc NewLoc = getDebugValueLoc(DVR);
 
   // If the alloca describes the variable itself, i.e. the expression in the
   // dbg.declare doesn't start with a dereference, we can perform the
@@ -1775,16 +1775,16 @@ void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, StoreInst *SI,
   // deref expression.
   bool CanConvert =
       DIExpr->isDeref() || (!DIExpr->startsWithDeref() &&
-                            valueCoversEntireFragment(DV->getType(), DPV));
+                            valueCoversEntireFragment(DV->getType(), DVR));
   if (CanConvert) {
-    insertDbgValueOrDPValue(Builder, DV, DIVar, DIExpr, NewLoc,
-                            SI->getIterator());
+    insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc,
+                                      SI->getIterator());
     return;
   }
 
   // FIXME: If storing to a part of the variable described by the dbg.declare,
   // then we want to insert a dbg.value for the corresponding fragment.
-  LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " << *DPV
+  LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " << *DVR
                     << '\n');
   assert(UseNewDbgInfoFormat);
 
@@ -1793,8 +1793,9 @@ void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, StoreInst *SI,
   // know nothing about the variable's content.
   DV = UndefValue::get(DV->getType());
   ValueAsMetadata *DVAM = ValueAsMetadata::get(DV);
-  DPValue *NewDPV = new DPValue(DVAM, DIVar, DIExpr, NewLoc.get());
-  SI->getParent()->insertDbgRecordBefore(NewDPV, SI->getIterator());
+  DbgVariableRecord *NewDVR =
+      new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get());
+  SI->getParent()->insertDbgRecordBefore(NewDVR, SI->getIterator());
 }
 
 /// Inserts a llvm.dbg.value intrinsic after a phi that has an associated
@@ -1826,26 +1827,27 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
   // insertion point.
   // FIXME: Insert dbg.value markers in the successors when appropriate.
   if (InsertionPt != BB->end()) {
-    insertDbgValueOrDPValue(Builder, APN, DIVar, DIExpr, NewLoc, InsertionPt);
+    insertDbgValueOrDbgVariableRecord(Builder, APN, DIVar, DIExpr, NewLoc,
+                                      InsertionPt);
   }
 }
 
-void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, LoadInst *LI,
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, LoadInst *LI,
                                            DIBuilder &Builder) {
-  auto *DIVar = DPV->getVariable();
-  auto *DIExpr = DPV->getExpression();
+  auto *DIVar = DVR->getVariable();
+  auto *DIExpr = DVR->getExpression();
   assert(DIVar && "Missing variable");
 
-  if (!valueCoversEntireFragment(LI->getType(), DPV)) {
+  if (!valueCoversEntireFragment(LI->getType(), DVR)) {
     // FIXME: If only referring to a part of the variable described by the
-    // dbg.declare, then we want to insert a DPValue for the corresponding
-    // fragment.
-    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to DPValue: " << *DPV
-                      << '\n');
+    // dbg.declare, then we want to insert a DbgVariableRecord for the
+    // corresponding fragment.
+    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to DbgVariableRecord: "
+                      << *DVR << '\n');
     return;
   }
 
-  DebugLoc NewLoc = getDebugValueLoc(DPV);
+  DebugLoc NewLoc = getDebugValueLoc(DVR);
 
   // We are now tracking the loaded value instead of the address. In the
   // future if multi-location support is added to the IR, it might be
@@ -1853,9 +1855,10 @@ void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, LoadInst *LI,
   // address in case the alloca can not be elided.
   assert(UseNewDbgInfoFormat);
 
-  // Create a DPValue directly and insert.
+  // Create a DbgVariableRecord directly and insert.
   ValueAsMetadata *LIVAM = ValueAsMetadata::get(LI);
-  DPValue *DV = new DPValue(LIVAM, DIVar, DIExpr, NewLoc.get());
+  DbgVariableRecord *DV =
+      new DbgVariableRecord(LIVAM, DIVar, DIExpr, NewLoc.get());
   LI->getParent()->insertDbgRecordAfter(DV, LI);
 }
 
@@ -1869,34 +1872,35 @@ static bool isArray(AllocaInst *AI) {
 static bool isStructure(AllocaInst *AI) {
   return AI->getAllocatedType() && AI->getAllocatedType()->isStructTy();
 }
-void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, PHINode *APN,
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, PHINode *APN,
                                            DIBuilder &Builder) {
-  auto *DIVar = DPV->getVariable();
-  auto *DIExpr = DPV->getExpression();
+  auto *DIVar = DVR->getVariable();
+  auto *DIExpr = DVR->getExpression();
   assert(DIVar && "Missing variable");
 
   if (PhiHasDebugValue(DIVar, DIExpr, APN))
     return;
 
-  if (!valueCoversEntireFragment(APN->getType(), DPV)) {
+  if (!valueCoversEntireFragment(APN->getType(), DVR)) {
     // FIXME: If only referring to a part of the variable described by the
-    // dbg.declare, then we want to insert a DPValue for the corresponding
-    // fragment.
-    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to DPValue: " << *DPV
-                      << '\n');
+    // dbg.declare, then we want to insert a DbgVariableRecord for the
+    // corresponding fragment.
+    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to DbgVariableRecord: "
+                      << *DVR << '\n');
     return;
   }
 
   BasicBlock *BB = APN->getParent();
   auto InsertionPt = BB->getFirstInsertionPt();
 
-  DebugLoc NewLoc = getDebugValueLoc(DPV);
+  DebugLoc NewLoc = getDebugValueLoc(DVR);
 
   // The block may be a catchswitch block, which does not have a valid
   // insertion point.
-  // FIXME: Insert DPValue markers in the successors when appropriate.
+  // FIXME: Insert DbgVariableRecord markers in the successors when appropriate.
   if (InsertionPt != BB->end()) {
-    insertDbgValueOrDPValue(Builder, APN, DIVar, DIExpr, NewLoc, InsertionPt);
+    insertDbgValueOrDbgVariableRecord(Builder, APN, DIVar, DIExpr, NewLoc,
+                                      InsertionPt);
   }
 }
 
@@ -1906,19 +1910,19 @@ bool llvm::LowerDbgDeclare(Function &F) {
   bool Changed = false;
   DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
   SmallVector<DbgDeclareInst *, 4> Dbgs;
-  SmallVector<DPValue *> DPVs;
+  SmallVector<DbgVariableRecord *> DVRs;
   for (auto &FI : F) {
     for (Instruction &BI : FI) {
       if (auto *DDI = dyn_cast<DbgDeclareInst>(&BI))
         Dbgs.push_back(DDI);
-      for (DPValue &DPV : DPValue::filter(BI.getDbgRecordRange())) {
-        if (DPV.getType() == DPValue::LocationType::Declare)
-          DPVs.push_back(&DPV);
+      for (DbgVariableRecord &DVR : filterDbgVars(BI.getDbgRecordRange())) {
+        if (DVR.getType() == DbgVariableRecord::LocationType::Declare)
+          DVRs.push_back(&DVR);
       }
     }
   }
 
-  if (Dbgs.empty() && DPVs.empty())
+  if (Dbgs.empty() && DVRs.empty())
     return Changed;
 
   auto LowerOne = [&](auto *DDI) {
@@ -1962,8 +1966,9 @@ bool llvm::LowerDbgDeclare(Function &F) {
             DebugLoc NewLoc = getDebugValueLoc(DDI);
             auto *DerefExpr =
                 DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref);
-            insertDbgValueOrDPValue(DIB, AI, DDI->getVariable(), DerefExpr,
-                                    NewLoc, CI->getIterator());
+            insertDbgValueOrDbgVariableRecord(DIB, AI, DDI->getVariable(),
+                                              DerefExpr, NewLoc,
+                                              CI->getIterator());
           }
         } else if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
           if (BI->getType()->isPointerTy())
@@ -1976,7 +1981,7 @@ bool llvm::LowerDbgDeclare(Function &F) {
   };
 
   for_each(Dbgs, LowerOne);
-  for_each(DPVs, LowerOne);
+  for_each(DVRs, LowerOne);
 
   if (Changed)
     for (BasicBlock &BB : F)
@@ -1986,35 +1991,39 @@ bool llvm::LowerDbgDeclare(Function &F) {
 }
 
 // RemoveDIs: re-implementation of insertDebugValuesForPHIs, but which pulls the
-// debug-info out of the block's DPValues rather than dbg.value intrinsics.
-static void insertDPValuesForPHIs(BasicBlock *BB,
-                                  SmallVectorImpl<PHINode *> &InsertedPHIs) {
-  assert(BB && "No BasicBlock to clone DPValue(s) from.");
+// debug-info out of the block's DbgVariableRecords rather than dbg.value
+// intrinsics.
+static void
+insertDbgVariableRecordsForPHIs(BasicBlock *BB,
+                                SmallVectorImpl<PHINode *> &InsertedPHIs) {
+  assert(BB && "No BasicBlock to clone DbgVariableRecord(s) from.");
   if (InsertedPHIs.size() == 0)
     return;
 
-  // Map existing PHI nodes to their DPValues.
-  DenseMap<Value *, DPValue *> DbgValueMap;
+  // Map existing PHI nodes to their DbgVariableRecords.
+  DenseMap<Value *, DbgVariableRecord *> DbgValueMap;
   for (auto &I : *BB) {
-    for (DPValue &DPV : DPValue::filter(I.getDbgRecordRange())) {
-      for (Value *V : DPV.location_ops())
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      for (Value *V : DVR.location_ops())
         if (auto *Loc = dyn_cast_or_null<PHINode>(V))
-          DbgValueMap.insert({Loc, &DPV});
+          DbgValueMap.insert({Loc, &DVR});
     }
   }
   if (DbgValueMap.size() == 0)
     return;
 
-  // Map a pair of the destination BB and old DPValue to the new DPValue,
-  // so that if a DPValue is being rewritten to use more than one of the
-  // inserted PHIs in the same destination BB, we can update the same DPValue
-  // with all the new PHIs instead of creating one copy for each.
-  MapVector<std::pair<BasicBlock *, DPValue *>, DPValue *> NewDbgValueMap;
+  // Map a pair of the destination BB and old DbgVariableRecord to the new
+  // DbgVariableRecord, so that if a DbgVariableRecord is being rewritten to use
+  // more than one of the inserted PHIs in the same destination BB, we can
+  // update the same DbgVariableRecord with all the new PHIs instead of creating
+  // one copy for each.
+  MapVector<std::pair<BasicBlock *, DbgVariableRecord *>, DbgVariableRecord *>
+      NewDbgValueMap;
   // Then iterate through the new PHIs and look to see if they use one of the
-  // previously mapped PHIs. If so, create a new DPValue that will propagate
-  // the info through the new PHI. If we use more than one new PHI in a single
-  // destination BB with the same old dbg.value, merge the updates so that we
-  // get a single new DPValue with all the new PHIs.
+  // previously mapped PHIs. If so, create a new DbgVariableRecord that will
+  // propagate the info through the new PHI. If we use more than one new PHI in
+  // a single destination BB with the same old dbg.value, merge the updates so
+  // that we get a single new DbgVariableRecord with all the new PHIs.
   for (auto PHI : InsertedPHIs) {
     BasicBlock *Parent = PHI->getParent();
     // Avoid inserting a debug-info record into an EH block.
@@ -2023,13 +2032,13 @@ static void insertDPValuesForPHIs(BasicBlock *BB,
     for (auto VI : PHI->operand_values()) {
       auto V = DbgValueMap.find(VI);
       if (V != DbgValueMap.end()) {
-        DPValue *DbgII = cast<DPValue>(V->second);
+        DbgVariableRecord *DbgII = cast<DbgVariableRecord>(V->second);
         auto NewDI = NewDbgValueMap.find({Parent, DbgII});
         if (NewDI == NewDbgValueMap.end()) {
-          DPValue *NewDbgII = DbgII->clone();
+          DbgVariableRecord *NewDbgII = DbgII->clone();
           NewDI = NewDbgValueMap.insert({{Parent, DbgII}, NewDbgII}).first;
         }
-        DPValue *NewDbgII = NewDI->second;
+        DbgVariableRecord *NewDbgII = NewDI->second;
         // If PHI contains VI as an operand more than once, we may
         // replaced it in NewDbgII; confirm that it is present.
         if (is_contained(NewDbgII->location_ops(), VI))
@@ -2037,10 +2046,10 @@ static void insertDPValuesForPHIs(BasicBlock *BB,
       }
     }
   }
-  // Insert the new DPValues into their destination blocks.
+  // Insert the new DbgVariableRecords into their destination blocks.
   for (auto DI : NewDbgValueMap) {
     BasicBlock *Parent = DI.first.first;
-    DPValue *NewDbgII = DI.second;
+    DbgVariableRecord *NewDbgII = DI.second;
     auto InsertionPt = Parent->getFirstInsertionPt();
     assert(InsertionPt != Parent->end() && "Ill-formed basic block");
 
@@ -2055,7 +2064,7 @@ void llvm::insertDebugValuesForPHIs(BasicBlock *BB,
   if (InsertedPHIs.size() == 0)
     return;
 
-  insertDPValuesForPHIs(BB, InsertedPHIs);
+  insertDbgVariableRecordsForPHIs(BB, InsertedPHIs);
 
   // Map existing PHI nodes to their dbg.values.
   ValueToValueMapTy DbgValueMap;
@@ -2117,7 +2126,7 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
                              DIBuilder &Builder, uint8_t DIExprFlags,
                              int Offset) {
   TinyPtrVector<DbgDeclareInst *> DbgDeclares = findDbgDeclares(Address);
-  TinyPtrVector<DPValue *> DPVDeclares = findDPVDeclares(Address);
+  TinyPtrVector<DbgVariableRecord *> DVRDeclares = findDVRDeclares(Address);
 
   auto ReplaceOne = [&](auto *DII) {
     assert(DII->getVariable() && "Missing variable");
@@ -2128,21 +2137,22 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
   };
 
   for_each(DbgDeclares, ReplaceOne);
-  for_each(DPVDeclares, ReplaceOne);
+  for_each(DVRDeclares, ReplaceOne);
 
-  return !DbgDeclares.empty() || !DPVDeclares.empty();
+  return !DbgDeclares.empty() || !DVRDeclares.empty();
 }
 
 static void updateOneDbgValueForAlloca(const DebugLoc &Loc,
                                        DILocalVariable *DIVar,
                                        DIExpression *DIExpr, Value *NewAddress,
-                                       DbgValueInst *DVI, DPValue *DPV,
+                                       DbgValueInst *DVI,
+                                       DbgVariableRecord *DVR,
                                        DIBuilder &Builder, int Offset) {
   assert(DIVar && "Missing variable");
 
-  // This is an alloca-based dbg.value/DPValue. The first thing it should do
-  // with the alloca pointer is dereference it. Otherwise we don't know how to
-  // handle it and give up.
+  // This is an alloca-based dbg.value/DbgVariableRecord. The first thing it
+  // should do with the alloca pointer is dereference it. Otherwise we don't
+  // know how to handle it and give up.
   if (!DIExpr || DIExpr->getNumElements() < 1 ||
       DIExpr->getElement(0) != dwarf::DW_OP_deref)
     return;
@@ -2155,16 +2165,16 @@ static void updateOneDbgValueForAlloca(const DebugLoc &Loc,
     DVI->setExpression(DIExpr);
     DVI->replaceVariableLocationOp(0u, NewAddress);
   } else {
-    assert(DPV);
-    DPV->setExpression(DIExpr);
-    DPV->replaceVariableLocationOp(0u, NewAddress);
+    assert(DVR);
+    DVR->setExpression(DIExpr);
+    DVR->replaceVariableLocationOp(0u, NewAddress);
   }
 }
 
 void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
                                     DIBuilder &Builder, int Offset) {
   SmallVector<DbgValueInst *, 1> DbgUsers;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgValues(DbgUsers, AI, &DPUsers);
 
   // Attempt to replace dbg.values that use this alloca.
@@ -2173,18 +2183,18 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
                                DVI->getExpression(), NewAllocaAddress, DVI,
                                nullptr, Builder, Offset);
 
-  // Replace any DPValues that use this alloca.
-  for (DPValue *DPV : DPUsers)
-    updateOneDbgValueForAlloca(DPV->getDebugLoc(), DPV->getVariable(),
-                               DPV->getExpression(), NewAllocaAddress, nullptr,
-                               DPV, Builder, Offset);
+  // Replace any DbgVariableRecords that use this alloca.
+  for (DbgVariableRecord *DVR : DPUsers)
+    updateOneDbgValueForAlloca(DVR->getDebugLoc(), DVR->getVariable(),
+                               DVR->getExpression(), NewAllocaAddress, nullptr,
+                               DVR, Builder, Offset);
 }
 
 /// Where possible to salvage debug information for \p I do so.
 /// If not possible mark undef.
 void llvm::salvageDebugInfo(Instruction &I) {
   SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgUsers(DbgUsers, &I, &DPUsers);
   salvageDebugInfoForDbgValues(I, DbgUsers, DPUsers);
 }
@@ -2224,7 +2234,7 @@ template <typename T> static void salvageDbgAssignAddress(T *Assign) {
 
 void llvm::salvageDebugInfoForDbgValues(
     Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers,
-    ArrayRef<DPValue *> DPUsers) {
+    ArrayRef<DbgVariableRecord *> DPUsers) {
   // These are arbitrary chosen limits on the maximum number of values and the
   // maximum size of a debug expression we can salvage up to, used for
   // performance reasons.
@@ -2290,67 +2300,68 @@ void llvm::salvageDebugInfoForDbgValues(
     LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
     Salvaged = true;
   }
-  // Duplicate of above block for DPValues.
-  for (auto *DPV : DPUsers) {
-    if (DPV->isDbgAssign()) {
-      if (DPV->getAddress() == &I) {
-        salvageDbgAssignAddress(DPV);
+  // Duplicate of above block for DbgVariableRecords.
+  for (auto *DVR : DPUsers) {
+    if (DVR->isDbgAssign()) {
+      if (DVR->getAddress() == &I) {
+        salvageDbgAssignAddress(DVR);
         Salvaged = true;
       }
-      if (DPV->getValue() != &I)
+      if (DVR->getValue() != &I)
         continue;
     }
 
     // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they
     // are implicitly pointing out the value as a DWARF memory location
     // description.
-    bool StackValue = DPV->getType() != DPValue::LocationType::Declare;
-    auto DPVLocation = DPV->location_ops();
+    bool StackValue =
+        DVR->getType() != DbgVariableRecord::LocationType::Declare;
+    auto DVRLocation = DVR->location_ops();
     assert(
-        is_contained(DPVLocation, &I) &&
+        is_contained(DVRLocation, &I) &&
         "DbgVariableIntrinsic must use salvaged instruction as its location");
     SmallVector<Value *, 4> AdditionalValues;
-    // 'I' may appear more than once in DPV's location ops, and each use of 'I'
+    // 'I' may appear more than once in DVR's location ops, and each use of 'I'
     // must be updated in the DIExpression and potentially have additional
     // values added; thus we call salvageDebugInfoImpl for each 'I' instance in
-    // DPVLocation.
+    // DVRLocation.
     Value *Op0 = nullptr;
-    DIExpression *SalvagedExpr = DPV->getExpression();
-    auto LocItr = find(DPVLocation, &I);
-    while (SalvagedExpr && LocItr != DPVLocation.end()) {
+    DIExpression *SalvagedExpr = DVR->getExpression();
+    auto LocItr = find(DVRLocation, &I);
+    while (SalvagedExpr && LocItr != DVRLocation.end()) {
       SmallVector<uint64_t, 16> Ops;
-      unsigned LocNo = std::distance(DPVLocation.begin(), LocItr);
+      unsigned LocNo = std::distance(DVRLocation.begin(), LocItr);
       uint64_t CurrentLocOps = SalvagedExpr->getNumLocationOperands();
       Op0 = salvageDebugInfoImpl(I, CurrentLocOps, Ops, AdditionalValues);
       if (!Op0)
         break;
       SalvagedExpr =
           DIExpression::appendOpsToArg(SalvagedExpr, Ops, LocNo, StackValue);
-      LocItr = std::find(++LocItr, DPVLocation.end(), &I);
+      LocItr = std::find(++LocItr, DVRLocation.end(), &I);
     }
     // salvageDebugInfoImpl should fail on examining the first element of
     // DbgUsers, or none of them.
     if (!Op0)
       break;
 
-    DPV->replaceVariableLocationOp(&I, Op0);
+    DVR->replaceVariableLocationOp(&I, Op0);
     bool IsValidSalvageExpr =
         SalvagedExpr->getNumElements() <= MaxExpressionSize;
     if (AdditionalValues.empty() && IsValidSalvageExpr) {
-      DPV->setExpression(SalvagedExpr);
-    } else if (DPV->getType() != DPValue::LocationType::Declare &&
+      DVR->setExpression(SalvagedExpr);
+    } else if (DVR->getType() != DbgVariableRecord::LocationType::Declare &&
                IsValidSalvageExpr &&
-               DPV->getNumVariableLocationOps() + AdditionalValues.size() <=
+               DVR->getNumVariableLocationOps() + AdditionalValues.size() <=
                    MaxDebugArgs) {
-      DPV->addVariableLocationOps(AdditionalValues, SalvagedExpr);
+      DVR->addVariableLocationOps(AdditionalValues, SalvagedExpr);
     } else {
       // Do not salvage using DIArgList for dbg.addr/dbg.declare, as it is
       // currently only valid for stack value expressions.
       // Also do not salvage if the resulting DIArgList would contain an
       // unreasonably large number of values.
-      DPV->setKillLocation();
+      DVR->setKillLocation();
     }
-    LLVM_DEBUG(dbgs() << "SALVAGE: " << DPV << '\n');
+    LLVM_DEBUG(dbgs() << "SALVAGE: " << DVR << '\n');
     Salvaged = true;
   }
 
@@ -2360,8 +2371,8 @@ void llvm::salvageDebugInfoForDbgValues(
   for (auto *DII : DbgUsers)
     DII->setKillLocation();
 
-  for (auto *DPV : DPUsers)
-    DPV->setKillLocation();
+  for (auto *DVR : DPUsers)
+    DVR->setKillLocation();
 }
 
 Value *getSalvageOpsForGEP(GetElementPtrInst *GEP, const DataLayout &DL,
@@ -2577,10 +2588,10 @@ using DbgValReplacement = std::optional<DIExpression *>;
 static bool rewriteDebugUsers(
     Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
     function_ref<DbgValReplacement(DbgVariableIntrinsic &DII)> RewriteExpr,
-    function_ref<DbgValReplacement(DPValue &DPV)> RewriteDPVExpr) {
+    function_ref<DbgValReplacement(DbgVariableRecord &DVR)> RewriteDVRExpr) {
   // Find debug users of From.
   SmallVector<DbgVariableIntrinsic *, 1> Users;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgUsers(Users, &From, &DPUsers);
   if (Users.empty() && DPUsers.empty())
     return false;
@@ -2589,7 +2600,7 @@ static bool rewriteDebugUsers(
   bool Changed = false;
 
   SmallPtrSet<DbgVariableIntrinsic *, 1> UndefOrSalvage;
-  SmallPtrSet<DPValue *, 1> UndefOrSalvageDPV;
+  SmallPtrSet<DbgVariableRecord *, 1> UndefOrSalvageDVR;
   if (isa<Instruction>(&To)) {
     bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint;
 
@@ -2608,22 +2619,22 @@ static bool rewriteDebugUsers(
       }
     }
 
-    // DPValue implementation of the above.
-    for (auto *DPV : DPUsers) {
-      Instruction *MarkedInstr = DPV->getMarker()->MarkedInstr;
+    // DbgVariableRecord implementation of the above.
+    for (auto *DVR : DPUsers) {
+      Instruction *MarkedInstr = DVR->getMarker()->MarkedInstr;
       Instruction *NextNonDebug = MarkedInstr;
       // The next instruction might still be a dbg.declare, skip over it.
       if (isa<DbgVariableIntrinsic>(NextNonDebug))
         NextNonDebug = NextNonDebug->getNextNonDebugInstruction();
 
       if (DomPointAfterFrom && NextNonDebug == &DomPoint) {
-        LLVM_DEBUG(dbgs() << "MOVE:  " << *DPV << '\n');
-        DPV->removeFromParent();
+        LLVM_DEBUG(dbgs() << "MOVE:  " << *DVR << '\n');
+        DVR->removeFromParent();
         // Ensure there's a marker.
-        DomPoint.getParent()->insertDbgRecordAfter(DPV, &DomPoint);
+        DomPoint.getParent()->insertDbgRecordAfter(DVR, &DomPoint);
         Changed = true;
       } else if (!DT.dominates(&DomPoint, MarkedInstr)) {
-        UndefOrSalvageDPV.insert(DPV);
+        UndefOrSalvageDVR.insert(DVR);
       }
     }
   }
@@ -2633,30 +2644,30 @@ static bool rewriteDebugUsers(
     if (UndefOrSalvage.count(DII))
       continue;
 
-    DbgValReplacement DVR = RewriteExpr(*DII);
-    if (!DVR)
+    DbgValReplacement DVRepl = RewriteExpr(*DII);
+    if (!DVRepl)
       continue;
 
     DII->replaceVariableLocationOp(&From, &To);
-    DII->setExpression(*DVR);
+    DII->setExpression(*DVRepl);
     LLVM_DEBUG(dbgs() << "REWRITE:  " << *DII << '\n');
     Changed = true;
   }
-  for (auto *DPV : DPUsers) {
-    if (UndefOrSalvageDPV.count(DPV))
+  for (auto *DVR : DPUsers) {
+    if (UndefOrSalvageDVR.count(DVR))
       continue;
 
-    DbgValReplacement DVR = RewriteDPVExpr(*DPV);
-    if (!DVR)
+    DbgValReplacement DVRepl = RewriteDVRExpr(*DVR);
+    if (!DVRepl)
       continue;
 
-    DPV->replaceVariableLocationOp(&From, &To);
-    DPV->setExpression(*DVR);
-    LLVM_DEBUG(dbgs() << "REWRITE:  " << DPV << '\n');
+    DVR->replaceVariableLocationOp(&From, &To);
+    DVR->setExpression(*DVRepl);
+    LLVM_DEBUG(dbgs() << "REWRITE:  " << DVR << '\n');
     Changed = true;
   }
 
-  if (!UndefOrSalvage.empty() || !UndefOrSalvageDPV.empty()) {
+  if (!UndefOrSalvage.empty() || !UndefOrSalvageDVR.empty()) {
     // Try to salvage the remaining debug users.
     salvageDebugInfo(From);
     Changed = true;
@@ -2704,15 +2715,15 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
   auto Identity = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
     return DII.getExpression();
   };
-  auto IdentityDPV = [&](DPValue &DPV) -> DbgValReplacement {
-    return DPV.getExpression();
+  auto IdentityDVR = [&](DbgVariableRecord &DVR) -> DbgValReplacement {
+    return DVR.getExpression();
   };
 
   // Handle no-op conversions.
   Module &M = *From.getModule();
   const DataLayout &DL = M.getDataLayout();
   if (isBitCastSemanticsPreserving(DL, FromTy, ToTy))
-    return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDPV);
+    return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDVR);
 
   // Handle integer-to-integer widening and narrowing.
   // FIXME: Use DW_OP_convert when it's available everywhere.
@@ -2724,7 +2735,7 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
     // When the width of the result grows, assume that a debugger will only
     // access the low `FromBits` bits when inspecting the source variable.
     if (FromBits < ToBits)
-      return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDPV);
+      return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDVR);
 
     // The width of the result has shrunk. Use sign/zero extension to describe
     // the source variable's high bits.
@@ -2740,10 +2751,10 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
       return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits,
                                      Signed);
     };
-    // RemoveDIs: duplicate implementation working on DPValues rather than on
-    // dbg.value intrinsics.
-    auto SignOrZeroExtDPV = [&](DPValue &DPV) -> DbgValReplacement {
-      DILocalVariable *Var = DPV.getVariable();
+    // RemoveDIs: duplicate implementation working on DbgVariableRecords rather
+    // than on dbg.value intrinsics.
+    auto SignOrZeroExtDVR = [&](DbgVariableRecord &DVR) -> DbgValReplacement {
+      DILocalVariable *Var = DVR.getVariable();
 
       // Without knowing signedness, sign/zero extension isn't possible.
       auto Signedness = Var->getSignedness();
@@ -2751,11 +2762,11 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
         return std::nullopt;
 
       bool Signed = *Signedness == DIBasicType::Signedness::Signed;
-      return DIExpression::appendExt(DPV.getExpression(), ToBits, FromBits,
+      return DIExpression::appendExt(DVR.getExpression(), ToBits, FromBits,
                                      Signed);
     };
     return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt,
-                             SignOrZeroExtDPV);
+                             SignOrZeroExtDVR);
   }
 
   // TODO: Floating-point conversions, vectors.
@@ -2795,8 +2806,8 @@ llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
     if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
       Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType()));
     if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
-      // EHPads can't have DPValues attached to them, but it might be possible
-      // for things with token type.
+      // EHPads can't have DbgVariableRecords attached to them, but it might be
+      // possible for things with token type.
       Inst->dropDbgRecords();
       EndInst = Inst;
       continue;
@@ -2848,7 +2859,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA,
       Updates.push_back({DominatorTree::Delete, BB, UniqueSuccessor});
     DTU->applyUpdates(Updates);
   }
-  BB->flushTerminatorDbgValues();
+  BB->flushTerminatorDbgRecords();
   return NumInstrsRemoved;
 }
 
@@ -3544,12 +3555,12 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
 
 void llvm::dropDebugUsers(Instruction &I) {
   SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgUsers(DbgUsers, &I, &DPUsers);
   for (auto *DII : DbgUsers)
     DII->eraseFromParent();
-  for (auto *DPV : DPUsers)
-    DPV->eraseFromParent();
+  for (auto *DVR : DPUsers)
+    DVR->eraseFromParent();
 }
 
 void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
diff --git a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
index 81545ef375219..d9832eeb0697e 100644
--- a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
+++ b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
@@ -42,8 +42,11 @@ static bool isSafeDecreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
   ICmpInst::Predicate BoundPred =
       IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
 
+  auto StartLG = SE.applyLoopGuards(Start, L);
+  auto BoundLG = SE.applyLoopGuards(BoundSCEV, L);
+
   if (LatchBrExitIdx == 1)
-    return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+    return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, BoundLG);
 
   assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be either 0 or 1");
 
@@ -54,10 +57,10 @@ static bool isSafeDecreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
   const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
 
   const SCEV *MinusOne =
-      SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
+      SE.getMinusSCEV(BoundLG, SE.getOne(BoundLG->getType()));
 
-  return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
-         SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
+  return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, MinusOne) &&
+         SE.isLoopEntryGuardedByCond(L, BoundPred, BoundLG, Limit);
 }
 
 /// Given a loop with an increasing induction variable, is it possible to
@@ -86,8 +89,11 @@ static bool isSafeIncreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
   ICmpInst::Predicate BoundPred =
       IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
 
+  auto StartLG = SE.applyLoopGuards(Start, L);
+  auto BoundLG = SE.applyLoopGuards(BoundSCEV, L);
+
   if (LatchBrExitIdx == 1)
-    return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+    return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, BoundLG);
 
   assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
 
@@ -97,9 +103,9 @@ static bool isSafeIncreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
                        : APInt::getMaxValue(BitWidth);
   const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
 
-  return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
-                                      SE.getAddExpr(BoundSCEV, Step)) &&
-          SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
+  return (SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG,
+                                      SE.getAddExpr(BoundLG, Step)) &&
+          SE.isLoopEntryGuardedByCond(L, BoundPred, BoundLG, Limit));
 }
 
 /// Returns estimate for max latch taken count of the loop of the narrowest
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 8c6af7afa875c..4470c5af870ae 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -159,8 +159,8 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
     // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
     // intrinsics.
     SmallVector<DbgValueInst *, 1> DbgValues;
-    SmallVector<DPValue *, 1> DPValues;
-    llvm::findDbgValues(DbgValues, OrigHeaderVal, &DPValues);
+    SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
+    llvm::findDbgValues(DbgValues, OrigHeaderVal, &DbgVariableRecords);
     for (auto &DbgValue : DbgValues) {
       // The original users in the OrigHeader are already using the original
       // definitions.
@@ -183,11 +183,11 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
     }
 
     // RemoveDIs: duplicate implementation for non-instruction debug-info
-    // storage in DPValues.
-    for (DPValue *DPV : DPValues) {
+    // storage in DbgVariableRecords.
+    for (DbgVariableRecord *DVR : DbgVariableRecords) {
       // The original users in the OrigHeader are already using the original
       // definitions.
-      BasicBlock *UserBB = DPV->getMarker()->getParent();
+      BasicBlock *UserBB = DVR->getMarker()->getParent();
       if (UserBB == OrigHeader)
         continue;
 
@@ -202,7 +202,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
         NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
       else
         NewVal = UndefValue::get(OrigHeaderVal->getType());
-      DPV->replaceVariableLocationOp(OrigHeaderVal, NewVal);
+      DVR->replaceVariableLocationOp(OrigHeaderVal, NewVal);
     }
   }
 }
@@ -552,20 +552,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     for (Instruction &I : llvm::drop_begin(llvm::reverse(*OrigPreheader))) {
       if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
         DbgIntrinsics.insert(makeHash(DII));
-        // Until RemoveDIs supports dbg.declares in DPValue format, we'll need
-        // to collect DPValues attached to any other debug intrinsics.
-        for (const DPValue &DPV : DPValue::filter(DII->getDbgRecordRange()))
-          DbgIntrinsics.insert(makeHash(&DPV));
+        // Until RemoveDIs supports dbg.declares in DbgVariableRecord format,
+        // we'll need to collect DbgVariableRecords attached to any other debug
+        // intrinsics.
+        for (const DbgVariableRecord &DVR :
+             filterDbgVars(DII->getDbgRecordRange()))
+          DbgIntrinsics.insert(makeHash(&DVR));
       } else {
         break;
       }
     }
 
-    // Build DPValue hashes for DPValues attached to the terminator, which isn't
-    // considered in the loop above.
-    for (const DPValue &DPV :
-         DPValue::filter(OrigPreheader->getTerminator()->getDbgRecordRange()))
-      DbgIntrinsics.insert(makeHash(&DPV));
+    // Build DbgVariableRecord hashes for DbgVariableRecords attached to the
+    // terminator, which isn't considered in the loop above.
+    for (const DbgVariableRecord &DVR :
+         filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))
+      DbgIntrinsics.insert(makeHash(&DVR));
 
     // Remember the local noalias scope declarations in the header. After the
     // rotation, they must be duplicated and the scope must be cloned. This
@@ -577,28 +579,28 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
     Module *M = OrigHeader->getModule();
 
-    // Track the next DPValue to clone. If we have a sequence where an
+    // Track the next DbgRecord to clone. If we have a sequence where an
     // instruction is hoisted instead of being cloned:
-    //    DPValue blah
+    //    DbgRecord blah
     //    %foo = add i32 0, 0
-    //    DPValue xyzzy
+    //    DbgRecord xyzzy
     //    %bar = call i32 @foobar()
-    // where %foo is hoisted, then the DPValue "blah" will be seen twice, once
+    // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
     // attached to %foo, then when %foo his hoisted it will "fall down" onto the
     // function call:
-    //    DPValue blah
-    //    DPValue xyzzy
+    //    DbgRecord blah
+    //    DbgRecord xyzzy
     //    %bar = call i32 @foobar()
     // causing it to appear attached to the call too.
     //
     // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
-    // here" position to account for this behaviour. We point it at any DPValues
-    // on the next instruction, here labelled xyzzy, before we hoist %foo.
-    // Later, we only only clone DPValues from that position (xyzzy) onwards,
-    // which avoids cloning DPValue "blah" multiple times.
-    // (Stored as a range because it gives us a natural way of testing whether
-    //  there were DPValues on the next instruction before we hoisted things).
-    iterator_range<DPValue::self_iterator> NextDbgInsts =
+    // here" position to account for this behaviour. We point it at any
+    // DbgRecords on the next instruction, here labelled xyzzy, before we hoist
+    // %foo. Later, we only only clone DbgRecords from that position (xyzzy)
+    // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
+    // a range because it gives us a natural way of testing whether
+    //  there were DbgRecords on the next instruction before we hoisted things).
+    iterator_range<DbgRecord::self_iterator> NextDbgInsts =
         (I != E) ? I->getDbgRecordRange() : DPMarker::getEmptyDbgRecordRange();
 
     while (I != E) {
@@ -627,13 +629,14 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
             !NextDbgInsts.empty()) {
           auto DbgValueRange =
               LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
-          RemapDPValueRange(M, DbgValueRange, ValueMap,
-                            RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+          RemapDbgVariableRecordRange(M, DbgValueRange, ValueMap,
+                                      RF_NoModuleLevelChanges |
+                                          RF_IgnoreMissingLocals);
           // Erase anything we've seen before.
-          for (DPValue &DPV :
-               make_early_inc_range(DPValue::filter(DbgValueRange)))
-            if (DbgIntrinsics.count(makeHash(&DPV)))
-              DPV.eraseFromParent();
+          for (DbgVariableRecord &DVR :
+               make_early_inc_range(filterDbgVars(DbgValueRange)))
+            if (DbgIntrinsics.count(makeHash(&DVR)))
+              DVR.eraseFromParent();
         }
 
         NextDbgInsts = I->getDbgRecordRange();
@@ -653,13 +656,15 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       if (LoopEntryBranch->getParent()->IsNewDbgInfoFormat &&
           !NextDbgInsts.empty()) {
         auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
-        RemapDPValueRange(M, Range, ValueMap,
-                          RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+        RemapDbgVariableRecordRange(M, Range, ValueMap,
+                                    RF_NoModuleLevelChanges |
+                                        RF_IgnoreMissingLocals);
         NextDbgInsts = DPMarker::getEmptyDbgRecordRange();
         // Erase anything we've seen before.
-        for (DPValue &DPV : make_early_inc_range(DPValue::filter(Range)))
-          if (DbgIntrinsics.count(makeHash(&DPV)))
-            DPV.eraseFromParent();
+        for (DbgVariableRecord &DVR :
+             make_early_inc_range(filterDbgVars(Range)))
+          if (DbgIntrinsics.count(makeHash(&DVR)))
+            DVR.eraseFromParent();
       }
 
       // Eagerly remap the operands of the instruction.
@@ -777,7 +782,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     // OrigPreHeader's old terminator (the original branch into the loop), and
     // remove the corresponding incoming values from the PHI nodes in OrigHeader.
     LoopEntryBranch->eraseFromParent();
-    OrigPreheader->flushTerminatorDbgValues();
+    OrigPreheader->flushTerminatorDbgRecords();
 
     // Update MemorySSA before the rewrite call below changes the 1:1
     // instruction:cloned_instruction_or_value mapping.
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index ecd76b7c1fbfb..2d5b5f967ffbc 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -917,8 +917,9 @@ bool llvm::UnrollRuntimeLoopRemainder(
     for (Instruction &I : *BB) {
       RemapInstruction(&I, VMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-      RemapDPValueRange(M, I.getDbgRecordRange(), VMap,
-                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      RemapDbgVariableRecordRange(M, I.getDbgRecordRange(), VMap,
+                                  RF_NoModuleLevelChanges |
+                                      RF_IgnoreMissingLocals);
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 05b02a42c58a6..73c5d63678229 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -605,7 +605,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
   // Use a map to unique and a vector to guarantee deterministic ordering.
   llvm::SmallDenseSet<DebugVariable, 4> DeadDebugSet;
   llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
-  llvm::SmallVector<DPValue *, 4> DeadDPValues;
+  llvm::SmallVector<DbgVariableRecord *, 4> DeadDbgVariableRecords;
 
   if (ExitBlock) {
     // Given LCSSA form is satisfied, we should not have users of instructions
@@ -631,17 +631,17 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
           U.set(Poison);
         }
 
-        // RemoveDIs: do the same as below for DPValues.
+        // RemoveDIs: do the same as below for DbgVariableRecords.
         if (Block->IsNewDbgInfoFormat) {
-          for (DPValue &DPV : llvm::make_early_inc_range(
-                   DPValue::filter(I.getDbgRecordRange()))) {
-            DebugVariable Key(DPV.getVariable(), DPV.getExpression(),
-                              DPV.getDebugLoc().get());
+          for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
+                   filterDbgVars(I.getDbgRecordRange()))) {
+            DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
+                              DVR.getDebugLoc().get());
             if (!DeadDebugSet.insert(Key).second)
               continue;
-            // Unlinks the DPV from it's container, for later insertion.
-            DPV.removeFromParent();
-            DeadDPValues.push_back(&DPV);
+            // Unlinks the DVR from it's container, for later insertion.
+            DVR.removeFromParent();
+            DeadDbgVariableRecords.push_back(&DVR);
           }
         }
 
@@ -673,11 +673,11 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
       DVI->moveBefore(*ExitBlock, InsertDbgValueBefore);
 
     // Due to the "head" bit in BasicBlock::iterator, we're going to insert
-    // each DPValue right at the start of the block, wheras dbg.values would be
-    // repeatedly inserted before the first instruction. To replicate this
-    // behaviour, do it backwards.
-    for (DPValue *DPV : llvm::reverse(DeadDPValues))
-      ExitBlock->insertDbgRecordBefore(DPV, InsertDbgValueBefore);
+    // each DbgVariableRecord right at the start of the block, wheras dbg.values
+    // would be repeatedly inserted before the first instruction. To replicate
+    // this behaviour, do it backwards.
+    for (DbgVariableRecord *DVR : llvm::reverse(DeadDbgVariableRecords))
+      ExitBlock->insertDbgRecordBefore(DVR, InsertDbgValueBefore);
   }
 
   // Remove the block from the reference counting scheme, so that we can
diff --git a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
index d671a9373bf02..8f55d7bbd3182 100644
--- a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
@@ -332,7 +332,7 @@ void MemoryOpRemark::visitVariable(const Value *V,
     }
   };
   for_each(findDbgDeclares(const_cast<Value *>(V)), FindDI);
-  for_each(findDPVDeclares(const_cast<Value *>(V)), FindDI);
+  for_each(findDVRDeclares(const_cast<Value *>(V)), FindDI);
 
   if (FoundDI) {
     assert(!Result.empty());
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index ed06d3e7f452c..8dd1002a6e4ac 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -18,7 +18,9 @@
 #include "llvm/Analysis/StackSafetyAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 
 namespace llvm {
@@ -110,21 +112,21 @@ Instruction *getUntagLocationIfFunctionExit(Instruction &Inst) {
 
 void StackInfoBuilder::visit(Instruction &Inst) {
   // Visit non-intrinsic debug-info records attached to Inst.
-  for (DPValue &DPV : DPValue::filter(Inst.getDbgRecordRange())) {
+  for (DbgVariableRecord &DVR : filterDbgVars(Inst.getDbgRecordRange())) {
     auto AddIfInteresting = [&](Value *V) {
       if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) {
         if (!isInterestingAlloca(*AI))
           return;
         AllocaInfo &AInfo = Info.AllocasToInstrument[AI];
-        auto &DPVVec = AInfo.DbgVariableRecords;
-        if (DPVVec.empty() || DPVVec.back() != &DPV)
-          DPVVec.push_back(&DPV);
+        auto &DVRVec = AInfo.DbgVariableRecords;
+        if (DVRVec.empty() || DVRVec.back() != &DVR)
+          DVRVec.push_back(&DVR);
       }
     };
 
-    for_each(DPV.location_ops(), AddIfInteresting);
-    if (DPV.isDbgAssign())
-      AddIfInteresting(DPV.getAddress());
+    for_each(DVR.location_ops(), AddIfInteresting);
+    if (DVR.isDbgAssign())
+      AddIfInteresting(DVR.getAddress());
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(&Inst)) {
@@ -236,5 +238,40 @@ void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) {
   Info.AI = NewAI;
 }
 
+bool isLifetimeIntrinsic(Value *V) {
+  auto *II = dyn_cast<IntrinsicInst>(V);
+  return II && II->isLifetimeStartOrEnd();
+}
+
+Value *readRegister(IRBuilder<> &IRB, StringRef Name) {
+  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+  Function *ReadRegister = Intrinsic::getDeclaration(
+      M, Intrinsic::read_register, IRB.getIntPtrTy(M->getDataLayout()));
+  MDNode *MD =
+      MDNode::get(M->getContext(), {MDString::get(M->getContext(), Name)});
+  Value *Args[] = {MetadataAsValue::get(M->getContext(), MD)};
+  return IRB.CreateCall(ReadRegister, Args);
+}
+
+Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB) {
+  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+  if (TargetTriple.getArch() == Triple::aarch64)
+    return memtag::readRegister(IRB, "pc");
+  return IRB.CreatePtrToInt(IRB.GetInsertBlock()->getParent(),
+                            IRB.getIntPtrTy(M->getDataLayout()));
+}
+
+Value *getFP(IRBuilder<> &IRB) {
+  Function *F = IRB.GetInsertBlock()->getParent();
+  Module *M = F->getParent();
+  auto *GetStackPointerFn = Intrinsic::getDeclaration(
+      M, Intrinsic::frameaddress,
+      IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()));
+  return IRB.CreatePtrToInt(
+      IRB.CreateCall(GetStackPointerFn,
+                     {Constant::getNullValue(IRB.getInt32Ty())}),
+      IRB.getIntPtrTy(M->getDataLayout()));
+}
+
 } // namespace memtag
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index b462803bad38c..adcf161b313b2 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -104,11 +104,13 @@ namespace {
 static void createDebugValue(DIBuilder &DIB, Value *NewValue,
                              DILocalVariable *Variable,
                              DIExpression *Expression, const DILocation *DI,
-                             DPValue *InsertBefore) {
-  // FIXME: Merge these two functions now that DIBuilder supports DPValues.
-  // We neeed the API to accept DPValues as an insert point for that to work.
+                             DbgVariableRecord *InsertBefore) {
+  // FIXME: Merge these two functions now that DIBuilder supports
+  // DbgVariableRecords. We neeed the API to accept DbgVariableRecords as an
+  // insert point for that to work.
   (void)DIB;
-  DPValue::createDPValue(NewValue, Variable, Expression, DI, *InsertBefore);
+  DbgVariableRecord::createDbgVariableRecord(NewValue, Variable, Expression, DI,
+                                             *InsertBefore);
 }
 static void createDebugValue(DIBuilder &DIB, Value *NewValue,
                              DILocalVariable *Variable,
@@ -123,7 +125,7 @@ class AssignmentTrackingInfo {
   /// fragment. (i.e. not be a comprehensive set if there are multiple
   /// dbg.assigns for one variable fragment).
   SmallVector<DbgVariableIntrinsic *> DbgAssigns;
-  SmallVector<DPValue *> DPVAssigns;
+  SmallVector<DbgVariableRecord *> DVRAssigns;
 
 public:
   void init(AllocaInst *AI) {
@@ -132,21 +134,21 @@ class AssignmentTrackingInfo {
       if (Vars.insert(DebugVariable(DAI)).second)
         DbgAssigns.push_back(DAI);
     }
-    for (DPValue *DPV : at::getDPVAssignmentMarkers(AI)) {
-      if (Vars.insert(DebugVariable(DPV)).second)
-        DPVAssigns.push_back(DPV);
+    for (DbgVariableRecord *DVR : at::getDVRAssignmentMarkers(AI)) {
+      if (Vars.insert(DebugVariable(DVR)).second)
+        DVRAssigns.push_back(DVR);
     }
   }
 
   /// Update assignment tracking debug info given for the to-be-deleted store
   /// \p ToDelete that stores to this alloca.
-  void
-  updateForDeletedStore(StoreInst *ToDelete, DIBuilder &DIB,
-                        SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
-                        SmallSet<DPValue *, 8> *DPVAssignsToDelete) const {
+  void updateForDeletedStore(
+      StoreInst *ToDelete, DIBuilder &DIB,
+      SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
+      SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) const {
     // There's nothing to do if the alloca doesn't have any variables using
     // assignment tracking.
-    if (DbgAssigns.empty() && DPVAssigns.empty())
+    if (DbgAssigns.empty() && DVRAssigns.empty())
       return;
 
     // Insert a dbg.value where the linked dbg.assign is and remember to delete
@@ -165,8 +167,8 @@ class AssignmentTrackingInfo {
     };
     for (auto *Assign : at::getAssignmentMarkers(ToDelete))
       InsertValueForAssign(Assign, DbgAssignsToDelete);
-    for (auto *Assign : at::getDPVAssignmentMarkers(ToDelete))
-      InsertValueForAssign(Assign, DPVAssignsToDelete);
+    for (auto *Assign : at::getDVRAssignmentMarkers(ToDelete))
+      InsertValueForAssign(Assign, DVRAssignsToDelete);
 
     // It's possible for variables using assignment tracking to have no
     // dbg.assign linked to this store. These are variables in DbgAssigns that
@@ -182,7 +184,7 @@ class AssignmentTrackingInfo {
       ConvertDebugDeclareToDebugValue(Assign, ToDelete, DIB);
     };
     for_each(DbgAssigns, ConvertUnlinkedAssignToValue);
-    for_each(DPVAssigns, ConvertUnlinkedAssignToValue);
+    for_each(DVRAssigns, ConvertUnlinkedAssignToValue);
   }
 
   /// Update assignment tracking debug info given for the newly inserted PHI \p
@@ -193,20 +195,20 @@ class AssignmentTrackingInfo {
     // debug-phi.
     for (auto *DAI : DbgAssigns)
       ConvertDebugDeclareToDebugValue(DAI, NewPhi, DIB);
-    for (auto *DPV : DPVAssigns)
-      ConvertDebugDeclareToDebugValue(DPV, NewPhi, DIB);
+    for (auto *DVR : DVRAssigns)
+      ConvertDebugDeclareToDebugValue(DVR, NewPhi, DIB);
   }
 
   void clear() {
     DbgAssigns.clear();
-    DPVAssigns.clear();
+    DVRAssigns.clear();
   }
-  bool empty() { return DbgAssigns.empty() && DPVAssigns.empty(); }
+  bool empty() { return DbgAssigns.empty() && DVRAssigns.empty(); }
 };
 
 struct AllocaInfo {
   using DbgUserVec = SmallVector<DbgVariableIntrinsic *, 1>;
-  using DPUserVec = SmallVector<DPValue *, 1>;
+  using DPUserVec = SmallVector<DbgVariableRecord *, 1>;
 
   SmallVector<BasicBlock *, 32> DefiningBlocks;
   SmallVector<BasicBlock *, 32> UsingBlocks;
@@ -262,7 +264,7 @@ struct AllocaInfo {
       }
     }
     DbgUserVec AllDbgUsers;
-    SmallVector<DPValue *> AllDPUsers;
+    SmallVector<DbgVariableRecord *> AllDPUsers;
     findDbgUsers(AllDbgUsers, AI, &AllDPUsers);
     std::copy_if(AllDbgUsers.begin(), AllDbgUsers.end(),
                  std::back_inserter(DbgUsers), [](DbgVariableIntrinsic *DII) {
@@ -270,7 +272,7 @@ struct AllocaInfo {
                  });
     std::copy_if(AllDPUsers.begin(), AllDPUsers.end(),
                  std::back_inserter(DPUsers),
-                 [](DPValue *DPV) { return !DPV->isDbgAssign(); });
+                 [](DbgVariableRecord *DVR) { return !DVR->isDbgAssign(); });
     AssignmentTracking.init(AI);
   }
 };
@@ -378,7 +380,7 @@ struct PromoteMem2Reg {
   /// A set of dbg.assigns to delete because they've been demoted to
   /// dbg.values. Call cleanUpDbgAssigns to delete them.
   SmallSet<DbgAssignIntrinsic *, 8> DbgAssignsToDelete;
-  SmallSet<DPValue *, 8> DPVAssignsToDelete;
+  SmallSet<DbgVariableRecord *, 8> DVRAssignsToDelete;
 
   /// The set of basic blocks the renamer has already visited.
   SmallPtrSet<BasicBlock *, 16> Visited;
@@ -428,9 +430,9 @@ struct PromoteMem2Reg {
     for (auto *DAI : DbgAssignsToDelete)
       DAI->eraseFromParent();
     DbgAssignsToDelete.clear();
-    for (auto *DPV : DPVAssignsToDelete)
-      DPV->eraseFromParent();
-    DPVAssignsToDelete.clear();
+    for (auto *DVR : DVRAssignsToDelete)
+      DVR->eraseFromParent();
+    DVRAssignsToDelete.clear();
   }
 };
 
@@ -508,7 +510,7 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
                          const DataLayout &DL, DominatorTree &DT,
                          AssumptionCache *AC,
                          SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
-                         SmallSet<DPValue *, 8> *DPVAssignsToDelete) {
+                         SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) {
   StoreInst *OnlyStore = Info.OnlyStore;
   bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
   BasicBlock *StoreBB = OnlyStore->getParent();
@@ -569,7 +571,7 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
   DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);
   // Update assignment tracking info for the store we're going to delete.
   Info.AssignmentTracking.updateForDeletedStore(
-      Info.OnlyStore, DIB, DbgAssignsToDelete, DPVAssignsToDelete);
+      Info.OnlyStore, DIB, DbgAssignsToDelete, DVRAssignsToDelete);
 
   // Record debuginfo for the store and remove the declaration's
   // debuginfo.
@@ -618,7 +620,7 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
                          LargeBlockInfo &LBI, const DataLayout &DL,
                          DominatorTree &DT, AssumptionCache *AC,
                          SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
-                         SmallSet<DPValue *, 8> *DPVAssignsToDelete) {
+                         SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) {
   // The trickiest case to handle is when we have large blocks. Because of this,
   // this code is optimized assuming that large blocks happen.  This does not
   // significantly pessimize the small block case.  This uses LargeBlockInfo to
@@ -683,7 +685,7 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
     StoreInst *SI = cast<StoreInst>(AI->user_back());
     // Update assignment tracking info for the store we're going to delete.
     Info.AssignmentTracking.updateForDeletedStore(SI, DIB, DbgAssignsToDelete,
-                                                  DPVAssignsToDelete);
+                                                  DVRAssignsToDelete);
     // Record debuginfo for the store before removing it.
     auto DbgUpdateForStore = [&](auto &Container) {
       for (auto *DbgItem : Container) {
@@ -755,7 +757,7 @@ void PromoteMem2Reg::run() {
     // it that are directly dominated by the definition with the value stored.
     if (Info.DefiningBlocks.size() == 1) {
       if (rewriteSingleStoreAlloca(AI, Info, LBI, SQ.DL, DT, AC,
-                                   &DbgAssignsToDelete, &DPVAssignsToDelete)) {
+                                   &DbgAssignsToDelete, &DVRAssignsToDelete)) {
         // The alloca has been processed, move on.
         RemoveFromAllocasList(AllocaNum);
         ++NumSingleStore;
@@ -767,7 +769,7 @@ void PromoteMem2Reg::run() {
     // linear sweep over the block to eliminate it.
     if (Info.OnlyUsedInOneBlock &&
         promoteSingleBlockAlloca(AI, Info, LBI, SQ.DL, DT, AC,
-                                 &DbgAssignsToDelete, &DPVAssignsToDelete)) {
+                                 &DbgAssignsToDelete, &DVRAssignsToDelete)) {
       // The alloca has been processed, move on.
       RemoveFromAllocasList(AllocaNum);
       continue;
@@ -1174,7 +1176,7 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
       // Record debuginfo for the store before removing it.
       IncomingLocs[AllocaNo] = SI->getDebugLoc();
       AllocaATInfo[AllocaNo].updateForDeletedStore(SI, DIB, &DbgAssignsToDelete,
-                                                   &DPVAssignsToDelete);
+                                                   &DVRAssignsToDelete);
       auto ConvertDbgDeclares = [&](auto &Container) {
         for (auto *DbgItem : Container)
           if (DbgItem->isAddressOfVariable())
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index a185e8cd371c6..38fc7763c5b20 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -1486,7 +1486,13 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
   // Try to simplify to a constant range.
   ConstantRange A = getConstantRange(V1State, I.getType());
   ConstantRange B = getConstantRange(V2State, I.getType());
-  ConstantRange R = A.binaryOp(cast<BinaryOperator>(&I)->getOpcode(), B);
+
+  auto *BO = cast<BinaryOperator>(&I);
+  ConstantRange R = ConstantRange::getEmpty(I.getType()->getScalarSizeInBits());
+  if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO))
+    R = A.overflowingBinaryOp(BO->getOpcode(), B, OBO->getNoWrapKind());
+  else
+    R = A.binaryOp(BO->getOpcode(), B);
   mergeInValue(&I, ValueLatticeElement::getRange(R));
 
   // TODO: Currently we do not exploit special values that produce something
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index fc21fb552137d..38df20c949c2e 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -199,17 +199,17 @@ void SSAUpdater::RewriteUse(Use &U) {
 
 void SSAUpdater::UpdateDebugValues(Instruction *I) {
   SmallVector<DbgValueInst *, 4> DbgValues;
-  SmallVector<DPValue *, 4> DPValues;
-  llvm::findDbgValues(DbgValues, I, &DPValues);
+  SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
+  llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
   for (auto &DbgValue : DbgValues) {
     if (DbgValue->getParent() == I->getParent())
       continue;
     UpdateDebugValue(I, DbgValue);
   }
-  for (auto &DPV : DPValues) {
-    if (DPV->getParent() == I->getParent())
+  for (auto &DVR : DbgVariableRecords) {
+    if (DVR->getParent() == I->getParent())
       continue;
-    UpdateDebugValue(I, DPV);
+    UpdateDebugValue(I, DVR);
   }
 }
 
@@ -220,10 +220,10 @@ void SSAUpdater::UpdateDebugValues(Instruction *I,
   }
 }
 
-void SSAUpdater::UpdateDebugValues(Instruction *I,
-                                   SmallVectorImpl<DPValue *> &DPValues) {
-  for (auto &DPV : DPValues) {
-    UpdateDebugValue(I, DPV);
+void SSAUpdater::UpdateDebugValues(
+    Instruction *I, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
+  for (auto &DVR : DbgVariableRecords) {
+    UpdateDebugValue(I, DVR);
   }
 }
 
@@ -236,13 +236,13 @@ void SSAUpdater::UpdateDebugValue(Instruction *I, DbgValueInst *DbgValue) {
     DbgValue->setKillLocation();
 }
 
-void SSAUpdater::UpdateDebugValue(Instruction *I, DPValue *DPV) {
-  BasicBlock *UserBB = DPV->getParent();
+void SSAUpdater::UpdateDebugValue(Instruction *I, DbgVariableRecord *DVR) {
+  BasicBlock *UserBB = DVR->getParent();
   if (HasValueForBlock(UserBB)) {
     Value *NewVal = GetValueAtEndOfBlock(UserBB);
-    DPV->replaceVariableLocationOp(I, NewVal);
+    DVR->replaceVariableLocationOp(I, NewVal);
   } else
-    DPV->setKillLocation();
+    DVR->setKillLocation();
 }
 
 void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 0f3d1403481df..55bbffb18879f 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1126,8 +1126,9 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
 
     NewBonusInst->insertInto(PredBlock, PTI->getIterator());
     auto Range = NewBonusInst->cloneDebugInfoFrom(&BonusInst);
-    RemapDPValueRange(NewBonusInst->getModule(), Range, VMap,
-                      RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+    RemapDbgVariableRecordRange(NewBonusInst->getModule(), Range, VMap,
+                                RF_NoModuleLevelChanges |
+                                    RF_IgnoreMissingLocals);
 
     if (isa<DbgInfoIntrinsic>(BonusInst))
       continue;
@@ -1526,15 +1527,15 @@ static bool shouldHoistCommonInstructions(Instruction *I1, Instruction *I2,
   return true;
 }
 
-/// Hoists DPValues from \p I1 and \p OtherInstrs that are identical in
-/// lock-step to \p TI. This matches how dbg.* intrinsics are hoisting in
+/// Hoists DbgVariableRecords from \p I1 and \p OtherInstrs that are identical
+/// in lock-step to \p TI. This matches how dbg.* intrinsics are hoisting in
 /// hoistCommonCodeFromSuccessors. e.g. The input:
-///    I1                DPVs: { x, z },
-///    OtherInsts: { I2  DPVs: { x, y, z } }
-/// would result in hoisting only DPValue x.
-static void
-hoistLockstepIdenticalDPValues(Instruction *TI, Instruction *I1,
-                               SmallVectorImpl<Instruction *> &OtherInsts) {
+///    I1                DVRs: { x, z },
+///    OtherInsts: { I2  DVRs: { x, y, z } }
+/// would result in hoisting only DbgVariableRecord x.
+static void hoistLockstepIdenticalDbgVariableRecords(
+    Instruction *TI, Instruction *I1,
+    SmallVectorImpl<Instruction *> &OtherInsts) {
   if (!I1->hasDbgRecords())
     return;
   using CurrentAndEndIt =
@@ -1570,11 +1571,12 @@ hoistLockstepIdenticalDPValues(Instruction *TI, Instruction *I1,
   // This replicates the dbg.* intrinsic behaviour in
   // hoistCommonCodeFromSuccessors.
   while (none_of(Itrs, atEnd)) {
-    bool HoistDPVs = allIdentical(Itrs);
+    bool HoistDVRs = allIdentical(Itrs);
     for (CurrentAndEndIt &Pair : Itrs) {
-      // Increment Current iterator now as we may be about to move the DPValue.
+      // Increment Current iterator now as we may be about to move the
+      // DbgRecord.
       DbgRecord &DR = *Pair.first++;
-      if (HoistDPVs) {
+      if (HoistDVRs) {
         DR.removeFromParent();
         TI->getParent()->insertDbgRecordBefore(&DR, TI->getIterator());
       }
@@ -1690,7 +1692,7 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
       // at the beginning of the loop, we can hoist the terminator instruction.
       // If any instructions remain in the block, we cannot hoist terminators.
       if (NumSkipped || !AllInstsAreIdentical) {
-        hoistLockstepIdenticalDPValues(TI, I1, OtherInsts);
+        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
         return Changed;
       }
 
@@ -1720,9 +1722,9 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
         // The debug location is an integral part of a debug info intrinsic
         // and can't be separated from it or replaced.  Instead of attempting
         // to merge locations, simply hoist both copies of the intrinsic.
-        hoistLockstepIdenticalDPValues(TI, I1, OtherInsts);
-        // We've just hoisted DPValues; move I1 after them (before TI) and
-        // leave any that were not hoisted behind (by calling moveBefore
+        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
+        // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
+        // and leave any that were not hoisted behind (by calling moveBefore
         // rather than moveBeforePreserving).
         I1->moveBefore(TI);
         for (auto &SuccIter : OtherSuccIterRange) {
@@ -1734,9 +1736,9 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
         // For a normal instruction, we just move one to right before the
         // branch, then replace all uses of the other with the first.  Finally,
         // we remove the now redundant second instruction.
-        hoistLockstepIdenticalDPValues(TI, I1, OtherInsts);
-        // We've just hoisted DPValues; move I1 after them (before TI) and
-        // leave any that were not hoisted behind (by calling moveBefore
+        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
+        // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
+        // and leave any that were not hoisted behind (by calling moveBefore
         // rather than moveBeforePreserving).
         I1->moveBefore(TI);
         for (auto &SuccIter : OtherSuccIterRange) {
@@ -1758,7 +1760,7 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
       NumHoistCommonInstrs += SuccIterPairs.size();
     } else {
       if (NumSkipped >= HoistCommonSkipLimit) {
-        hoistLockstepIdenticalDPValues(TI, I1, OtherInsts);
+        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
         return Changed;
       }
       // We are about to skip over a pair of non-identical instructions. Record
@@ -1821,9 +1823,9 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf(
     }
   }
 
-  // Hoist DPValues attached to the terminator to match dbg.* intrinsic hoisting
-  // behaviour in hoistCommonCodeFromSuccessors.
-  hoistLockstepIdenticalDPValues(TI, I1, OtherSuccTIs);
+  // Hoist DbgVariableRecords attached to the terminator to match dbg.*
+  // intrinsic hoisting behaviour in hoistCommonCodeFromSuccessors.
+  hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherSuccTIs);
   // Clone the terminator and hoist it into the pred, without any debug info.
   Instruction *NT = I1->clone();
   NT->insertInto(TIParent, TI->getIterator());
@@ -3178,7 +3180,7 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI,
         DbgAssign->replaceVariableLocationOp(OrigV, S);
     };
     for_each(at::getAssignmentMarkers(SpeculatedStore), replaceVariable);
-    for_each(at::getDPVAssignmentMarkers(SpeculatedStore), replaceVariable);
+    for_each(at::getDVRAssignmentMarkers(SpeculatedStore), replaceVariable);
   }
 
   // Metadata can be dependent on the condition we are hoisting above.
@@ -3203,13 +3205,15 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI,
   }
 
   // Hoist the instructions.
-  // In "RemoveDIs" non-instr debug-info mode, drop DPValues attached to these
-  // instructions, in the same way that dbg.value intrinsics are dropped at the
-  // end of this block.
+  // In "RemoveDIs" non-instr debug-info mode, drop DbgVariableRecords attached
+  // to these instructions, in the same way that dbg.value intrinsics are
+  // dropped at the end of this block.
   for (auto &It : make_range(ThenBB->begin(), ThenBB->end()))
     for (DbgRecord &DR : make_early_inc_range(It.getDbgRecordRange()))
-      // Drop all records except assign-kind DPValues (dbg.assign equivalent).
-      if (DPValue *DPV = dyn_cast<DPValue>(&DR); !DPV || !DPV->isDbgAssign())
+      // Drop all records except assign-kind DbgVariableRecords (dbg.assign
+      // equivalent).
+      if (DbgVariableRecord *DVR = dyn_cast<DbgVariableRecord>(&DR);
+          !DVR || !DVR->isDbgAssign())
         It.dropOneDbgRecord(&DR);
   BB->splice(BI->getIterator(), ThenBB, ThenBB->begin(),
              std::prev(ThenBB->end()));
@@ -3384,7 +3388,7 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
     TranslateMap[Cond] = CB;
 
     // RemoveDIs: track instructions that we optimise away while folding, so
-    // that we can copy DPValues from them later.
+    // that we can copy DbgVariableRecords from them later.
     BasicBlock::iterator SrcDbgCursor = BB->begin();
     for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
       if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
@@ -3848,10 +3852,10 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
 
   if (PredBlock->IsNewDbgInfoFormat) {
     PredBlock->getTerminator()->cloneDebugInfoFrom(BB->getTerminator());
-    for (DPValue &DPV :
-         DPValue::filter(PredBlock->getTerminator()->getDbgRecordRange())) {
-      RemapDPValue(M, &DPV, VMap,
-                   RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+    for (DbgVariableRecord &DVR :
+         filterDbgVars(PredBlock->getTerminator()->getDbgRecordRange())) {
+      RemapDbgVariableRecord(M, &DVR, VMap,
+                             RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
     }
   }
 
@@ -5304,7 +5308,7 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
   // Ensure that any debug-info records that used to occur after the Unreachable
   // are moved to in front of it -- otherwise they'll "dangle" at the end of
   // the block.
-  BB->flushTerminatorDbgValues();
+  BB->flushTerminatorDbgRecords();
 
   // Debug-info records on the unreachable inst itself should be deleted, as
   // below we delete everything past the final executable instruction.
@@ -5326,8 +5330,8 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
     // block will be the unwind edges of Invoke/CatchSwitch/CleanupReturn,
     // and we can therefore guarantee this block will be erased.
 
-    // If we're deleting this, we're deleting any subsequent dbg.values, so
-    // delete DPValue records of variable information.
+    // If we're deleting this, we're deleting any subsequent debug info, so
+    // delete DbgRecords.
     BBI->dropDbgRecords();
 
     // Delete this instruction (any uses are guaranteed to be dead)
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index b8fa985fa3462..f6d72265305d7 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1381,6 +1381,77 @@ const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
   };
 }
 
+namespace {
+
+// Represents a interesting integer binary operation for
+// getExtendedOperandRecurrence. This may be a shl that is being treated as a
+// multiply or a 'or disjoint' that is being treated as 'add nsw nuw'.
+struct BinaryOp {
+  unsigned Opcode;
+  std::array<Value *, 2> Operands;
+  bool IsNSW = false;
+  bool IsNUW = false;
+
+  explicit BinaryOp(Instruction *Op)
+      : Opcode(Op->getOpcode()),
+        Operands({Op->getOperand(0), Op->getOperand(1)}) {
+    if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(Op)) {
+      IsNSW = OBO->hasNoSignedWrap();
+      IsNUW = OBO->hasNoUnsignedWrap();
+    }
+  }
+
+  explicit BinaryOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS,
+                    bool IsNSW = false, bool IsNUW = false)
+      : Opcode(Opcode), Operands({LHS, RHS}), IsNSW(IsNSW), IsNUW(IsNUW) {}
+};
+
+} // end anonymous namespace
+
+static std::optional<BinaryOp> matchBinaryOp(Instruction *Op) {
+  switch (Op->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    return BinaryOp(Op);
+  case Instruction::Or: {
+    // Convert or disjoint into add nuw nsw.
+    if (cast<PossiblyDisjointInst>(Op)->isDisjoint())
+      return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1),
+                      /*IsNSW=*/true, /*IsNUW=*/true);
+    break;
+  }
+  case Instruction::Shl: {
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(Op->getOperand(1))) {
+      unsigned BitWidth = cast<IntegerType>(SA->getType())->getBitWidth();
+
+      // If the shift count is not less than the bitwidth, the result of
+      // the shift is undefined. Don't try to analyze it, because the
+      // resolution chosen here may differ from the resolution chosen in
+      // other parts of the compiler.
+      if (SA->getValue().ult(BitWidth)) {
+        // We can safely preserve the nuw flag in all cases. It's also safe to
+        // turn a nuw nsw shl into a nuw nsw mul. However, nsw in isolation
+        // requires special handling. It can be preserved as long as we're not
+        // left shifting by bitwidth - 1.
+        bool IsNUW = Op->hasNoUnsignedWrap();
+        bool IsNSW = Op->hasNoSignedWrap() &&
+                     (IsNUW || SA->getValue().ult(BitWidth - 1));
+
+        ConstantInt *X =
+            ConstantInt::get(Op->getContext(),
+                             APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
+        return BinaryOp(Instruction::Mul, Op->getOperand(0), X, IsNSW, IsNUW);
+      }
+    }
+
+    break;
+  }
+  }
+
+  return std::nullopt;
+}
+
 /// No-wrap operations can transfer sign extension of their result to their
 /// operands. Generate the SCEV value for the widened operation without
 /// actually modifying the IR yet. If the expression after extending the
@@ -1388,24 +1459,22 @@ const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
 /// extension used.
 WidenIV::WidenedRecTy
 WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
-  // Handle the common case of add<nsw/nuw>
-  const unsigned OpCode = DU.NarrowUse->getOpcode();
-  // Only Add/Sub/Mul instructions supported yet.
-  if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
-      OpCode != Instruction::Mul)
+  auto Op = matchBinaryOp(DU.NarrowUse);
+  if (!Op)
     return {nullptr, ExtendKind::Unknown};
 
+  assert((Op->Opcode == Instruction::Add || Op->Opcode == Instruction::Sub ||
+          Op->Opcode == Instruction::Mul) &&
+         "Unexpected opcode");
+
   // One operand (NarrowDef) has already been extended to WideDef. Now determine
   // if extending the other will lead to a recurrence.
-  const unsigned ExtendOperIdx =
-      DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
-  assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
+  const unsigned ExtendOperIdx = Op->Operands[0] == DU.NarrowDef ? 1 : 0;
+  assert(Op->Operands[1 - ExtendOperIdx] == DU.NarrowDef && "bad DU");
 
-  const OverflowingBinaryOperator *OBO =
-    cast<OverflowingBinaryOperator>(DU.NarrowUse);
   ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
-  if (!(ExtKind == ExtendKind::Sign && OBO->hasNoSignedWrap()) &&
-      !(ExtKind == ExtendKind::Zero && OBO->hasNoUnsignedWrap())) {
+  if (!(ExtKind == ExtendKind::Sign && Op->IsNSW) &&
+      !(ExtKind == ExtendKind::Zero && Op->IsNUW)) {
     ExtKind = ExtendKind::Unknown;
 
     // For a non-negative NarrowDef, we can choose either type of
@@ -1413,16 +1482,15 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
     // (see above), and we only hit this code if we need to check
     // the opposite case.
     if (DU.NeverNegative) {
-      if (OBO->hasNoSignedWrap()) {
+      if (Op->IsNSW) {
         ExtKind = ExtendKind::Sign;
-      } else if (OBO->hasNoUnsignedWrap()) {
+      } else if (Op->IsNUW) {
         ExtKind = ExtendKind::Zero;
       }
     }
   }
 
-  const SCEV *ExtendOperExpr =
-      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx));
+  const SCEV *ExtendOperExpr = SE->getSCEV(Op->Operands[ExtendOperIdx]);
   if (ExtKind == ExtendKind::Sign)
     ExtendOperExpr = SE->getSignExtendExpr(ExtendOperExpr, WideType);
   else if (ExtKind == ExtendKind::Zero)
@@ -1443,7 +1511,7 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
   if (ExtendOperIdx == 0)
     std::swap(lhs, rhs);
   const SCEVAddRecExpr *AddRec =
-      dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));
+      dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, Op->Opcode));
 
   if (!AddRec || AddRec->getLoop() != L)
     return {nullptr, ExtendKind::Unknown};
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 3da161043d6c8..8c24599c8ce37 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -146,7 +146,7 @@ class Mapper {
   Value *mapValue(const Value *V);
   void remapInstruction(Instruction *I);
   void remapFunction(Function &F);
-  void remapDPValue(DbgRecord &DPV);
+  void remapDbgRecord(DbgRecord &DVR);
 
   Constant *mapConstant(const Constant *C) {
     return cast_or_null<Constant>(mapValue(C));
@@ -537,13 +537,13 @@ Value *Mapper::mapValue(const Value *V) {
   return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
 }
 
-void Mapper::remapDPValue(DbgRecord &DR) {
+void Mapper::remapDbgRecord(DbgRecord &DR) {
   if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
     DPL->setLabel(cast<DILabel>(mapMetadata(DPL->getLabel())));
     return;
   }
 
-  DPValue &V = cast<DPValue>(DR);
+  DbgVariableRecord &V = cast<DbgVariableRecord>(DR);
   // Remap variables and DILocations.
   auto *MappedVar = mapMetadata(V.getVariable());
   auto *MappedDILoc = mapMetadata(V.getDebugLoc());
@@ -1067,7 +1067,7 @@ void Mapper::remapFunction(Function &F) {
     for (Instruction &I : BB) {
       remapInstruction(&I);
       for (DbgRecord &DR : I.getDbgRecordRange())
-        remapDPValue(DR);
+        remapDbgRecord(DR);
     }
   }
 }
@@ -1233,14 +1233,14 @@ void ValueMapper::remapInstruction(Instruction &I) {
   FlushingMapper(pImpl)->remapInstruction(&I);
 }
 
-void ValueMapper::remapDPValue(Module *M, DPValue &V) {
-  FlushingMapper(pImpl)->remapDPValue(V);
+void ValueMapper::remapDbgVariableRecord(Module *M, DbgVariableRecord &V) {
+  FlushingMapper(pImpl)->remapDbgRecord(V);
 }
 
-void ValueMapper::remapDPValueRange(
+void ValueMapper::remapDbgVariableRecordRange(
     Module *M, iterator_range<DbgRecord::self_iterator> Range) {
-  for (DPValue &DPV : DPValue::filter(Range)) {
-    remapDPValue(M, DPV);
+  for (DbgVariableRecord &DVR : filterDbgVars(Range)) {
+    remapDbgVariableRecord(M, DVR);
   }
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index e86705e898890..5d03b66b0ce33 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -68,6 +68,9 @@ class VPBuilder {
 public:
   VPBuilder() = default;
   VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); }
+  VPBuilder(VPRecipeBase *InsertPt) {
+    setInsertPoint(InsertPt->getParent(), InsertPt->getIterator());
+  }
 
   /// Clear the insertion point: created instructions will not be inserted into
   /// a block.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index edaad4d033bdf..2163930b02c1c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6911,25 +6911,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
 
     SmallVector<const Value *, 4> Operands(I->operand_values());
-    auto InstrCost = TTI.getArithmeticInstrCost(
+    return TTI.getArithmeticInstrCost(
         I->getOpcode(), VectorTy, CostKind,
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-        Op2Info, Operands, I);
-
-    // Some targets can replace frem with vector library calls.
-    InstructionCost VecCallCost = InstructionCost::getInvalid();
-    if (I->getOpcode() == Instruction::FRem) {
-      LibFunc Func;
-      if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
-          TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
-        SmallVector<Type *, 4> OpTypes;
-        for (auto &Op : I->operands())
-          OpTypes.push_back(Op->getType());
-        VecCallCost =
-            TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
-      }
-    }
-    return std::min(InstrCost, VecCallCost);
+        Op2Info, Operands, I, TLI);
   }
   case Instruction::FNeg: {
     return TTI.getArithmeticInstrCost(
@@ -7913,8 +7898,7 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
   }
 }
 
-VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
-                                         VPlan &Plan) {
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
 
   // Look for cached value.
@@ -7969,7 +7953,7 @@ VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
   return ECEntryIt->second;
 }
 
-void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
+void VPRecipeBuilder::createHeaderMask() {
   BasicBlock *Header = OrigLoop->getHeader();
 
   // When not folding the tail, use nullptr to model all-true mask.
@@ -8004,7 +7988,7 @@ VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
   return BCEntryIt->second;
 }
 
-void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
+void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
   assert(OrigLoop->getHeader() != BB &&
@@ -8015,7 +7999,7 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
   VPValue *BlockMask = nullptr;
   // This is the block mask. We OR all incoming edges.
   for (auto *Predecessor : predecessors(BB)) {
-    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
+    VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
       BlockMaskCache[BB] = EdgeMask;
       return;
@@ -8034,7 +8018,7 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
 
 VPWidenMemoryInstructionRecipe *
 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
-                                  VFRange &Range, VPlanPtr &Plan) {
+                                  VFRange &Range) {
   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
          "Must be called with either a load or store");
 
@@ -8107,7 +8091,7 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
 }
 
 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
-    PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
+    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
 
   // Check if this is an integer or fp induction. If so, build the recipe that
   // produces its scalar and vector values.
@@ -8131,7 +8115,7 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
 }
 
 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
-    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
+    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
   // Optimize the special case where the source is a constant integer
   // induction variable. Notice that we can only optimize the 'trunc' case
   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -8159,8 +8143,7 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
 }
 
 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
-                                           ArrayRef<VPValue *> Operands,
-                                           VPlanPtr &Plan) {
+                                           ArrayRef<VPValue *> Operands) {
   unsigned NumIncoming = Phi->getNumIncomingValues();
 
   // We know that all PHIs in non-header blocks are converted into selects, so
@@ -8173,7 +8156,7 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
   for (unsigned In = 0; In < NumIncoming; In++) {
     OperandsWithMask.push_back(Operands[In]);
     VPValue *EdgeMask =
-        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
+        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
     if (!EdgeMask) {
       assert(In == 0 && "Both null and non-null edge masks found");
       assert(all_equal(Operands) &&
@@ -8187,8 +8170,7 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
 
 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                    ArrayRef<VPValue *> Operands,
-                                                   VFRange &Range,
-                                                   VPlanPtr &Plan) {
+                                                   VFRange &Range) {
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [this, CI](ElementCount VF) {
         return CM.isScalarWithPredication(CI, VF);
@@ -8263,7 +8245,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
       if (Legal->isMaskRequired(CI))
         Mask = getBlockInMask(CI->getParent());
       else
-        Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
+        Mask = Plan.getVPValueOrAddLiveIn(ConstantInt::getTrue(
             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
 
       Ops.insert(Ops.begin() + *MaskPos, Mask);
@@ -8293,7 +8275,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
 
 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                            ArrayRef<VPValue *> Operands,
-                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
+                                           VPBasicBlock *VPBB) {
   switch (I->getOpcode()) {
   default:
     return nullptr;
@@ -8306,8 +8288,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
     if (CM.isPredicatedInst(I)) {
       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
       VPValue *Mask = getBlockInMask(I->getParent());
-      VPValue *One = Plan->getVPValueOrAddLiveIn(
-          ConstantInt::get(I->getType(), 1u, false));
+      VPValue *One =
+          Plan.getVPValueOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
       auto *SafeRHS =
          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
                            I->getDebugLoc());
@@ -8351,8 +8333,7 @@ void VPRecipeBuilder::fixHeaderPhis() {
 }
 
 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
-                                                      VFRange &Range,
-                                                      VPlan &Plan) {
+                                                      VFRange &Range) {
   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
       Range);
@@ -8407,21 +8388,22 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
   return Recipe;
 }
 
-VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
-    Instruction *Instr, ArrayRef<VPValue *> Operands, VFRange &Range,
-    VPBasicBlock *VPBB, VPlanPtr &Plan) {
+VPRecipeBase *
+VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
+                                        ArrayRef<VPValue *> Operands,
+                                        VFRange &Range, VPBasicBlock *VPBB) {
   // First, check for specific widening recipes that deal with inductions, Phi
   // nodes, calls and memory operations.
   VPRecipeBase *Recipe;
   if (auto Phi = dyn_cast<PHINode>(Instr)) {
     if (Phi->getParent() != OrigLoop->getHeader())
-      return tryToBlend(Phi, Operands, Plan);
+      return tryToBlend(Phi, Operands);
 
     // Always record recipes for header phis. Later first-order recurrence phis
     // can have earlier phis as incoming values.
     recordRecipeOf(Phi);
 
-    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
+    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
       return Recipe;
 
     VPHeaderPHIRecipe *PhiRecipe = nullptr;
@@ -8457,9 +8439,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
     return PhiRecipe;
   }
 
-  if (isa<TruncInst>(Instr) &&
-      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
-                                               Range, *Plan)))
+  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
+                                    cast<TruncInst>(Instr), Operands, Range)))
     return Recipe;
 
   // All widen recipes below deal only with VF > 1.
@@ -8468,10 +8449,10 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
     return nullptr;
 
   if (auto *CI = dyn_cast<CallInst>(Instr))
-    return tryToWidenCall(CI, Operands, Range, Plan);
+    return tryToWidenCall(CI, Operands, Range);
 
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
-    return tryToWidenMemory(Instr, Operands, Range, Plan);
+    return tryToWidenMemory(Instr, Operands, Range);
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8490,7 +8471,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
                                  *CI);
   }
 
-  return tryToWiden(Instr, Operands, VPBB, Plan);
+  return tryToWiden(Instr, Operands, VPBB);
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8562,37 +8543,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
-  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
-
-  // ---------------------------------------------------------------------------
-  // Pre-construction: record ingredients whose recipes we'll need to further
-  // process after constructing the initial VPlan.
-  // ---------------------------------------------------------------------------
-
-  // For each interleave group which is relevant for this (possibly trimmed)
-  // Range, add it to the set of groups to be later applied to the VPlan and add
-  // placeholders for its members' Recipes which we'll be replacing with a
-  // single VPInterleaveRecipe.
-  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
-    auto applyIG = [IG, this](ElementCount VF) -> bool {
-      bool Result = (VF.isVector() && // Query is illegal for VF == 1
-                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
-                         LoopVectorizationCostModel::CM_Interleave);
-      // For scalable vectors, the only interleave factor currently supported
-      // is 2 since we require the (de)interleave2 intrinsics instead of
-      // shufflevectors.
-      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
-             "Unsupported interleave factor for scalable vectors");
-      return Result;
-    };
-    if (!getDecisionAndClampRange(applyIG, Range))
-      continue;
-    InterleaveGroups.insert(IG);
-    for (unsigned i = 0; i < IG->getFactor(); i++)
-      if (Instruction *Member = IG->getMember(i))
-        RecipeBuilder.recordRecipeOf(Member);
-  };
-
   // ---------------------------------------------------------------------------
   // Build initial VPlan: Scan the body of the loop in a topological order to
   // visit each basic block after having visited its predecessor basic blocks.
@@ -8627,6 +8577,41 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   bool HasNUW = Style == TailFoldingStyle::None;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
 
+  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+
+  // ---------------------------------------------------------------------------
+  // Pre-construction: record ingredients whose recipes we'll need to further
+  // process after constructing the initial VPlan.
+  // ---------------------------------------------------------------------------
+
+  // For each interleave group which is relevant for this (possibly trimmed)
+  // Range, add it to the set of groups to be later applied to the VPlan and add
+  // placeholders for its members' Recipes which we'll be replacing with a
+  // single VPInterleaveRecipe.
+  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
+    auto applyIG = [IG, this](ElementCount VF) -> bool {
+      bool Result = (VF.isVector() && // Query is illegal for VF == 1
+                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
+                         LoopVectorizationCostModel::CM_Interleave);
+      // For scalable vectors, the only interleave factor currently supported
+      // is 2 since we require the (de)interleave2 intrinsics instead of
+      // shufflevectors.
+      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+             "Unsupported interleave factor for scalable vectors");
+      return Result;
+    };
+    if (!getDecisionAndClampRange(applyIG, Range))
+      continue;
+    InterleaveGroups.insert(IG);
+    for (unsigned i = 0; i < IG->getFactor(); i++)
+      if (Instruction *Member = IG->getMember(i))
+        RecipeBuilder.recordRecipeOf(Member);
+  };
+
+  // ---------------------------------------------------------------------------
+  // Construct recipes for the instructions in the loop
+  // ---------------------------------------------------------------------------
+
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   LoopBlocksDFS DFS(OrigLoop);
@@ -8648,9 +8633,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     Builder.setInsertPoint(VPBB);
 
     if (VPBB == HeaderVPBB)
-      RecipeBuilder.createHeaderMask(*Plan);
+      RecipeBuilder.createHeaderMask();
     else if (NeedsMasks)
-      RecipeBuilder.createBlockInMask(BB, *Plan);
+      RecipeBuilder.createBlockInMask(BB);
 
     // Introduce each ingredient into VPlan.
     // TODO: Model and preserve debug intrinsics in VPlan.
@@ -8673,10 +8658,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
         continue;
 
-      VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(
-          Instr, Operands, Range, VPBB, Plan);
+      VPRecipeBase *Recipe =
+          RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
       if (!Recipe)
-        Recipe = RecipeBuilder.handleReplication(Instr, Range, *Plan);
+        Recipe = RecipeBuilder.handleReplication(Instr, Range);
       for (auto *Def : Recipe->definedValues()) {
         auto *UV = Def->getUnderlyingValue();
         Plan->addVPValue(UV, Def);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5e0f5b7efadc4..ac16fcf767459 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1001,8 +1001,9 @@ class BoUpSLP {
           TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
           const DataLayout *DL, OptimizationRemarkEmitter *ORE)
-      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
-        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
+      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
+        AC(AC), DB(DB), DL(DL), ORE(ORE),
+        Builder(Se->getContext(), TargetFolder(*DL)) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
     // Use the vector register size specified by the target unless overridden
     // by a command-line option.
@@ -1085,6 +1086,9 @@ class BoUpSLP {
       BS->clear();
     }
     MinBWs.clear();
+    ReductionBitWidth = 0;
+    CastMaxMinBWSizes.reset();
+    TruncNodes.clear();
     InstrElementSize.clear();
     UserIgnoreList = nullptr;
     PostponedGathers.clear();
@@ -2287,6 +2291,7 @@ class BoUpSLP {
   void clearReductionData() {
     AnalyzedReductionsRoots.clear();
     AnalyzedReductionVals.clear();
+    AnalyzedMinBWVals.clear();
   }
   /// Checks if the given value is gathered in one of the nodes.
   bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
@@ -2307,9 +2312,11 @@ class BoUpSLP {
   /// constant and to be demoted. Required to correctly identify constant nodes
   /// to be demoted.
   bool collectValuesToDemote(
-      Value *V, SmallVectorImpl<Value *> &ToDemote,
+      Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+      SmallVectorImpl<Value *> &ToDemote,
       DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
-      SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
+      DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
+      bool &IsProfitableToDemote, bool IsTruncRoot) const;
 
   /// Check if the operands on the edges \p Edges of the \p UserTE allows
   /// reordering (i.e. the operands can be reordered because they have only one
@@ -2375,6 +2382,10 @@ class BoUpSLP {
   /// \ returns the graph entry for the \p Idx operand of the \p E entry.
   const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
 
+  /// \returns Cast context for the given graph node.
+  TargetTransformInfo::CastContextHint
+  getCastContextHint(const TreeEntry &TE) const;
+
   /// \returns the cost of the vectorizable entry.
   InstructionCost getEntryCost(const TreeEntry *E,
                                ArrayRef<Value *> VectorizedVals,
@@ -2925,11 +2936,18 @@ class BoUpSLP {
       }
       assert(!BundleMember && "Bundle and VL out of sync");
     } else {
-      MustGather.insert(VL.begin(), VL.end());
       // Build a map for gathered scalars to the nodes where they are used.
+      bool AllConstsOrCasts = true;
       for (Value *V : VL)
-        if (!isConstant(V))
+        if (!isConstant(V)) {
+          auto *I = dyn_cast<CastInst>(V);
+          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
           ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
+        }
+      if (AllConstsOrCasts)
+        CastMaxMinBWSizes =
+            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
+      MustGather.insert(VL.begin(), VL.end());
     }
 
     if (UserTreeIdx.UserTE)
@@ -3054,6 +3072,10 @@ class BoUpSLP {
   /// Set of hashes for the list of reduction values already being analyzed.
   DenseSet<size_t> AnalyzedReductionVals;
 
+  /// Values, already been analyzed for mininmal bitwidth and found to be
+  /// non-profitable.
+  DenseSet<Value *> AnalyzedMinBWVals;
+
   /// A list of values that need to extracted out of the tree.
   /// This list holds pairs of (Internal Scalar : External User). External User
   /// can be nullptr, it means that this Internal Scalar will be used later,
@@ -3621,7 +3643,7 @@ class BoUpSLP {
   unsigned MinVecRegSize; // Set by cl::opt (default: 128).
 
   /// Instruction builder to construct the vectorized tree.
-  IRBuilder<> Builder;
+  IRBuilder<TargetFolder> Builder;
 
   /// A map of scalar integer values to the smallest bit width with which they
   /// can legally be represented. The values map to (width, signed) pairs,
@@ -3629,6 +3651,18 @@ class BoUpSLP {
   /// value must be signed-extended, rather than zero-extended, back to its
   /// original width.
   DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
+
+  /// Final size of the reduced vector, if the current graph represents the
+  /// input for the reduction and it was possible to narrow the size of the
+  /// reduction.
+  unsigned ReductionBitWidth = 0;
+
+  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
+  /// type sizes, used in the tree.
+  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
+
+  /// Indices of the vectorized trunc nodes.
+  DenseSet<unsigned> TruncNodes;
 };
 
 } // end namespace slpvectorizer
@@ -4295,9 +4329,12 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
               llvm_unreachable(
                   "Expected only consecutive, strided or masked gather loads.");
             }
+            SmallVector<int> ShuffleMask(VL.size());
+            for (int Idx : seq<int>(0, VL.size()))
+              ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
             VecLdCost +=
                 TTI.getShuffleCost(TTI ::SK_InsertSubvector, VecTy,
-                                   std::nullopt, CostKind, I * VF, SubVecTy);
+                                   ShuffleMask, CostKind, I * VF, SubVecTy);
           }
           // If masked gather cost is higher - better to vectorize, so
           // consider it as a gather node. It will be better estimated
@@ -4313,9 +4350,10 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     // increases the cost.
     Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
     bool ProfitableGatherPointers =
-        L && Sz > 2 && count_if(PointerOps, [L](Value *V) {
-                         return L->isLoopInvariant(V);
-                       }) <= Sz / 2;
+        L && Sz > 2 &&
+        static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
+          return L->isLoopInvariant(V);
+        })) <= Sz / 2;
     if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
           auto *GEP = dyn_cast<GetElementPtrInst>(P);
           return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
@@ -6539,6 +6577,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     case Instruction::Trunc:
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
+      auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
+          std::make_pair(std::numeric_limits<unsigned>::min(),
+                         std::numeric_limits<unsigned>::max()));
+      if (ShuffleOrOp == Instruction::ZExt ||
+          ShuffleOrOp == Instruction::SExt) {
+        CastMaxMinBWSizes = std::make_pair(
+            std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
+                               PrevMaxBW),
+            std::min<unsigned>(
+                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
+                PrevMinBW));
+      } else if (ShuffleOrOp == Instruction::Trunc) {
+        CastMaxMinBWSizes = std::make_pair(
+            std::max<unsigned>(
+                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
+                PrevMaxBW),
+            std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
+                               PrevMinBW));
+        TruncNodes.insert(VectorizableTree.size());
+      }
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
@@ -7400,7 +7458,7 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
         Index + NumSrcElts <= static_cast<int>(Mask.size()))
       return TTI.getShuffleCost(
           TTI::SK_InsertSubvector,
-          FixedVectorType::get(Tp->getElementType(), Mask.size()), std::nullopt,
+          FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
           TTI::TCK_RecipThroughput, Index, Tp);
   }
   return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
@@ -7673,9 +7731,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         }
         if (NeedInsertSubvectorAnalysis) {
           // Add the cost for the subvectors insert.
-          for (int I = VF, E = VL.size(); I < E; I += VF)
+          SmallVector<int> ShuffleMask(VL.size());
+          for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
+            for (unsigned Idx : seq<unsigned>(0, E))
+              ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
             GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
-                                             std::nullopt, CostKind, I, LoadTy);
+                                             ShuffleMask, CostKind, I, LoadTy);
+          }
         }
         GatherCost -= ScalarsCost;
       }
@@ -7690,16 +7752,22 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       bool NeedShuffle =
           count(VL, *It) > 1 &&
           (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
+      if (!NeedShuffle)
+        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
+                                      CostKind, std::distance(VL.begin(), It),
+                                      PoisonValue::get(VecTy), *It);
+
+      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
+      transform(VL, ShuffleMask.begin(), [](Value *V) {
+        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;   
+      });
       InstructionCost InsertCost = TTI.getVectorInstrCost(
-          Instruction::InsertElement, VecTy, CostKind,
-          NeedShuffle ? 0 : std::distance(VL.begin(), It),
+          Instruction::InsertElement, VecTy, CostKind, 0,
           PoisonValue::get(VecTy), *It);
       return InsertCost +
-             (NeedShuffle ? TTI.getShuffleCost(
-                                TargetTransformInfo::SK_Broadcast, VecTy,
-                                /*Mask=*/std::nullopt, CostKind, /*Index=*/0,
-                                /*SubTp=*/nullptr, /*Args=*/*It)
-                          : TTI::TCC_Free);
+             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
+                                ShuffleMask, CostKind, /*Index=*/0,
+                                /*SubTp=*/nullptr, /*Args=*/*It);
     }
     return GatherCost +
            (all_of(Gathers, UndefValue::classof)
@@ -8362,6 +8430,22 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
   return It->get();
 }
 
+TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
+  if (TE.State == TreeEntry::ScatterVectorize ||
+      TE.State == TreeEntry::StridedVectorize)
+    return TTI::CastContextHint::GatherScatter;
+  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
+      !TE.isAltShuffle()) {
+    if (TE.ReorderIndices.empty())
+      return TTI::CastContextHint::Normal;
+    SmallVector<int> Mask;
+    inversePermutation(TE.ReorderIndices, Mask);
+    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
+      return TTI::CastContextHint::Reversed;
+  }
+  return TTI::CastContextHint::None;
+}
+
 InstructionCost
 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                       SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -8384,6 +8468,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   // If we have computed a smaller type for the expression, update VecTy so
   // that the costs will be accurate.
   auto It = MinBWs.find(E);
+  Type *OrigScalarTy = ScalarTy;
   if (It != MinBWs.end()) {
     ScalarTy = IntegerType::get(F->getContext(), It->second.first);
     VecTy = FixedVectorType::get(ScalarTy, VL.size());
@@ -8441,24 +8526,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     UsedScalars.set(I);
   }
   auto GetCastContextHint = [&](Value *V) {
-    if (const TreeEntry *OpTE = getTreeEntry(V)) {
-      if (OpTE->State == TreeEntry::ScatterVectorize ||
-          OpTE->State == TreeEntry::StridedVectorize)
-        return TTI::CastContextHint::GatherScatter;
-      if (OpTE->State == TreeEntry::Vectorize &&
-          OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
-        if (OpTE->ReorderIndices.empty())
-          return TTI::CastContextHint::Normal;
-        SmallVector<int> Mask;
-        inversePermutation(OpTE->ReorderIndices, Mask);
-        if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
-          return TTI::CastContextHint::Reversed;
-      }
-    } else {
-      InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
-      if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
-        return TTI::CastContextHint::GatherScatter;
-    }
+    if (const TreeEntry *OpTE = getTreeEntry(V))
+      return getCastContextHint(*OpTE);
+    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
+    if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
+      return TTI::CastContextHint::GatherScatter;
     return TTI::CastContextHint::None;
   };
   auto GetCostDiff =
@@ -8507,8 +8579,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
               TTI::CastContextHint CCH = GetCastContextHint(VL0);
               VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                                CostKind);
-              ScalarCost += Sz * TTI->getCastInstrCost(VecOpcode, UserScalarTy,
-                                                       ScalarTy, CCH, CostKind);
             }
           }
         }
@@ -8525,7 +8595,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     InstructionCost ScalarCost = 0;
     InstructionCost VecCost = 0;
     std::tie(ScalarCost, VecCost) = getGEPCosts(
-        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, ScalarTy, VecTy);
+        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
     LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                              "Calculated GEPs cost for Tree"));
 
@@ -8572,7 +8642,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
           NumElts = ATy->getNumElements();
         else
           NumElts = AggregateTy->getStructNumElements();
-        SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
+        SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
       }
       if (I->hasOneUse()) {
         Instruction *Ext = I->user_back();
@@ -8740,13 +8810,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
     }
     auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
-      // Do not count cost here if minimum bitwidth is in effect and it is just
-      // a bitcast (here it is just a noop).
-      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
-        return TTI::TCC_Free;
-      auto *VI = VL0->getOpcode() == Opcode
-                     ? cast<Instruction>(UniqueValues[Idx])
-                     : nullptr;
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                    VL0->getOperand(0)->getType(),
                                    TTI::getCastContextHint(VI), CostKind, VI);
@@ -8789,7 +8853,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                                        ? CmpInst::BAD_FCMP_PREDICATE
                                        : CmpInst::BAD_ICMP_PREDICATE;
 
-      return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+      return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
                                      Builder.getInt1Ty(), CurrentPred, CostKind,
                                      VI);
     };
@@ -8844,7 +8908,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       TTI::OperandValueInfo Op2Info =
           TTI::getOperandInfo(VI->getOperand(OpIdx));
       SmallVector<const Value *> Operands(VI->operand_values());
-      return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind,
+      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                          Op1Info, Op2Info, Operands, VI);
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
@@ -8852,7 +8916,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
       TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
       return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
-                                         Op2Info) +
+                                         Op2Info, std::nullopt, nullptr, TLI) +
              CommonCost;
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -8863,9 +8927,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::Load: {
     auto GetScalarCost = [&](unsigned Idx) {
       auto *VI = cast<LoadInst>(UniqueValues[Idx]);
-      return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
-                                  VI->getPointerAddressSpace(), CostKind,
-                                  TTI::OperandValueInfo(), VI);
+      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
+                                  VI->getAlign(), VI->getPointerAddressSpace(),
+                                  CostKind, TTI::OperandValueInfo(), VI);
     };
     auto *LI0 = cast<LoadInst>(VL0);
     auto GetVectorCost = [&](InstructionCost CommonCost) {
@@ -8908,9 +8972,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     auto GetScalarCost = [=](unsigned Idx) {
       auto *VI = cast<StoreInst>(VL[Idx]);
       TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
-      return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
-                                  VI->getPointerAddressSpace(), CostKind,
-                                  OpInfo, VI);
+      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
+                                  VI->getAlign(), VI->getPointerAddressSpace(),
+                                  CostKind, OpInfo, VI);
     };
     auto *BaseSI =
         cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
@@ -9772,6 +9836,44 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     Cost -= InsertCost;
   }
 
+  // Add the cost for reduced value resize (if required).
+  if (ReductionBitWidth != 0) {
+    assert(UserIgnoreList && "Expected reduction tree.");
+    const TreeEntry &E = *VectorizableTree.front().get();
+    auto It = MinBWs.find(&E);
+    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
+      unsigned SrcSize = It->second.first;
+      unsigned DstSize = ReductionBitWidth;
+      unsigned Opcode = Instruction::Trunc;
+      if (SrcSize < DstSize)
+        Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+      auto *SrcVecTy =
+          FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
+      auto *DstVecTy =
+          FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
+      TTI::CastContextHint CCH = getCastContextHint(E);
+      InstructionCost CastCost;
+      switch (E.getOpcode()) {
+      case Instruction::SExt:
+      case Instruction::ZExt:
+      case Instruction::Trunc: {
+        const TreeEntry *OpTE = getOperandEntry(&E, 0);
+        CCH = getCastContextHint(*OpTE);
+        break;
+      }
+      default:
+        break;
+      }
+      CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
+                                        TTI::TCK_RecipThroughput);
+      Cost += CastCost;
+      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
+                        << " for final resize for reduction from " << SrcVecTy
+                        << " to " << DstVecTy << "\n";
+                 dbgs() << "SLP: Current total cost = " << Cost << "\n");
+    }
+  }
+
 #ifndef NDEBUG
   SmallString<256> Str;
   {
@@ -10053,11 +10155,6 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
       if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
         continue;
-      auto It = MinBWs.find(VTE);
-      // If vectorize node is demoted - do not match.
-      if (It != MinBWs.end() &&
-          It->second.first != DL->getTypeSizeInBits(V->getType()))
-        continue;
       VToTEs.insert(VTE);
     }
     if (VToTEs.empty())
@@ -10374,7 +10471,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
   // Check if the same elements are inserted several times and count them as
   // shuffle candidates.
   APInt ShuffledElements = APInt::getZero(VL.size());
-  DenseSet<Value *> UniqueElements;
+  DenseMap<Value *, unsigned> UniqueElements;
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost;
   auto EstimateInsertCost = [&](unsigned I, Value *V) {
@@ -10383,27 +10480,34 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
           TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                   I, Constant::getNullValue(VecTy), V);
   };
+  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
   for (unsigned I = 0, E = VL.size(); I < E; ++I) {
     Value *V = VL[I];
     // No need to shuffle duplicates for constants.
     if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
       ShuffledElements.setBit(I);
+      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
       continue;
     }
-    if (!UniqueElements.insert(V).second) {
-      DuplicateNonConst = true;
-      ShuffledElements.setBit(I);
+
+    auto Res = UniqueElements.try_emplace(V, I);
+    if (Res.second) {
+      EstimateInsertCost(I, V);
+      ShuffleMask[I] = I;
       continue;
     }
-    EstimateInsertCost(I, V);
+
+    DuplicateNonConst = true;
+    ShuffledElements.setBit(I);
+    ShuffleMask[I] = Res.first->second;
   }
   if (ForPoisonSrc)
     Cost =
         TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
                                       /*Extract*/ false, CostKind);
   if (DuplicateNonConst)
-    Cost +=
-        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+    Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                VecTy, ShuffleMask);
   return Cost;
 }
 
@@ -10614,8 +10718,20 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
         PostponedInsts.emplace_back(Inst, I);
   }
 
-  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) {
-    Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos));
+  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
+                                      Type *Ty) {
+    Value *Scalar = V;
+    if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
+      assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
+             "Expected integer types only.");
+      Vec = Builder.CreateIntCast(
+          Vec,
+          VectorType::get(Ty,
+                          cast<VectorType>(Vec->getType())->getElementCount()),
+          !isKnownNonNegative(Vec, SimplifyQuery(*DL)));
+    }
+
+    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
     auto *InsElt = dyn_cast<InsertElementInst>(Vec);
     if (!InsElt)
       return Vec;
@@ -10625,15 +10741,25 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
     if (isa<Instruction>(V)) {
       if (TreeEntry *Entry = getTreeEntry(V)) {
         // Find which lane we need to extract.
-        unsigned FoundLane = Entry->findLaneForValue(V);
-        ExternalUses.emplace_back(V, InsElt, FoundLane);
+        User *UserOp = nullptr;
+        if (Scalar != V) {
+          if (auto *SI = dyn_cast<Instruction>(Scalar))
+            UserOp = SI;
+        } else {
+          UserOp = InsElt;
+        }
+        if (UserOp) {
+          unsigned FoundLane = Entry->findLaneForValue(V);
+          ExternalUses.emplace_back(V, UserOp, FoundLane);
+        }
       }
     }
     return Vec;
   };
   Value *Val0 =
       isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
-  FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
+  Type *ScalarTy = Val0->getType();
+  FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size());
   Value *Vec = Root ? Root : PoisonValue::get(VecTy);
   SmallVector<int> NonConsts;
   // Insert constant values at first.
@@ -10656,15 +10782,15 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
           continue;
       }
     }
-    Vec = CreateInsertElement(Vec, VL[I], I);
+    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
   }
   // Insert non-constant values.
   for (int I : NonConsts)
-    Vec = CreateInsertElement(Vec, VL[I], I);
+    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
   // Append instructions, which are/may be part of the loop, in the end to make
   // it possible to hoist non-loop-based instructions.
   for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
-    Vec = CreateInsertElement(Vec, Pair.first, Pair.second);
+    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
 
   return Vec;
 }
@@ -10721,16 +10847,35 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     SetVector<Instruction *> &GatherShuffleExtractSeq;
     /// A list of blocks that we are going to CSE.
     DenseSet<BasicBlock *> &CSEBlocks;
+    /// Data layout.
+    const DataLayout &DL;
 
   public:
     ShuffleIRBuilder(IRBuilderBase &Builder,
                      SetVector<Instruction *> &GatherShuffleExtractSeq,
-                     DenseSet<BasicBlock *> &CSEBlocks)
+                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
         : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
-          CSEBlocks(CSEBlocks) {}
+          CSEBlocks(CSEBlocks), DL(DL) {}
     ~ShuffleIRBuilder() = default;
     /// Creates shufflevector for the 2 operands with the given mask.
     Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
+      if (V1->getType() != V2->getType()) {
+        assert(V1->getType()->isIntOrIntVectorTy() &&
+               V1->getType()->isIntOrIntVectorTy() &&
+               "Expected integer vector types only.");
+        if (V1->getType() != V2->getType()) {
+          if (cast<VectorType>(V2->getType())
+                  ->getElementType()
+                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
+                                               ->getElementType()
+                                               ->getIntegerBitWidth())
+            V2 = Builder.CreateIntCast(
+                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
+          else
+            V1 = Builder.CreateIntCast(
+                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
+        }
+      }
       Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
       if (auto *I = dyn_cast<Instruction>(Vec)) {
         GatherShuffleExtractSeq.insert(I);
@@ -10789,7 +10934,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
     assert(V1 && "Expected at least one vector value.");
     ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
-                                    R.CSEBlocks);
+                                    R.CSEBlocks, *R.DL);
     return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                        ShuffleBuilder);
   }
@@ -11656,7 +11801,7 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
 }
 
 Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
-  IRBuilder<>::InsertPointGuard Guard(Builder);
+  IRBuilderBase::InsertPointGuard Guard(Builder);
 
   if (E->VectorizedValue &&
       (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
@@ -11675,10 +11820,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   }
 
   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
-  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
-                          bool IsSigned) {
-    if (V->getType() != VecTy)
-      V = Builder.CreateIntCast(V, VecTy, IsSigned);
+  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
     ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
     if (E->getOpcode() == Instruction::Store) {
       ArrayRef<int> Mask =
@@ -11705,12 +11847,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
     ScalarTy = Store->getValueOperand()->getType();
   else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
     ScalarTy = IE->getOperand(1)->getType();
-  bool IsSigned = false;
   auto It = MinBWs.find(E);
-  if (It != MinBWs.end()) {
+  if (It != MinBWs.end())
     ScalarTy = IntegerType::get(F->getContext(), It->second.first);
-    IsSigned = It->second.second;
-  }
+  auto GetOperandSignedness = [&](unsigned Idx) {
+    const TreeEntry *OpE = getOperandEntry(E, Idx);
+    bool IsSigned = false;
+    auto It = MinBWs.find(OpE);
+    if (It != MinBWs.end())
+      IsSigned = It->second.second;
+    else
+      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
+        return !isKnownNonNegative(R, SimplifyQuery(*DL));
+      });
+    return IsSigned;
+  };
   auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
@@ -11734,7 +11885,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
                                PH->getParent()->getFirstInsertionPt());
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
 
-        V = FinalShuffle(V, E, VecTy, IsSigned);
+        V = FinalShuffle(V, E, VecTy);
 
         E->VectorizedValue = V;
         if (PostponedPHIs)
@@ -11768,9 +11919,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
         Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
         if (VecTy != Vec->getType()) {
-          assert(MinBWs.contains(getOperandEntry(E, I)) &&
+          assert((getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
+                  MinBWs.contains(getOperandEntry(E, I))) &&
                  "Expected item in MinBWs.");
-          Vec = Builder.CreateIntCast(Vec, VecTy, It->second.second);
+          Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
         }
         NewPhi->addIncoming(Vec, IBB);
       }
@@ -11785,7 +11937,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       if (const TreeEntry *TE = getTreeEntry(V))
         V = TE->VectorizedValue;
       setInsertPointAfterBundle(E);
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
       E->VectorizedValue = V;
       return V;
     }
@@ -11795,7 +11947,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *Ptr = LI->getPointerOperand();
       LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
       Value *NewV = propagateMetadata(V, E->Scalars);
-      NewV = FinalShuffle(NewV, E, VecTy, IsSigned);
+      NewV = FinalShuffle(NewV, E, VecTy);
       E->VectorizedValue = NewV;
       return NewV;
     }
@@ -11981,10 +12133,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
 
       auto *CI = cast<CastInst>(VL0);
       Instruction::CastOps VecOpcode = CI->getOpcode();
-      Type *SrcScalarTy = VL0->getOperand(0)->getType();
+      Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
       auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
       if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
-          (SrcIt != MinBWs.end() || It != MinBWs.end())) {
+          (SrcIt != MinBWs.end() || It != MinBWs.end() ||
+           SrcScalarTy != CI->getOperand(0)->getType())) {
         // Check if the values are candidates to demote.
         unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
         if (SrcIt != MinBWs.end())
@@ -12003,7 +12156,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                      ? InVec
                      : Builder.CreateCast(VecOpcode, InVec, VecTy);
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12024,11 +12177,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         return E->VectorizedValue;
       }
       if (L->getType() != R->getType()) {
-        assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+        assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
+                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                MinBWs.contains(getOperandEntry(E, 0)) ||
                 MinBWs.contains(getOperandEntry(E, 1))) &&
                "Expected item in MinBWs.");
-        L = Builder.CreateIntCast(L, VecTy, IsSigned);
-        R = Builder.CreateIntCast(R, VecTy, IsSigned);
+        if (cast<VectorType>(L->getType())
+                ->getElementType()
+                ->getIntegerBitWidth() < cast<VectorType>(R->getType())
+                                             ->getElementType()
+                                             ->getIntegerBitWidth()) {
+          Type *CastTy = R->getType();
+          L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
+        } else {
+          Type *CastTy = L->getType();
+          R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
+        }
       }
 
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
@@ -12036,7 +12200,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       propagateIRFlags(V, E->Scalars, VL0);
       // Do not cast for cmps.
       VecTy = cast<FixedVectorType>(V->getType());
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12060,16 +12224,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
         return E->VectorizedValue;
       }
-      if (True->getType() != False->getType()) {
-        assert((MinBWs.contains(getOperandEntry(E, 1)) ||
+      if (True->getType() != VecTy || False->getType() != VecTy) {
+        assert((getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
+                MinBWs.contains(getOperandEntry(E, 1)) ||
                 MinBWs.contains(getOperandEntry(E, 2))) &&
                "Expected item in MinBWs.");
-        True = Builder.CreateIntCast(True, VecTy, IsSigned);
-        False = Builder.CreateIntCast(False, VecTy, IsSigned);
+        if (True->getType() != VecTy)
+          True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
+        if (False->getType() != VecTy)
+          False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
       }
 
       Value *V = Builder.CreateSelect(Cond, True, False);
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12091,7 +12259,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       if (auto *I = dyn_cast<Instruction>(V))
         V = propagateMetadata(I, E->Scalars);
 
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12128,12 +12296,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
         return E->VectorizedValue;
       }
-      if (LHS->getType() != RHS->getType()) {
-        assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+      if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
+        assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
+                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                MinBWs.contains(getOperandEntry(E, 0)) ||
                 MinBWs.contains(getOperandEntry(E, 1))) &&
                "Expected item in MinBWs.");
-        LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
-        RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
+        if (LHS->getType() != VecTy)
+          LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
+        if (RHS->getType() != VecTy)
+          RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
       }
 
       Value *V = Builder.CreateBinOp(
@@ -12143,7 +12315,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       if (auto *I = dyn_cast<Instruction>(V))
         V = propagateMetadata(I, E->Scalars);
 
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12214,7 +12386,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       }
       Value *V = propagateMetadata(NewLI, E->Scalars);
 
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
       E->VectorizedValue = V;
       ++NumVectorInstructions;
       return V;
@@ -12225,7 +12397,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       setInsertPointAfterBundle(E);
 
       Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
-      VecValue = FinalShuffle(VecValue, E, VecTy, IsSigned);
+      if (VecValue->getType() != VecTy)
+        VecValue =
+            Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
+      VecValue = FinalShuffle(VecValue, E, VecTy);
 
       Value *Ptr = SI->getPointerOperand();
       StoreInst *ST =
@@ -12267,7 +12442,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         V = propagateMetadata(I, GEPs);
       }
 
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12332,7 +12507,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
 
       propagateIRFlags(V, E->Scalars, VL0);
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12364,12 +12539,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
         return E->VectorizedValue;
       }
-      if (LHS && RHS && LHS->getType() != RHS->getType()) {
-        assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+      if (LHS && RHS &&
+          ((Instruction::isBinaryOp(E->getOpcode()) &&
+            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
+           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
+        assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
+                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                MinBWs.contains(getOperandEntry(E, 0)) ||
                 MinBWs.contains(getOperandEntry(E, 1))) &&
                "Expected item in MinBWs.");
-        LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
-        RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
+        Type *CastTy = VecTy;
+        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
+          if (cast<VectorType>(LHS->getType())
+                  ->getElementType()
+                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
+                                               ->getElementType()
+                                               ->getIntegerBitWidth())
+            CastTy = RHS->getType();
+          else
+            CastTy = LHS->getType();
+        }
+        if (LHS->getType() != VecTy)
+          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
+        if (RHS->getType() != VecTy)
+          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
       }
 
       Value *V0, *V1;
@@ -12421,9 +12614,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         CSEBlocks.insert(I->getParent());
       }
 
-      if (V->getType() != VecTy && !isa<CmpInst>(VL0))
-        V = Builder.CreateIntCast(
-            V, FixedVectorType::get(ScalarTy, E->getVectorFactor()), IsSigned);
       E->VectorizedValue = V;
       ++NumVectorInstructions;
 
@@ -12519,6 +12709,15 @@ Value *BoUpSLP::vectorizeTree(
     }
     Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
     Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
+    if (Vec->getType() != PrevVec->getType()) {
+      assert(Vec->getType()->isIntOrIntVectorTy() &&
+             PrevVec->getType()->isIntOrIntVectorTy() &&
+             "Expected integer vector types only.");
+      assert(MinBWs.contains(TE->UserTreeIndices.front().UserTE) &&
+             "Expected user in MinBWs.");
+      bool IsSigned = MinBWs.lookup(TE->UserTreeIndices.front().UserTE).second;
+      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), IsSigned);
+    }
     PrevVec->replaceAllUsesWith(Vec);
     PostponedValues.try_emplace(Vec).first->second.push_back(TE);
     // Replace the stub vector node, if it was used before for one of the
@@ -12539,7 +12738,9 @@ Value *BoUpSLP::vectorizeTree(
   DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
   // Maps extract Scalar to the corresponding extractelement instruction in the
   // basic block. Only one extractelement per block should be emitted.
-  DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
+  DenseMap<Value *,
+           DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
+      ScalarToEEs;
   SmallDenseSet<Value *, 4> UsedInserts;
   DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
   SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
@@ -12568,18 +12769,23 @@ Value *BoUpSLP::vectorizeTree(
     auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
       if (Scalar->getType() != Vec->getType()) {
         Value *Ex = nullptr;
+        Value *ExV = nullptr;
         auto It = ScalarToEEs.find(Scalar);
         if (It != ScalarToEEs.end()) {
           // No need to emit many extracts, just move the only one in the
           // current block.
           auto EEIt = It->second.find(Builder.GetInsertBlock());
           if (EEIt != It->second.end()) {
-            Instruction *I = EEIt->second;
+            Instruction *I = EEIt->second.first;
             if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
-                Builder.GetInsertPoint()->comesBefore(I))
+                Builder.GetInsertPoint()->comesBefore(I)) {
               I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                             Builder.GetInsertPoint());
+              if (auto *CI = EEIt->second.second)
+                CI->moveAfter(I);
+            }
             Ex = I;
+            ExV = EEIt->second.second ? EEIt->second.second : Ex;
           }
         }
         if (!Ex) {
@@ -12594,11 +12800,14 @@ Value *BoUpSLP::vectorizeTree(
           }
           // If necessary, sign-extend or zero-extend ScalarRoot
           // to the larger type.
+          ExV = Ex;
           if (Scalar->getType() != Ex->getType())
-            Ex = Builder.CreateIntCast(Ex, Scalar->getType(),
-                                       MinBWs.find(E)->second.second);
+            ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
+                                        MinBWs.find(E)->second.second);
           if (auto *I = dyn_cast<Instruction>(Ex))
-            ScalarToEEs[Scalar].try_emplace(Builder.GetInsertBlock(), I);
+            ScalarToEEs[Scalar].try_emplace(
+                Builder.GetInsertBlock(),
+                std::make_pair(I, cast<Instruction>(ExV)));
         }
         // The then branch of the previous if may produce constants, since 0
         // operand might be a constant.
@@ -12606,7 +12815,7 @@ Value *BoUpSLP::vectorizeTree(
           GatherShuffleExtractSeq.insert(ExI);
           CSEBlocks.insert(ExI->getParent());
         }
-        return Ex;
+        return ExV;
       }
       assert(isa<FixedVectorType>(Scalar->getType()) &&
              isa<InsertElementInst>(Scalar) &&
@@ -12669,7 +12878,7 @@ Value *BoUpSLP::vectorizeTree(
             auto Key = std::make_pair(Vec, ScalarTy);
             auto VecIt = VectorCasts.find(Key);
             if (VecIt == VectorCasts.end()) {
-              IRBuilder<>::InsertPointGuard Guard(Builder);
+              IRBuilderBase::InsertPointGuard Guard(Builder);
               if (auto *IVec = dyn_cast<Instruction>(Vec))
                 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
               Vec = Builder.CreateIntCast(
@@ -12928,7 +13137,21 @@ Value *BoUpSLP::vectorizeTree(
   Builder.ClearInsertionPoint();
   InstrElementSize.clear();
 
-  return VectorizableTree[0]->VectorizedValue;
+  const TreeEntry &RootTE = *VectorizableTree.front().get();
+  Value *Vec = RootTE.VectorizedValue;
+  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
+                                      It != MinBWs.end() &&
+                                      ReductionBitWidth != It->second.first) {
+    IRBuilder<>::InsertPointGuard Guard(Builder);
+    Builder.SetInsertPoint(ReductionRoot->getParent(),
+                           ReductionRoot->getIterator());
+    Vec = Builder.CreateIntCast(
+        Vec,
+        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
+                        cast<VectorType>(Vec->getType())->getElementCount()),
+        It->second.second);
+  }
+  return Vec;
 }
 
 void BoUpSLP::optimizeGatherSequence() {
@@ -13748,36 +13971,90 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
 // smaller type with a truncation. We collect the values that will be demoted
 // in ToDemote and additional roots that require investigating in Roots.
 bool BoUpSLP::collectValuesToDemote(
-    Value *V, SmallVectorImpl<Value *> &ToDemote,
+    Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+    SmallVectorImpl<Value *> &ToDemote,
     DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
-    SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const {
+    DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
+    bool &IsProfitableToDemote, bool IsTruncRoot) const {
   // We can always demote constants.
   if (isa<Constant>(V))
     return true;
 
+  if (DL->getTypeSizeInBits(V->getType()) == BitWidth) {
+    MaxDepthLevel = 1;
+    return true;
+  }
+
   // If the value is not a vectorized instruction in the expression and not used
   // by the insertelement instruction and not used in multiple vector nodes, it
   // cannot be demoted.
+  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
+    if (MultiNodeScalars.contains(V))
+      return false;
+    uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
+    APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+    if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
+      return true;
+    auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
+    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
+    if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
+      ++BitWidth1;
+    BitWidth = std::max(BitWidth, BitWidth1);
+    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
+  };
+  auto FinalAnalysis = [&](const TreeEntry *ITE = nullptr) {
+    if (!IsProfitableToDemote)
+      return false;
+    return (ITE && ITE->UserTreeIndices.size() > 1) ||
+           IsPotentiallyTruncated(V, BitWidth);
+  };
+  // TODO: improve handling of gathered values and others.
   auto *I = dyn_cast<Instruction>(V);
-  if (!I || !getTreeEntry(I) || MultiNodeScalars.contains(I) ||
-      !Visited.insert(I).second || all_of(I->users(), [&](User *U) {
+  const TreeEntry *ITE = I ? getTreeEntry(I) : nullptr;
+  if (!ITE || !Visited.insert(I).second || MultiNodeScalars.contains(I) ||
+      all_of(I->users(), [&](User *U) {
         return isa<InsertElementInst>(U) && !getTreeEntry(U);
       }))
-    return false;
+    return FinalAnalysis();
 
   unsigned Start = 0;
   unsigned End = I->getNumOperands();
+
+  auto ProcessOperands = [&](ArrayRef<Value *> Operands, bool &NeedToExit) {
+    NeedToExit = false;
+    unsigned InitLevel = MaxDepthLevel;
+    for (Value *IncValue : Operands) {
+      unsigned Level = InitLevel;
+      if (!collectValuesToDemote(IncValue, IsProfitableToDemoteRoot, BitWidth,
+                                 ToDemote, DemotedConsts, Visited, Level,
+                                 IsProfitableToDemote, IsTruncRoot)) {
+        if (!IsProfitableToDemote)
+          return false;
+        NeedToExit = true;
+        if (!FinalAnalysis(ITE))
+          return false;
+        continue;
+      }
+      MaxDepthLevel = std::max(MaxDepthLevel, Level);
+    }
+    return true;
+  };
+  bool NeedToExit = false;
   switch (I->getOpcode()) {
 
   // We can always demote truncations and extensions. Since truncations can
   // seed additional demotion, we save the truncated value.
   case Instruction::Trunc:
-    Roots.push_back(I->getOperand(0));
+    if (!IsTruncRoot)
+      MaxDepthLevel = 1;
+    if (IsProfitableToDemoteRoot)
+      IsProfitableToDemote = true;
     break;
   case Instruction::ZExt:
   case Instruction::SExt:
-    if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0)))
-      return false;
+    if (!IsTruncRoot)
+      MaxDepthLevel = 1;
+    IsProfitableToDemote = true;
     break;
 
   // We can demote certain binary operations if we can demote both of their
@@ -13787,22 +14064,21 @@ bool BoUpSLP::collectValuesToDemote(
   case Instruction::Mul:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor:
-    if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots,
-                               Visited) ||
-        !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots,
-                               Visited))
+  case Instruction::Xor: {
+    if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
+      return false;
+    if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
       return false;
     break;
+  }
 
   // We can demote selects if we can demote their true and false values.
   case Instruction::Select: {
+    if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
+      return false;
     Start = 1;
-    SelectInst *SI = cast<SelectInst>(I);
-    if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts,
-                               Roots, Visited) ||
-        !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts,
-                               Roots, Visited))
+    auto *SI = cast<SelectInst>(I);
+    if (!ProcessOperands({SI->getTrueValue(), SI->getFalseValue()}, NeedToExit))
       return false;
     break;
   }
@@ -13811,172 +14087,267 @@ bool BoUpSLP::collectValuesToDemote(
   // we don't need to worry about cycles since we ensure single use above.
   case Instruction::PHI: {
     PHINode *PN = cast<PHINode>(I);
-    for (Value *IncValue : PN->incoming_values())
-      if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots,
-                                 Visited))
-        return false;
+    if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
+      return false;
+    SmallVector<Value *> Ops(PN->incoming_values().begin(),
+                             PN->incoming_values().end());
+    if (!ProcessOperands(Ops, NeedToExit))
+      return false;
     break;
   }
 
   // Otherwise, conservatively give up.
   default:
-    return false;
+    MaxDepthLevel = 1;
+    return FinalAnalysis();
   }
+  if (NeedToExit)
+    return true;
 
+  ++MaxDepthLevel;
   // Gather demoted constant operands.
   for (unsigned Idx : seq<unsigned>(Start, End))
     if (isa<Constant>(I->getOperand(Idx)))
       DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
   // Record the value that we can demote.
   ToDemote.push_back(V);
-  return true;
+  return IsProfitableToDemote;
 }
 
 void BoUpSLP::computeMinimumValueSizes() {
   // We only attempt to truncate integer expressions.
-  auto &TreeRoot = VectorizableTree[0]->Scalars;
-  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
-  if (!TreeRootIT || VectorizableTree.front()->State == TreeEntry::NeedToGather)
+  bool IsStoreOrInsertElt =
+      VectorizableTree.front()->getOpcode() == Instruction::Store ||
+      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
+  if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 &&
+      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
+       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
     return;
 
+  unsigned NodeIdx = 0;
+  if (IsStoreOrInsertElt &&
+      VectorizableTree.front()->State != TreeEntry::NeedToGather)
+    NodeIdx = 1;
+
   // Ensure the roots of the vectorizable tree don't form a cycle.
-  if (!VectorizableTree.front()->UserTreeIndices.empty())
+  if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
+      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
+      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
+                              [NodeIdx](const EdgeInfo &EI) {
+                                return EI.UserTE->Idx >
+                                       static_cast<int>(NodeIdx);
+                              })))
     return;
 
-  // Conservatively determine if we can actually truncate the roots of the
-  // expression. Collect the values that can be demoted in ToDemote and
-  // additional roots that require investigating in Roots.
-  SmallVector<Value *, 32> ToDemote;
-  DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
-  SmallVector<Value *, 4> Roots;
-  for (auto *Root : TreeRoot) {
-    DenseSet<Value *> Visited;
-    if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited))
-      return;
-  }
-
-  // The maximum bit width required to represent all the values that can be
-  // demoted without loss of precision. It would be safe to truncate the roots
-  // of the expression to this width.
-  auto MaxBitWidth = 1u;
-
-  // We first check if all the bits of the roots are demanded. If they're not,
-  // we can truncate the roots to this narrower type.
-  for (auto *Root : TreeRoot) {
-    auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
-    MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
-                                     MaxBitWidth);
-  }
-
-  // True if the roots can be zero-extended back to their original type, rather
-  // than sign-extended. We know that if the leading bits are not demanded, we
-  // can safely zero-extend. So we initialize IsKnownPositive to True.
-  bool IsKnownPositive = true;
-
-  // If all the bits of the roots are demanded, we can try a little harder to
-  // compute a narrower type. This can happen, for example, if the roots are
-  // getelementptr indices. InstCombine promotes these indices to the pointer
-  // width. Thus, all their bits are technically demanded even though the
-  // address computation might be vectorized in a smaller type.
-  //
-  // We start by looking at each entry that can be demoted. We compute the
-  // maximum bit width required to store the scalar by using ValueTracking to
-  // compute the number of high-order bits we can truncate.
-  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
-      all_of(TreeRoot, [](Value *V) {
-        return all_of(V->users(),
-                      [](User *U) { return isa<GetElementPtrInst>(U); });
-      })) {
-    MaxBitWidth = 8u;
+  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
+  // resize to the final type.
+  bool IsTruncRoot = false;
+  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
+  if (NodeIdx != 0 &&
+      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
+      (VectorizableTree[NodeIdx]->getOpcode() == Instruction::ZExt ||
+       VectorizableTree[NodeIdx]->getOpcode() == Instruction::SExt ||
+       VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc)) {
+    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
+    IsTruncRoot = VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc;
+    IsProfitableToDemoteRoot = true;
+    ++NodeIdx;
+  }
+
+  // Analyzed in reduction already and not profitable - exit.
+  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
+    return;
 
+  SmallVector<Value *> ToDemote;
+  DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
+  auto ComputeMaxBitWidth = [&](ArrayRef<Value *> TreeRoot, unsigned VF,
+                                bool IsTopRoot, bool IsProfitableToDemoteRoot,
+                                unsigned Opcode, unsigned Limit, bool IsTruncRoot) {
+    ToDemote.clear();
+    auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+    if (!TreeRootIT || !Opcode)
+      return 0u;
+
+    if (AnalyzedMinBWVals.contains(TreeRoot.front()))
+      return 0u;
+
+    unsigned NumParts = TTI->getNumberOfParts(
+        FixedVectorType::get(TreeRoot.front()->getType(), VF));
+
+    // The maximum bit width required to represent all the values that can be
+    // demoted without loss of precision. It would be safe to truncate the roots
+    // of the expression to this width.
+    unsigned MaxBitWidth = 1u;
+
+    // True if the roots can be zero-extended back to their original type,
+    // rather than sign-extended. We know that if the leading bits are not
+    // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
+    // True.
     // Determine if the sign bit of all the roots is known to be zero. If not,
     // IsKnownPositive is set to False.
-    IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
+    bool IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
       KnownBits Known = computeKnownBits(R, *DL);
       return Known.isNonNegative();
     });
 
-    // Determine the maximum number of bits required to store the scalar
-    // values.
-    for (auto *Scalar : ToDemote) {
-      auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
-      auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
-      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
-    }
-
-    // If we can't prove that the sign bit is zero, we must add one to the
-    // maximum bit width to account for the unknown sign bit. This preserves
-    // the existing sign bit so we can safely sign-extend the root back to the
-    // original type. Otherwise, if we know the sign bit is zero, we will
-    // zero-extend the root instead.
-    //
-    // FIXME: This is somewhat suboptimal, as there will be cases where adding
-    //        one to the maximum bit width will yield a larger-than-necessary
-    //        type. In general, we need to add an extra bit only if we can't
-    //        prove that the upper bit of the original type is equal to the
-    //        upper bit of the proposed smaller type. If these two bits are the
-    //        same (either zero or one) we know that sign-extending from the
-    //        smaller type will result in the same value. Here, since we can't
-    //        yet prove this, we are just making the proposed smaller type
-    //        larger to ensure correctness.
-    if (!IsKnownPositive)
-      ++MaxBitWidth;
-  }
-
-  // Round MaxBitWidth up to the next power-of-two.
-  MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
-
-  // If the maximum bit width we compute is less than the with of the roots'
-  // type, we can proceed with the narrowing. Otherwise, do nothing.
-  if (MaxBitWidth >= TreeRootIT->getBitWidth())
-    return;
+    // We first check if all the bits of the roots are demanded. If they're not,
+    // we can truncate the roots to this narrower type.
+    for (auto *Root : TreeRoot) {
+      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
+      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
+      unsigned BitWidth1 = NumTypeBits - NumSignBits;
+      // If we can't prove that the sign bit is zero, we must add one to the
+      // maximum bit width to account for the unknown sign bit. This preserves
+      // the existing sign bit so we can safely sign-extend the root back to the
+      // original type. Otherwise, if we know the sign bit is zero, we will
+      // zero-extend the root instead.
+      //
+      // FIXME: This is somewhat suboptimal, as there will be cases where adding
+      //        one to the maximum bit width will yield a larger-than-necessary
+      //        type. In general, we need to add an extra bit only if we can't
+      //        prove that the upper bit of the original type is equal to the
+      //        upper bit of the proposed smaller type. If these two bits are
+      //        the same (either zero or one) we know that sign-extending from
+      //        the smaller type will result in the same value. Here, since we
+      //        can't yet prove this, we are just making the proposed smaller
+      //        type larger to ensure correctness.
+      if (!IsKnownPositive)
+        ++BitWidth1;
+
+      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
+      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
+      MaxBitWidth =
+          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
+    }
+
+    if (MaxBitWidth < 8 && MaxBitWidth > 1)
+      MaxBitWidth = 8;
+
+    // If the original type is large, but reduced type does not improve the reg
+    // use - ignore it.
+    if (NumParts > 1 &&
+        NumParts ==
+            TTI->getNumberOfParts(FixedVectorType::get(
+                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
+      return 0u;
+
+    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
+                                Opcode == Instruction::SExt ||
+                                Opcode == Instruction::ZExt || NumParts > 1;
+    // Conservatively determine if we can actually truncate the roots of the
+    // expression. Collect the values that can be demoted in ToDemote and
+    // additional roots that require investigating in Roots.
+    for (auto *Root : TreeRoot) {
+      DenseSet<Value *> Visited;
+      unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
+      bool NeedToDemote = IsProfitableToDemote;
+
+      if (!collectValuesToDemote(Root, IsProfitableToDemoteRoot, MaxBitWidth,
+                                 ToDemote, DemotedConsts, Visited,
+                                 MaxDepthLevel, NeedToDemote, IsTruncRoot) ||
+          (MaxDepthLevel <= Limit &&
+           !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
+              (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
+               DL->getTypeSizeInBits(Root->getType()) /
+                       DL->getTypeSizeInBits(
+                           cast<Instruction>(Root)->getOperand(0)->getType()) >
+                   2)))))
+        return 0u;
+    }
+    // Round MaxBitWidth up to the next power-of-two.
+    MaxBitWidth = bit_ceil(MaxBitWidth);
+
+    return MaxBitWidth;
+  };
 
   // If we can truncate the root, we must collect additional values that might
   // be demoted as a result. That is, those seeded by truncations we will
   // modify.
-  while (!Roots.empty()) {
-    DenseSet<Value *> Visited;
-    collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots,
-                          Visited);
-  }
-
-  // Check that all users are marked for demotion.
-  DenseSet<Value *> Demoted(ToDemote.begin(), ToDemote.end());
-  DenseSet<const TreeEntry *> Visited;
-  for (Value *V: ToDemote) {
-    const TreeEntry *TE = getTreeEntry(V);
-    assert(TE && "Expected vectorized scalar.");
-    if (!Visited.insert(TE).second)
-      continue;
-    if (!all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
-          return all_of(EI.UserTE->Scalars,
-                        [&](Value *V) { return Demoted.contains(V); });
-        }))
-      return;
-  }
-  // Finally, map the values we can demote to the maximum bit with we computed.
-  for (auto *Scalar : ToDemote) {
-    auto *TE = getTreeEntry(Scalar);
-    assert(TE && "Expected vectorized scalar.");
-    if (MinBWs.contains(TE))
+  // Add reduction ops sizes, if any.
+  if (UserIgnoreList &&
+      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
+    for (Value *V : *UserIgnoreList) {
+      auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
+      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
+      unsigned BitWidth1 = NumTypeBits - NumSignBits;
+      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
+        ++BitWidth1;
+      auto Mask = DB->getDemandedBits(cast<Instruction>(V));
+      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
+      ReductionBitWidth =
+          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
+    }
+    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
+      ReductionBitWidth = 8;
+
+    ReductionBitWidth = bit_ceil(ReductionBitWidth);
+  }
+  bool IsTopRoot = NodeIdx == 0;
+  while (NodeIdx < VectorizableTree.size() &&
+         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
+         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
+    ++NodeIdx;
+    IsTruncRoot = true;
+  }
+  while (NodeIdx < VectorizableTree.size()) {
+    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
+    unsigned Limit = 2;
+    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
+    if (IsTopRoot &&
+        ReductionBitWidth ==
+            DL->getTypeSizeInBits(
+                VectorizableTree.front()->Scalars.front()->getType()))
+      Limit = 3;
+    unsigned MaxBitWidth = ComputeMaxBitWidth(
+        TreeRoot, VectorizableTree[NodeIdx]->getVectorFactor(), IsTopRoot,
+        IsProfitableToDemoteRoot, Opcode, Limit, IsTruncRoot);
+    IsTopRoot = false;
+    IsProfitableToDemoteRoot = true;
+
+    if (TruncNodes.empty()) {
+      NodeIdx = VectorizableTree.size();
+    } else {
+      NodeIdx = *TruncNodes.begin() + 1;
+      TruncNodes.erase(TruncNodes.begin());
+      IsTruncRoot = true;
+    }
+
+    // If the maximum bit width we compute is less than the with of the roots'
+    // type, we can proceed with the narrowing. Otherwise, do nothing.
+    if (MaxBitWidth == 0 ||
+        MaxBitWidth >=
+            cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
+      if (UserIgnoreList)
+        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
       continue;
-    bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
-      KnownBits Known = computeKnownBits(R, *DL);
-      return !Known.isNonNegative();
-    });
-    MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
-    const auto *I = cast<Instruction>(Scalar);
-    auto DCIt = DemotedConsts.find(I);
-    if (DCIt != DemotedConsts.end()) {
-      for (unsigned Idx : DCIt->getSecond()) {
-        // Check that all instructions operands are demoted.
-        if (all_of(TE->Scalars, [&](Value *V) {
-              auto SIt = DemotedConsts.find(cast<Instruction>(V));
-              return SIt != DemotedConsts.end() &&
-                     is_contained(SIt->getSecond(), Idx);
-            })) {
+    }
+
+    // Finally, map the values we can demote to the maximum bit with we
+    // computed.
+    for (Value *Scalar : ToDemote) {
+      TreeEntry *TE = getTreeEntry(Scalar);
+      assert(TE && "Expected vectorized scalar.");
+      if (MinBWs.contains(TE))
+        continue;
+      bool IsSigned = TE->getOpcode() == Instruction::SExt ||
+                      any_of(TE->Scalars, [&](Value *R) {
+                        return !isKnownNonNegative(R, SimplifyQuery(*DL));
+                      });
+      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
+      const auto *I = cast<Instruction>(Scalar);
+      auto DCIt = DemotedConsts.find(I);
+      if (DCIt != DemotedConsts.end()) {
+        for (unsigned Idx : DCIt->getSecond()) {
+          // Check that all instructions operands are demoted.
           const TreeEntry *CTE = getOperandEntry(TE, Idx);
-          MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
+          if (all_of(TE->Scalars,
+                     [&](Value *V) {
+                       auto SIt = DemotedConsts.find(cast<Instruction>(V));
+                       return SIt != DemotedConsts.end() &&
+                              is_contained(SIt->getSecond(), Idx);
+                     }) ||
+              all_of(CTE->Scalars, Constant::classof))
+            MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
         }
       }
     }
@@ -14658,10 +15029,9 @@ class HorizontalReduction {
   }
 
   /// Creates reduction operation with the current opcode.
-  static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
+  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                          Value *RHS, const Twine &Name, bool UseSelect) {
     unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
-    bool IsConstant = isConstant(LHS) && isConstant(RHS);
     switch (Kind) {
     case RecurKind::Or:
       if (UseSelect &&
@@ -14683,49 +15053,33 @@ class HorizontalReduction {
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
     case RecurKind::FMax:
-      if (IsConstant)
-        return ConstantFP::get(LHS->getType(),
-                               maxnum(cast<ConstantFP>(LHS)->getValueAPF(),
-                                      cast<ConstantFP>(RHS)->getValueAPF()));
       return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
     case RecurKind::FMin:
-      if (IsConstant)
-        return ConstantFP::get(LHS->getType(),
-                               minnum(cast<ConstantFP>(LHS)->getValueAPF(),
-                                      cast<ConstantFP>(RHS)->getValueAPF()));
       return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
     case RecurKind::FMaximum:
-      if (IsConstant)
-        return ConstantFP::get(LHS->getType(),
-                               maximum(cast<ConstantFP>(LHS)->getValueAPF(),
-                                      cast<ConstantFP>(RHS)->getValueAPF()));
       return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
     case RecurKind::FMinimum:
-      if (IsConstant)
-        return ConstantFP::get(LHS->getType(),
-                               minimum(cast<ConstantFP>(LHS)->getValueAPF(),
-                                      cast<ConstantFP>(RHS)->getValueAPF()));
       return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
     case RecurKind::SMax:
-      if (IsConstant || UseSelect) {
+      if (UseSelect) {
         Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
       }
       return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
     case RecurKind::SMin:
-      if (IsConstant || UseSelect) {
+      if (UseSelect) {
         Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
       }
       return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
     case RecurKind::UMax:
-      if (IsConstant || UseSelect) {
+      if (UseSelect) {
         Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
       }
       return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
     case RecurKind::UMin:
-      if (IsConstant || UseSelect) {
+      if (UseSelect) {
         Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
       }
@@ -14737,7 +15091,7 @@ class HorizontalReduction {
 
   /// Creates reduction operation with the current opcode with the IR flags
   /// from \p ReductionOps, dropping nuw/nsw flags.
-  static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
+  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                          Value *RHS, const Twine &Name,
                          const ReductionOpsListType &ReductionOps) {
     bool UseSelect =
@@ -15119,7 +15473,7 @@ class HorizontalReduction {
   }
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
-  Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI,
+  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                      const TargetLibraryInfo &TLI) {
     constexpr int ReductionLimit = 4;
     constexpr unsigned RegMaxNumber = 4;
@@ -15145,7 +15499,9 @@ class HorizontalReduction {
       return nullptr;
     }
 
-    IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
+    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
+                                    TargetFolder(DL));
+    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
 
     // Track the reduced values in case if they are replaced by extractelement
     // because of the vectorization.
@@ -15826,7 +16182,7 @@ class HorizontalReduction {
   }
 
   /// Emit a horizontal reduction of the vectorized value.
-  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                        unsigned ReduxWidth, const TargetTransformInfo *TTI) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(isPowerOf2_32(ReduxWidth) &&
@@ -16241,7 +16597,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     HorizontalReduction HorRdx;
     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
-    return HorRdx.tryToReduce(R, TTI, *TLI);
+    return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b1498026adadf..29a395c357311 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -23,6 +23,9 @@ class TargetLibraryInfo;
 
 /// Helper class to create VPRecipies from IR instructions.
 class VPRecipeBuilder {
+  /// The VPlan new recipes are added to.
+  VPlan &Plan;
+
   /// The loop that we evaluate.
   Loop *OrigLoop;
 
@@ -69,53 +72,50 @@ class VPRecipeBuilder {
   /// recipe that takes an additional VPInstruction for the mask.
   VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
                                                    ArrayRef<VPValue *> Operands,
-                                                   VFRange &Range,
-                                                   VPlanPtr &Plan);
+                                                   VFRange &Range);
 
   /// Check if an induction recipe should be constructed for \p Phi. If so build
   /// and return it. If not, return null.
   VPHeaderPHIRecipe *tryToOptimizeInductionPHI(PHINode *Phi,
                                                ArrayRef<VPValue *> Operands,
-                                               VPlan &Plan, VFRange &Range);
+                                               VFRange &Range);
 
   /// Optimize the special case where the operand of \p I is a constant integer
   /// induction variable.
   VPWidenIntOrFpInductionRecipe *
   tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands,
-                                 VFRange &Range, VPlan &Plan);
+                                 VFRange &Range);
 
   /// Handle non-loop phi nodes. Return a new VPBlendRecipe otherwise. Currently
   /// all such phi nodes are turned into a sequence of select instructions as
   /// the vectorizer currently performs full if-conversion.
-  VPBlendRecipe *tryToBlend(PHINode *Phi, ArrayRef<VPValue *> Operands,
-                            VPlanPtr &Plan);
+  VPBlendRecipe *tryToBlend(PHINode *Phi, ArrayRef<VPValue *> Operands);
 
   /// Handle call instructions. If \p CI can be widened for \p Range.Start,
   /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
   /// decision from \p Range.Start to \p Range.End.
   VPWidenCallRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
-                                    VFRange &Range, VPlanPtr &Plan);
+                                    VFRange &Range);
 
   /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
   /// if it can. The function should only be called if the cost-model indicates
   /// that widening should be performed.
   VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
-                            VPBasicBlock *VPBB, VPlanPtr &Plan);
+                            VPBasicBlock *VPBB);
 
 public:
-  VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+  VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
                   LoopVectorizationLegality *Legal,
                   LoopVectorizationCostModel &CM,
                   PredicatedScalarEvolution &PSE, VPBuilder &Builder)
-      : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE),
-        Builder(Builder) {}
+      : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM),
+        PSE(PSE), Builder(Builder) {}
 
   /// Create and return a widened recipe for \p I if one can be created within
   /// the given VF \p Range.
   VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
-                                       VFRange &Range, VPBasicBlock *VPBB,
-                                       VPlanPtr &Plan);
+                                       VFRange &Range, VPBasicBlock *VPBB);
 
   /// Set the recipe created for given ingredient. This operation is a no-op for
   /// ingredients that were not marked using a nullptr entry in the map.
@@ -128,19 +128,19 @@ class VPRecipeBuilder {
   }
 
   /// Create the mask for the vector loop header block.
-  void createHeaderMask(VPlan &Plan);
+  void createHeaderMask();
 
   /// A helper function that computes the predicate of the block BB, assuming
   /// that the header block of the loop is set to True or the loop mask when
   /// tail folding.
-  void createBlockInMask(BasicBlock *BB, VPlan &Plan);
+  void createBlockInMask(BasicBlock *BB);
 
   /// Returns the *entry* mask for the block \p BB.
   VPValue *getBlockInMask(BasicBlock *BB) const;
 
   /// A helper function that computes the predicate of the edge between SRC
   /// and DST.
-  VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan);
+  VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
 
   /// A helper that returns the previously computed predicate of the edge
   /// between SRC and DST.
@@ -166,8 +166,7 @@ class VPRecipeBuilder {
   /// Build a VPReplicationRecipe for \p I. If it is predicated, add the mask as
   /// last operand. Range.End may be decreased to ensure same recipe behavior
   /// from \p Range.Start to \p Range.End.
-  VPReplicateRecipe *handleReplication(Instruction *I, VFRange &Range,
-                                       VPlan &Plan);
+  VPReplicateRecipe *handleReplication(Instruction *I, VFRange &Range);
 
   /// Add the incoming values from the backedge to reduction & first-order
   /// recurrence cross-iteration phis.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index af6d0081bffeb..d720f17a3a881 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1127,6 +1127,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
     return WrapFlags.HasNSW;
   }
 
+  bool isDisjoint() const {
+    assert(OpType == OperationType::DisjointOp &&
+           "recipe cannot have a disjoing flag");
+    return DisjointFlags.IsDisjoint;
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void printFlags(raw_ostream &O) const;
 #endif
@@ -2136,6 +2142,8 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags {
     assert(isPredicated() && "Trying to get the mask of a unpredicated recipe");
     return getOperand(getNumOperands() - 1);
   }
+
+  unsigned getOpcode() const { return getUnderlyingInstr()->getOpcode(); }
 };
 
 /// A recipe for generating conditional branches on the bits of a mask.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index b90c588b60756..a03a408686ef1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -50,13 +50,82 @@ template <typename Class> struct bind_ty {
   }
 };
 
+/// Match a specified integer value or vector of all elements of that
+/// value.
+struct specific_intval {
+  APInt Val;
+
+  specific_intval(APInt V) : Val(std::move(V)) {}
+
+  bool match(VPValue *VPV) {
+    if (!VPV->isLiveIn())
+      return false;
+    Value *V = VPV->getLiveInIRValue();
+    const auto *CI = dyn_cast<ConstantInt>(V);
+    if (!CI && V->getType()->isVectorTy())
+      if (const auto *C = dyn_cast<Constant>(V))
+        CI = dyn_cast_or_null<ConstantInt>(
+            C->getSplatValue(/*UndefsAllowed=*/false));
+
+    return CI && APInt::isSameValue(CI->getValue(), Val);
+  }
+};
+
+inline specific_intval m_SpecificInt(uint64_t V) {
+  return specific_intval(APInt(64, V));
+}
+
+/// Matching combinators
+template <typename LTy, typename RTy> struct match_combine_or {
+  LTy L;
+  RTy R;
+
+  match_combine_or(const LTy &Left, const RTy &Right) : L(Left), R(Right) {}
+
+  template <typename ITy> bool match(ITy *V) {
+    if (L.match(V))
+      return true;
+    if (R.match(V))
+      return true;
+    return false;
+  }
+};
+
+template <typename LTy, typename RTy>
+inline match_combine_or<LTy, RTy> m_CombineOr(const LTy &L, const RTy &R) {
+  return match_combine_or<LTy, RTy>(L, R);
+}
+
 /// Match a VPValue, capturing it if we match.
 inline bind_ty<VPValue> m_VPValue(VPValue *&V) { return V; }
 
-template <typename Op0_t, unsigned Opcode> struct UnaryVPInstruction_match {
+namespace detail {
+
+/// A helper to match an opcode against multiple recipe types.
+template <unsigned Opcode, typename...> struct MatchRecipeAndOpcode {};
+
+template <unsigned Opcode, typename RecipeTy>
+struct MatchRecipeAndOpcode<Opcode, RecipeTy> {
+  static bool match(const VPRecipeBase *R) {
+    auto *DefR = dyn_cast<RecipeTy>(R);
+    return DefR && DefR->getOpcode() == Opcode;
+  }
+};
+
+template <unsigned Opcode, typename RecipeTy, typename... RecipeTys>
+struct MatchRecipeAndOpcode<Opcode, RecipeTy, RecipeTys...> {
+  static bool match(const VPRecipeBase *R) {
+    return MatchRecipeAndOpcode<Opcode, RecipeTy>::match(R) ||
+           MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R);
+  }
+};
+} // namespace detail
+
+template <typename Op0_t, unsigned Opcode, typename... RecipeTys>
+struct UnaryRecipe_match {
   Op0_t Op0;
 
-  UnaryVPInstruction_match(Op0_t Op0) : Op0(Op0) {}
+  UnaryRecipe_match(Op0_t Op0) : Op0(Op0) {}
 
   bool match(const VPValue *V) {
     auto *DefR = V->getDefiningRecipe();
@@ -64,37 +133,58 @@ template <typename Op0_t, unsigned Opcode> struct UnaryVPInstruction_match {
   }
 
   bool match(const VPRecipeBase *R) {
-    auto *DefR = dyn_cast<VPInstruction>(R);
-    if (!DefR || DefR->getOpcode() != Opcode)
+    if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R))
       return false;
-    assert(DefR->getNumOperands() == 1 &&
+    assert(R->getNumOperands() == 1 &&
            "recipe with matched opcode does not have 1 operands");
-    return Op0.match(DefR->getOperand(0));
+    return Op0.match(R->getOperand(0));
   }
 };
 
-template <typename Op0_t, typename Op1_t, unsigned Opcode>
-struct BinaryVPInstruction_match {
+template <typename Op0_t, unsigned Opcode>
+using UnaryVPInstruction_match =
+    UnaryRecipe_match<Op0_t, Opcode, VPInstruction>;
+
+template <typename Op0_t, unsigned Opcode>
+using AllUnaryRecipe_match =
+    UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
+                      VPWidenCastRecipe, VPInstruction>;
+
+template <typename Op0_t, typename Op1_t, unsigned Opcode,
+          typename... RecipeTys>
+struct BinaryRecipe_match {
   Op0_t Op0;
   Op1_t Op1;
 
-  BinaryVPInstruction_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {}
+  BinaryRecipe_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {}
 
   bool match(const VPValue *V) {
     auto *DefR = V->getDefiningRecipe();
     return DefR && match(DefR);
   }
 
+  bool match(const VPSingleDefRecipe *R) {
+    return match(static_cast<const VPRecipeBase *>(R));
+  }
+
   bool match(const VPRecipeBase *R) {
-    auto *DefR = dyn_cast<VPInstruction>(R);
-    if (!DefR || DefR->getOpcode() != Opcode)
+    if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R))
       return false;
-    assert(DefR->getNumOperands() == 2 &&
+    assert(R->getNumOperands() == 2 &&
            "recipe with matched opcode does not have 2 operands");
-    return Op0.match(DefR->getOperand(0)) && Op1.match(DefR->getOperand(1));
+    return Op0.match(R->getOperand(0)) && Op1.match(R->getOperand(1));
   }
 };
 
+template <typename Op0_t, typename Op1_t, unsigned Opcode>
+using BinaryVPInstruction_match =
+    BinaryRecipe_match<Op0_t, Op1_t, Opcode, VPInstruction>;
+
+template <typename Op0_t, typename Op1_t, unsigned Opcode>
+using AllBinaryRecipe_match =
+    BinaryRecipe_match<Op0_t, Op1_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
+                       VPWidenCastRecipe, VPInstruction>;
+
 template <unsigned Opcode, typename Op0_t>
 inline UnaryVPInstruction_match<Op0_t, Opcode>
 m_VPInstruction(const Op0_t &Op0) {
@@ -130,6 +220,52 @@ inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount>
 m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
   return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
 }
+
+template <unsigned Opcode, typename Op0_t>
+inline AllUnaryRecipe_match<Op0_t, Opcode> m_Unary(const Op0_t &Op0) {
+  return AllUnaryRecipe_match<Op0_t, Opcode>(Op0);
+}
+
+template <typename Op0_t>
+inline AllUnaryRecipe_match<Op0_t, Instruction::Trunc>
+m_Trunc(const Op0_t &Op0) {
+  return m_Unary<Instruction::Trunc, Op0_t>(Op0);
+}
+
+template <typename Op0_t>
+inline AllUnaryRecipe_match<Op0_t, Instruction::ZExt> m_ZExt(const Op0_t &Op0) {
+  return m_Unary<Instruction::ZExt, Op0_t>(Op0);
+}
+
+template <typename Op0_t>
+inline AllUnaryRecipe_match<Op0_t, Instruction::SExt> m_SExt(const Op0_t &Op0) {
+  return m_Unary<Instruction::SExt, Op0_t>(Op0);
+}
+
+template <typename Op0_t>
+inline match_combine_or<AllUnaryRecipe_match<Op0_t, Instruction::ZExt>,
+                        AllUnaryRecipe_match<Op0_t, Instruction::SExt>>
+m_ZExtOrSExt(const Op0_t &Op0) {
+  return m_CombineOr(m_ZExt(Op0), m_SExt(Op0));
+}
+
+template <unsigned Opcode, typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode> m_Binary(const Op0_t &Op0,
+                                                            const Op1_t &Op1) {
+  return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul>
+m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
+m_Or(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
+}
 } // namespace VPlanPatternMatch
 } // namespace llvm
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3b19db9f0d30d..c6ec99fbbf0a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -156,8 +156,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
     if (NeedsDuplicating) {
       if (ScalarVFOnly)
         continue;
-      Instruction *I = cast<Instruction>(
-          cast<VPReplicateRecipe>(SinkCandidate)->getUnderlyingValue());
+      Instruction *I = SinkCandidate->getUnderlyingInstr();
       auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
       // TODO: add ".cloned" suffix to name of Clone's VPValue.
 
@@ -815,27 +814,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
   }
 }
 
-/// Returns true is \p V is constant one.
-static bool isConstantOne(VPValue *V) {
-  if (!V->isLiveIn())
-    return false;
-  auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
-  return C && C->isOne();
-}
-
-/// Returns the llvm::Instruction opcode for \p R.
-static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
-  if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
-    return WidenR->getUnderlyingInstr()->getOpcode();
-  if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
-    return WidenC->getOpcode();
-  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
-    return RepR->getUnderlyingInstr()->getOpcode();
-  if (auto *VPI = dyn_cast<VPInstruction>(&R))
-    return VPI->getOpcode();
-  return 0;
-}
-
 /// Try to simplify recipe \p R.
 static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   // Try to remove redundant blend recipes.
@@ -849,24 +827,9 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
-  switch (getOpcodeForRecipe(R)) {
-  case Instruction::Mul: {
-    VPValue *A = R.getOperand(0);
-    VPValue *B = R.getOperand(1);
-    if (isConstantOne(A))
-      return R.getVPSingleValue()->replaceAllUsesWith(B);
-    if (isConstantOne(B))
-      return R.getVPSingleValue()->replaceAllUsesWith(A);
-    break;
-  }
-  case Instruction::Trunc: {
-    VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
-    if (!Ext)
-      break;
-    unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
-    if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
-      break;
-    VPValue *A = Ext->getOperand(0);
+  using namespace llvm::VPlanPatternMatch;
+  VPValue *A;
+  if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
     VPValue *Trunc = R.getVPSingleValue();
     Type *TruncTy = TypeInfo.inferScalarType(Trunc);
     Type *ATy = TypeInfo.inferScalarType(A);
@@ -875,8 +838,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     } else {
       // Don't replace a scalarizing recipe with a widened cast.
       if (isa<VPReplicateRecipe>(&R))
-        break;
+        return;
       if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+
+        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
+                                 ? Instruction::SExt
+                                 : Instruction::ZExt;
         auto *VPC =
             new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
         VPC->insertBefore(&R);
@@ -902,11 +869,11 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
     }
 #endif
-    break;
-  }
-  default:
-    break;
   }
+
+  if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)),
+                            m_Mul(m_SpecificInt(1), m_VPValue(A)))))
+    return R.getVPSingleValue()->replaceAllUsesWith(A);
 }
 
 /// Try to simplify the recipes in \p Plan.
@@ -1249,6 +1216,23 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
       // load/store. If the underlying instruction has poison-generating flags,
       // drop them directly.
       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
+        VPValue *A, *B;
+        using namespace llvm::VPlanPatternMatch;
+        // Dropping disjoint from an OR may yield incorrect results, as some
+        // analysis may have converted it to an Add implicitly (e.g. SCEV used
+        // for dependence analysis). Instead, replace it with an equivalent Add.
+        // This is possible as all users of the disjoint OR only access lanes
+        // where the operands are disjoint or poison otherwise.
+        if (match(RecWithFlags, m_Or(m_VPValue(A), m_VPValue(B))) &&
+            RecWithFlags->isDisjoint()) {
+          VPBuilder Builder(RecWithFlags);
+          VPInstruction *New = Builder.createOverflowingOp(
+              Instruction::Add, {A, B}, {false, false},
+              RecWithFlags->getDebugLoc());
+          RecWithFlags->replaceAllUsesWith(New);
+          RecWithFlags->eraseFromParent();
+          CurRec = New;
+        }
         RecWithFlags->dropPoisonGeneratingFlags();
       } else {
         Instruction *Instr = dyn_cast_or_null<Instruction>(
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b869f21eb08cf..09cb0c5116c73 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -792,9 +792,12 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
   // intrinsic
   VectorType *VecTy = cast<VectorType>(VPI.getType());
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  SmallVector<int> Mask;
+  if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
+    Mask.resize(FVTy->getNumElements(), 0);
   InstructionCost SplatCost =
       TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask);
 
   // Calculate the cost of the VP Intrinsic
   SmallVector<Type *, 4> Args;
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll
index 9a333dc8b8ddd..a91d562b3f6f1 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll
@@ -520,3 +520,69 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16x i
 
   ret void
 }
+
+define void @fixed_m1_in_m2_notail(<8 x i32> %src, <8 x i32> %passthru) vscale_range(2) {
+; CHECK-LABEL: 'fixed_m1_in_m2_notail'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %1 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 10, i32 11, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'fixed_m1_in_m2_notail'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %1 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 6, i32 7>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 10, i32 11, i32 7>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+  shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7>
+  shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 6, i32 7>
+  shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 10, i32 11, i32 7>
+  shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  ret void
+}
+
+define void @fixed_m2_in_m4_notail(<8 x i64> %src, <8 x i64> %passthru) vscale_range(2) {
+; CHECK-LABEL: 'fixed_m2_in_m4_notail'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %1 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %2 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 8, i32 8, i32 10, i32 11, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %3 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 10, i32 11, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %4 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 8, i32 10, i32 11, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %5 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 8, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'fixed_m2_in_m4_notail'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %1 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %2 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 8, i32 8, i32 10, i32 11, i32 5, i32 6, i32 7>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %3 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 10, i32 11, i32 6, i32 7>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %4 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 8, i32 10, i32 11, i32 7>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %5 = shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 8, i32 10, i32 11>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+  shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 8, i32 8, i32 10, i32 11, i32 5, i32 6, i32 7>
+  shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 10, i32 11, i32 6, i32 7>
+  shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 8, i32 10, i32 11, i32 7>
+  shufflevector <8 x i64> %src, <8 x i64> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 8, i32 10, i32 11>
+  ret void
+}
+
+define void @fixed_m1_in_m2_tail(<8 x i32> %src, <8 x i32> %passthru) vscale_range(2) {
+; CHECK-LABEL: 'fixed_m1_in_m2_tail'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %1 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 8, i32 9, i32 10, i32 4, i32 4, i32 5, i32 6, i32 8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %2 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 8, i32 10, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'fixed_m1_in_m2_tail'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %1 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 8, i32 9, i32 10, i32 4, i32 4, i32 5, i32 6, i32 8>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %2 = shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 8, i32 10, i32 7>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 8, i32 9, i32 10, i32 4, i32 4, i32 5, i32 6, i32 8>
+  shufflevector <8 x i32> %src, <8 x i32> %passthru, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 8, i32 10, i32 7>
+  ret void
+}
diff --git a/llvm/test/Analysis/ValueTracking/known-non-zero.ll b/llvm/test/Analysis/ValueTracking/known-non-zero.ll
index d804fe9664903..30d340dc725a3 100644
--- a/llvm/test/Analysis/ValueTracking/known-non-zero.ll
+++ b/llvm/test/Analysis/ValueTracking/known-non-zero.ll
@@ -1292,4 +1292,15 @@ true:
 false:
   ret i1 %ne
 }
+
+define <2 x i1> @range_metadata_vec(ptr %p, <2 x i32> %x) {
+; CHECK-LABEL: @range_metadata_vec(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %v = load <2 x i32>, ptr %p, !range !{i32 1, i32 100}
+  %or = or <2 x i32> %v, %x
+  %cmp = icmp ne <2 x i32> %or, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
 declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-select-from-cond.ll b/llvm/test/Analysis/ValueTracking/knownbits-select-from-cond.ll
new file mode 100644
index 0000000000000..c3343edfb4c9b
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/knownbits-select-from-cond.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+define i8 @select_condition_implies_highbits_op1(i8 %xx, i8 noundef %y) {
+; CHECK-LABEL: @select_condition_implies_highbits_op1(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i8 [[Y:%.*]], 3
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], i8 [[Y]], i8 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = or disjoint i8 [[SEL]], 32
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and i8 %xx, 15
+  %cond = icmp ult i8 %y, 3
+  %sel = select i1 %cond, i8 %y, i8 %x
+  %r = add i8 %sel, 32
+  ret i8 %r
+}
+
+define i8 @select_condition_implies_highbits_op1_maybe_undef_fail(i8 %xx, i8 %y) {
+; CHECK-LABEL: @select_condition_implies_highbits_op1_maybe_undef_fail(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i8 [[Y:%.*]], 3
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], i8 [[Y]], i8 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = add i8 [[SEL]], 32
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and i8 %xx, 15
+  %cond = icmp ult i8 %y, 3
+  %sel = select i1 %cond, i8 %y, i8 %x
+  %r = add i8 %sel, 32
+  ret i8 %r
+}
+
+define i8 @select_condition_implies_highbits_op2(i8 %xx, i8 noundef %y) {
+; CHECK-LABEL: @select_condition_implies_highbits_op2(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i8 [[Y:%.*]], 3
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], i8 [[X]], i8 [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = or disjoint i8 [[SEL]], 32
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and i8 %xx, 15
+  %cond = icmp ugt i8 %y, 3
+  %sel = select i1 %cond, i8 %x, i8 %y
+  %r = add i8 %sel, 32
+  ret i8 %r
+}
+
+define i8 @select_condition_implies_highbits_op1_and(i8 %xx, i8 noundef %y, i1 %other_cond) {
+; CHECK-LABEL: @select_condition_implies_highbits_op1_and(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT:    [[COND0:%.*]] = icmp ult i8 [[Y:%.*]], 3
+; CHECK-NEXT:    [[COND:%.*]] = and i1 [[COND0]], [[OTHER_COND:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], i8 [[Y]], i8 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = or disjoint i8 [[SEL]], 32
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and i8 %xx, 15
+  %cond0 = icmp ult i8 %y, 3
+  %cond = and i1 %cond0, %other_cond
+  %sel = select i1 %cond, i8 %y, i8 %x
+  %r = add i8 %sel, 32
+  ret i8 %r
+}
+
+define i8 @select_condition_implies_highbits_op2_or(i8 %xx, i8 noundef %y, i1 %other_cond) {
+; CHECK-LABEL: @select_condition_implies_highbits_op2_or(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[XX:%.*]], 15
+; CHECK-NEXT:    [[COND0:%.*]] = icmp ugt i8 [[Y:%.*]], 3
+; CHECK-NEXT:    [[COND:%.*]] = or i1 [[COND0]], [[OTHER_COND:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], i8 [[X]], i8 [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = or disjoint i8 [[SEL]], 32
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and i8 %xx, 15
+  %cond0 = icmp ugt i8 %y, 3
+  %cond = or i1 %cond0, %other_cond
+  %sel = select i1 %cond, i8 %x, i8 %y
+  %r = add i8 %sel, 32
+  ret i8 %r
+}
diff --git a/llvm/test/Assembler/debug-info.ll b/llvm/test/Assembler/debug-info.ll
index 419623a2cb7d1..06144b261373f 100644
--- a/llvm/test/Assembler/debug-info.ll
+++ b/llvm/test/Assembler/debug-info.ll
@@ -1,8 +1,8 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
 ; RUN: verify-uselistorder %s
 
-; CHECK: !named = !{!0, !0, !1, !2, !3, !4, !5, !6, !7, !8, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39}
-!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42}
+; CHECK: !named = !{!0, !0, !1, !2, !3, !4, !5, !6, !7, !8, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46}
 
 ; CHECK:      !0 = !DISubrange(count: 3, lowerBound: 0)
 ; CHECK-NEXT: !1 = !DISubrange(count: 3, lowerBound: 4)
@@ -99,3 +99,15 @@
 ; CHECK-NEXT: !39 = !DIBasicType(name: "u64.le", size: 64, align: 1, encoding: DW_ATE_unsigned, flags: DIFlagLittleEndian)
 !41 = !DIBasicType(name: "u64.be", size: 64, align: 1, encoding: DW_ATE_unsigned, flags: DIFlagBigEndian)
 !42 = !DIBasicType(name: "u64.le", size: 64, align: 1, encoding: DW_ATE_unsigned, flags: DIFlagLittleEndian)
+
+; CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !13, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: false, ptrAuthAuthenticatesNullValues: false)
+!43 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !15, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234)
+
+; CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !13, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: true, ptrAuthAuthenticatesNullValues: false)
+!44 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !15, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: true)
+
+; CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !13, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: false, ptrAuthAuthenticatesNullValues: true)
+!45 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !15, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthAuthenticatesNullValues: true)
+
+; CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !13, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: true, ptrAuthAuthenticatesNullValues: true)
+!46 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !15, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: true, ptrAuthAuthenticatesNullValues: true)
diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll
index be0207599478b..953a16b7e624e 100644
--- a/llvm/test/Bindings/llvm-c/echo.ll
+++ b/llvm/test/Bindings/llvm-c/echo.ll
@@ -334,6 +334,20 @@ define void @test_fast_math_flags_call_outer(float %a) {
   ret void
 }
 
+define void @test_func_prefix_data_01() prefix i32 123 {
+  ret void
+}
+
+define void @test_func_prefix_data_02() prefix i64 2000 {
+  ret void
+}
+
+%func_prolog_struct = type <{ i8, i8, ptr }>
+
+define void @test_func_prologue_data_01() prologue %func_prolog_struct <{ i8 235, i8 8, ptr zeroinitializer}> {
+  ret void
+}
+
 !llvm.dbg.cu = !{!0, !2}
 !llvm.module.flags = !{!3}
 
diff --git a/llvm/test/Bitcode/DIExpression-aggresult.ll b/llvm/test/Bitcode/DIExpression-aggresult.ll
index 0b89454aa2f94..017218277d02b 100644
--- a/llvm/test/Bitcode/DIExpression-aggresult.ll
+++ b/llvm/test/Bitcode/DIExpression-aggresult.ll
@@ -1,4 +1,5 @@
 ; RUN: llvm-dis -o - %s.bc | FileCheck %s
+; RUN: llvm-dis -o - %s.bc --load-bitcode-into-experimental-debuginfo-iterators=true | FileCheck %s
 %class.A = type { i32, i32, i32, i32 }
 
 define void @_Z3fooi(%class.A* sret(%class.A) %agg.result) #0 !dbg !3 {
diff --git a/llvm/test/Bitcode/dbg-record-roundtrip.ll b/llvm/test/Bitcode/dbg-record-roundtrip.ll
new file mode 100644
index 0000000000000..bd347cac72067
--- /dev/null
+++ b/llvm/test/Bitcode/dbg-record-roundtrip.ll
@@ -0,0 +1,172 @@
+;; Roundtrip tests.
+
+;; Load RemoveDIs mode in llvm-dis but write out debug intrinsics.
+; RUN: llvm-as --write-experimental-debuginfo-iterators-to-bitcode=true %s -o - \
+; RUN: | llvm-dis --load-bitcode-into-experimental-debuginfo-iterators=true --write-experimental-debuginfo=false \
+; RUN: | FileCheck %s
+
+;; Load and write RemoveDIs mode in llvm-dis.
+; RUN: llvm-as --write-experimental-debuginfo-iterators-to-bitcode=true %s -o - \
+; RUN: | llvm-dis --load-bitcode-into-experimental-debuginfo-iterators=true --write-experimental-debuginfo=true \
+; RUN: | FileCheck %s --check-prefixes=RECORDS
+
+;; Load intrinsics directly into the new format (auto-upgrade).
+; RUN: llvm-as --write-experimental-debuginfo-iterators-to-bitcode=false %s -o - \
+; RUN: | llvm-dis --load-bitcode-into-experimental-debuginfo-iterators=true --write-experimental-debuginfo=true \
+; RUN: | FileCheck %s --check-prefixes=RECORDS
+
+;; Check that verify-uselistorder passes regardless of input format.
+; RUN: llvm-as %s --write-experimental-debuginfo-iterators-to-bitcode=true -o - | verify-uselistorder
+; RUN: verify-uselistorder %s
+
+;; Confirm we're producing RemoveDI records from various tools.
+; RUN: opt %s -o - --write-experimental-debuginfo-iterators-to-bitcode=true | llvm-bcanalyzer - | FileCheck %s --check-prefix=BITCODE
+; RUN: llvm-as %s -o - --write-experimental-debuginfo-iterators-to-bitcode=true | llvm-bcanalyzer - | FileCheck %s --check-prefix=BITCODE
+; BITCODE-DAG: DEBUG_RECORD_LABEL
+; BITCODE-DAG: DEBUG_RECORD_VALUE
+; BITCODE-DAG: DEBUG_RECORD_ASSIGN
+; BITCODE-DAG: DEBUG_RECORD_DECLARE
+
+;; Check that llvm-link doesn't explode if we give it different formats to
+;; link.
+;; NOTE: This test fails intermittently on linux if the llvm-as output is piped
+;; into llvm-link in the RUN lines below, unless the verify-uselistorder RUN
+;; lines above are removed. Write to a temporary file to avoid that weirdness.
+;; NOTE2: Unfortunately, the above only stopped it occuring on my machine.
+;; It failed again intermittently here:
+;; https://lab.llvm.org/buildbot/#/builders/245/builds/21930
+;; Allow this test to fail-over twice, until this strangeness is understood.
+; ALLOW_RETRIES: 2
+; RUN: llvm-as %s --experimental-debuginfo-iterators=true --write-experimental-debuginfo-iterators-to-bitcode=true -o %t
+; RUN: llvm-link %t %s --experimental-debuginfo-iterators=false -o /dev/null
+; RUN: llvm-as %s --experimental-debuginfo-iterators=false -o %t
+; RUN: llvm-link %t %s --experimental-debuginfo-iterators=true
+
+;; Checks inline.
+
+@g = internal dso_local global i32 0, align 4, !dbg !0
+
+define internal dso_local noundef i32 @_Z3funv(i32 %p, ptr %storage) !dbg !13 {
+entry:
+;; Dbg record at top of block, check dbg.value configurations.
+; CHECK: entry:
+; CHECK-NEXT: dbg.value(metadata i32 %p, metadata ![[e:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg:[0-9]+]]
+; CHECK-NEXT: dbg.value(metadata ![[empty:[0-9]+]], metadata ![[e]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.value(metadata i32 poison, metadata ![[e]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.value(metadata i32 1, metadata ![[f:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: entry:
+; RECORDS-NEXT: dbg_value(i32 %p, ![[e:[0-9]+]], !DIExpression(), ![[dbg:[0-9]+]])
+; RECORDS-NEXT: dbg_value(![[empty:[0-9]+]], ![[e]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_value(i32 poison, ![[e]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_value(i32 1, ![[f:[0-9]+]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.value(metadata i32 %p, metadata !32, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.value(metadata !29, metadata !32, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 poison, metadata !32, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 1, metadata !33, metadata !DIExpression()), !dbg !19
+;; Arglist with an argument, constant, local use before def, poison.
+; CHECK-NEXT: dbg.value(metadata !DIArgList(i32 %p, i32 0, i32 %0, i32 poison), metadata ![[f]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_LLVM_arg, 3, DW_OP_plus, DW_OP_minus)), !dbg ![[dbg]]
+; RECORDS-NEXT: dbg_value(!DIArgList(i32 %p, i32 0, i32 %0, i32 poison), ![[f]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_LLVM_arg, 3, DW_OP_plus, DW_OP_minus), ![[dbg]])
+  tail call void @llvm.dbg.value(metadata !DIArgList(i32 %p, i32 0, i32 %0, i32 poison), metadata !33, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_LLVM_arg, 3, DW_OP_plus, DW_OP_minus)), !dbg !19
+;; Check dbg.assign use before def (value, addr and ID). Check expression order too.
+; CHECK: dbg.assign(metadata i32 %0, metadata ![[i:[0-9]+]], metadata !DIExpression(DW_OP_plus_uconst, 0),
+; CHECK-SAME:       metadata ![[ID:[0-9]+]], metadata ptr %a, metadata !DIExpression(DW_OP_plus_uconst, 1)), !dbg ![[dbg]]
+; RECORDS: dbg_assign(i32 %0, ![[i:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 0),
+; RECORDS-SAME:       ![[ID:[0-9]+]], ptr %a, !DIExpression(DW_OP_plus_uconst, 1), ![[dbg]])
+  tail call void @llvm.dbg.assign(metadata i32 %0, metadata !36, metadata !DIExpression(DW_OP_plus_uconst, 0), metadata !37, metadata ptr %a, metadata !DIExpression(DW_OP_plus_uconst, 1)), !dbg !19
+  %a = alloca i32, align 4, !DIAssignID !37
+; CHECK: %a = alloca i32, align 4, !DIAssignID ![[ID]]
+;; Check dbg.declare configurations.
+; CHECK-NEXT: dbg.declare(metadata ptr %a, metadata ![[a:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.declare(metadata ![[empty:[0-9]+]], metadata ![[b:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.declare(metadata ptr poison, metadata ![[c:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.declare(metadata ptr null, metadata ![[d:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.declare(metadata ptr @g, metadata ![[h:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: %a = alloca i32, align 4, !DIAssignID ![[ID]]
+;; Check dbg.declare configurations.
+; RECORDS-NEXT: dbg_declare(ptr %a, ![[a:[0-9]+]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_declare(![[empty:[0-9]+]], ![[b:[0-9]+]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_declare(ptr poison, ![[c:[0-9]+]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_declare(ptr null, ![[d:[0-9]+]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_declare(ptr @g, ![[h:[0-9]+]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.declare(metadata ptr %a, metadata !17, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.declare(metadata !29, metadata !28, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.declare(metadata ptr poison, metadata !30, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.declare(metadata ptr null, metadata !31, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.declare(metadata ptr @g, metadata !35, metadata !DIExpression()), !dbg !19
+;; Argument value dbg.declare.
+; CHECK: dbg.declare(metadata ptr %storage, metadata ![[g:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: dbg_declare(ptr %storage, ![[g:[0-9]+]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.declare(metadata ptr %storage, metadata !34, metadata !DIExpression()), !dbg !19
+;; Use before def dbg.value.
+; CHECK: dbg.value(metadata i32 %0, metadata ![[e]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: dbg_value(i32 %0, ![[e]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.value(metadata i32 %0, metadata !32, metadata !DIExpression()), !dbg !19
+  %0 = load i32, ptr @g, align 4, !dbg !20
+;; Non-argument local value dbg.value.
+; CHECK: dbg.value(metadata i32 %0, metadata ![[e]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: dbg_value(i32 %0, ![[e]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.value(metadata i32 %0, metadata !32, metadata !DIExpression()), !dbg !19
+  store i32 %0, ptr %a, align 4, !dbg !19
+  %1 = load i32, ptr %a, align 4, !dbg !25
+; CHECK: dbg.label(metadata ![[label:[0-9]+]]), !dbg ![[dbg]]
+; RECORDS: dbg_label(![[label:[0-9]+]], ![[dbg]])
+  tail call void @llvm.dbg.label(metadata !38), !dbg !19
+  ret i32 %1, !dbg !27
+}
+
+; CHECK-DAG: ![[a]] = !DILocalVariable(name: "a",
+; CHECK-DAG: ![[b]] = !DILocalVariable(name: "b",
+; CHECK-DAG: ![[c]] = !DILocalVariable(name: "c",
+; CHECK-DAG: ![[d]] = !DILocalVariable(name: "d",
+; CHECK-DAG: ![[e]] = !DILocalVariable(name: "e",
+; CHECK-DAG: ![[f]] = !DILocalVariable(name: "f",
+; CHECK-DAG: ![[g]] = !DILocalVariable(name: "g",
+; CHECK-DAG: ![[h]] = !DILocalVariable(name: "h",
+; CHECK-DAG: ![[i]] = !DILocalVariable(name: "i",
+; CHECK-DAG: ![[empty]] = !{}
+; CHECK-DAG: ![[label]] = !DILabel
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+declare void @llvm.dbg.label(metadata)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!6, !7, !8, !9, !10, !11}
+!llvm.ident = !{!12}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "g", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "test.cpp", directory: "/")
+!4 = !{!0}
+!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!6 = !{i32 7, !"Dwarf Version", i32 5}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = !{i32 8, !"PIC Level", i32 2}
+!10 = !{i32 7, !"PIE Level", i32 2}
+!11 = !{i32 7, !"uwtable", i32 2}
+!12 = !{!"clang version 19.0.0"}
+!13 = distinct !DISubprogram(name: "fun", linkageName: "_Z3funv", scope: !3, file: !3, line: 2, type: !14, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !16)
+!14 = !DISubroutineType(types: !15)
+!15 = !{!5}
+!16 = !{!17}
+!17 = !DILocalVariable(name: "a", scope: !13, file: !3, line: 3, type: !5)
+!18 = !DILocation(line: 3, column: 3, scope: !13)
+!19 = !DILocation(line: 3, column: 7, scope: !13)
+!20 = !DILocation(line: 3, column: 11, scope: !13)
+!25 = !DILocation(line: 4, column: 12, scope: !13)
+!26 = !DILocation(line: 5, column: 1, scope: !13)
+!27 = !DILocation(line: 4, column: 5, scope: !13)
+!28 = !DILocalVariable(name: "b", scope: !13, file: !3, line: 3, type: !5)
+!29 = !{}
+!30 = !DILocalVariable(name: "c", scope: !13, file: !3, line: 3, type: !5)
+!31 = !DILocalVariable(name: "d", scope: !13, file: !3, line: 3, type: !5)
+!32 = !DILocalVariable(name: "e", scope: !13, file: !3, line: 3, type: !5)
+!33 = !DILocalVariable(name: "f", scope: !13, file: !3, line: 3, type: !5)
+!34 = !DILocalVariable(name: "g", scope: !13, file: !3, line: 3, type: !5)
+!35 = !DILocalVariable(name: "h", scope: !13, file: !3, line: 3, type: !5)
+!36 = !DILocalVariable(name: "i", scope: !13, file: !3, line: 3, type: !5)
+!37 = distinct !DIAssignID()
+!38 = !DILabel(scope: !13, name: "label", file: !3, line: 1)
diff --git a/llvm/test/Bitcode/upgrade-dbg-addr.ll b/llvm/test/Bitcode/upgrade-dbg-addr.ll
index 40fd7db18948b..06a411c2c8348 100644
--- a/llvm/test/Bitcode/upgrade-dbg-addr.ll
+++ b/llvm/test/Bitcode/upgrade-dbg-addr.ll
@@ -1,6 +1,7 @@
 ; Test upgrade of dbg.addr intrinsics into dbg.value with DW_OP_deref appended
 ;
 ; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: llvm-dis < %s.bc --load-bitcode-into-experimental-debuginfo-iterators --write-experimental-debuginfo=false | FileCheck %s
 ; RUN: verify-uselistorder < %s.bc
 
 define i32 @example(i32 %num) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
new file mode 100644
index 0000000000000..6fced31a622d9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+
+---
+name:            add_unused
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: add_unused
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: %add:_(s32) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %add:_(s32), %o:_(s1) = G_SADDO %0, %1
+    $w0 = COPY %add(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+name:            add_canon
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: add_canon
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 10
+    ; CHECK-NEXT: %add:_(s32), %o:_(s1) = G_SADDO [[COPY]], %const
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %const:_(s32) = G_CONSTANT i32 10
+    %add:_(s32), %o:_(s1) = G_SADDO %const, %1
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            add_const_fold
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: add_const_fold
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %add:_(s32) = G_CONSTANT i32 21
+    ; CHECK-NEXT: %o_wide:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %const:_(s32) = G_CONSTANT i32 10
+    %const1:_(s32) = G_CONSTANT i32 11
+    %add:_(s32), %o:_(s1) = G_UADDO %const, %const1
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            add_add_zero
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: add_add_zero
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+    ; CHECK-NEXT: $w1 = COPY [[C]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = COPY $w2
+    %const:_(s32) = G_CONSTANT i32 10
+    %addl:_(s32) = nsw G_ADD  %2, %const
+    %const1:_(s32) = G_CONSTANT i32 -10
+    %add:_(s32), %o:_(s1) = G_SADDO %addl, %const1
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
new file mode 100644
index 0000000000000..10882a06af1b4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -O0 -mtriple=aarch64-- --global-isel --global-isel-abort=2 --verify-machineinstrs --stop-after=irtranslator %s -o - | FileCheck %s
+
+define void @vector_deinterleave2_v4i32(<4 x i32> %a) {
+  ; CHECK-LABEL: name: vector_deinterleave2_v4i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $q0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+  ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(0, 2)
+  ; CHECK-NEXT:   [[SHUF1:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(1, 3)
+  ; CHECK-NEXT:   RET_ReallyLR
+    %res = call {<2 x i32>, <2 x i32>} @llvm.experimental.vector.deinterleave2.v4i32(<4 x i32> %a)
+    ret void
+}
+
+define void @vector_deinterleave2_v8f32(<8 x float> %a) {
+  ; CHECK-LABEL: name: vector_deinterleave2_v8f32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $q0, $q1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[BITCAST]](<4 x s32>), [[BITCAST1]](<4 x s32>)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF
+  ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(0, 2, 4, 6)
+  ; CHECK-NEXT:   [[SHUF1:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(1, 3, 5, 7)
+  ; CHECK-NEXT:   RET_ReallyLR
+    %res = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %a)
+    ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
new file mode 100644
index 0000000000000..f51e47a428d10
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -O0 -mtriple=aarch64-- --global-isel --global-isel-abort=2 --verify-machineinstrs --stop-after=irtranslator %s -o - | FileCheck %s
+
+define void @vector_interleave2_v4i32(<2 x i32> %a, <2 x i32> %b) {
+  ; CHECK-LABEL: name: vector_interleave2_v4i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $d0, $d1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1
+  ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[COPY1]], shufflemask(0, 2, 1, 3)
+  ; CHECK-NEXT:   RET_ReallyLR
+    %res = call <4 x i32> @llvm.experimental.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
+    ret void
+}
+
+define void @vector_interleave2_v8f32(<4 x float> %a, <4 x float> %b) {
+  ; CHECK-LABEL: name: vector_interleave2_v8f32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $q0, $q1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
+  ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<8 x s32>) = G_SHUFFLE_VECTOR [[BITCAST]](<4 x s32>), [[BITCAST1]], shufflemask(0, 4, 1, 5, 2, 6, 3, 7)
+  ; CHECK-NEXT:   RET_ReallyLR
+    %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+    ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 5cbb8649d158b..b8328eda9a661 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -607,9 +607,11 @@ body:             |
     ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s16)
     ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s16)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32)
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>)
-    ; CHECK-NEXT: $s0 = COPY [[TRUNC]](<2 x s16>)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[DEF]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[TRUNC]](<4 x s16>)
+    ; CHECK-NEXT: $s0 = COPY [[UV]](<2 x s16>)
     ; CHECK-NEXT: RET_ReallyLR
     %0:_(p0) = COPY $x0
     %1(<2 x s16>) = G_LOAD %0(p0) :: (load (<2 x s16>))
@@ -711,33 +713,24 @@ body:             |
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD %ptr(p0) :: (load (p0), align 64)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD %ptr(p0) :: (load (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD]](<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
-    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD]](p0) :: (load (p0) from unknown-address + 8)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD1]](<2 x s64>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD1]](p0) :: (load (p0) from unknown-address + 16, align 16)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
-    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C2]](s64)
-    ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD2]](p0) :: (load (p0) from unknown-address + 24)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD3]](p0) :: (load (p0) from unknown-address + 32, align 32)
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
-    ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C4]](s64)
-    ; CHECK-NEXT: [[LOAD5:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD4]](p0) :: (load (p0) from unknown-address + 40)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD]](p0), [[LOAD1]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD2]](p0), [[LOAD3]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD4]](p0), [[LOAD5]](p0)
-    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
-    ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR1]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s64>), [[PTR_ADD5]](p0) :: (store (<2 x s64>) into unknown-address + 16)
-    ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR2]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST2]](<2 x s64>), [[PTR_ADD6]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<2 x s64>) from unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD2]](<2 x s64>)
+    ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST3]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
+    ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST1]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST4]](<2 x s64>), [[PTR_ADD2]](p0) :: (store (<2 x s64>) into unknown-address + 16)
+    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
+    ; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST2]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST5]](<2 x s64>), [[PTR_ADD3]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
     ; CHECK-NEXT: RET_ReallyLR
     %ptr:_(p0) = COPY $x0
     %val:_(<6 x p0>) = G_LOAD %ptr(p0) :: (load (<6 x p0>))
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
index 661265173ae82..ed40a2ff7ea70 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
@@ -540,9 +540,17 @@ body:             |
     ; CHECK: liveins: $d0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[COPY]](<2 x s32>)
-    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s8>), [[TRUNC]](<2 x s8>)
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[CONCAT_VECTORS]](<4 x s8>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[DEF]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR1]](<4 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s8>), [[UV3:%[0-9]+]]:_(<2 x s8>), [[UV4:%[0-9]+]]:_(<2 x s8>), [[UV5:%[0-9]+]]:_(<2 x s8>) = G_UNMERGE_VALUES [[TRUNC2]](<8 x s8>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s8>) = G_CONCAT_VECTORS [[UV2]](<2 x s8>), [[UV2]](<2 x s8>)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[CONCAT_VECTORS1]](<4 x s8>)
     ; CHECK-NEXT: $d0 = COPY [[ANYEXT]](<4 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $d0
     %0:_(<2 x s32>) = COPY $d0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
index 94f56e5650b22..9483cbf06f405 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="addo_by_0" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="match_addos" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
 # REQUIRES: asserts
 
 # (G_*ADDO x, 0) -> x + no carry
diff --git a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
index 859be2d337476..b940734c6988c 100644
--- a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
+++ b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
@@ -19,6 +19,8 @@
 ...
 ---
 name: foo
+frameInfo:
+  adjustsStack:    true
 body: |
   bb.0 (%ir-block.0):
     ; CHECK-LABEL: name: foo
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index bccfdb93d786f..b7c02d6855a9e 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -4,12 +4,10 @@
 
 ; PR23065: SCALAR_TO_VECTOR implies the top elements 1 to N-1 of the N-element vector are undefined.
 
-; CHECK-GI:         warning: Instruction selection used fallback path for bitcast_v4i8_i32
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_i32_v4i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_v2i16_i32
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_i32_v2i16
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_v2i16_v4i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_v4i8_v2i16
+; CHECK-GI:       warning: Instruction selection used fallback path for bitcast_i32_v4i8
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for bitcast_i32_v2i16
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for bitcast_v2i16_v4i8
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for bitcast_v4i8_v2i16
 
 define <4 x i16> @foo1(<2 x i32> %a) {
 ; CHECK-SD-LABEL: foo1:
@@ -54,15 +52,28 @@ define <4 x i16> @foo2(<2 x i32> %a) {
 ; ===== To and From Scalar Types =====
 
 define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){
-; CHECK-LABEL: bitcast_v4i8_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bitcast_v4i8_i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bitcast_v4i8_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-NEXT:    mov h3, v0.h[3]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %c = add <4 x i8> %a, %b
   %d = bitcast <4 x i8> %c to i32
   ret i32 %d
@@ -81,18 +92,27 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){
 }
 
 define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){
-; CHECK-LABEL: bitcast_v2i16_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    ldr w0, [sp, #12]
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bitcast_v2i16_i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [sp, #12]
+; CHECK-SD-NEXT:    strh w8, [sp, #14]
+; CHECK-SD-NEXT:    ldr w0, [sp, #12]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bitcast_v2i16_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %c = add <2 x i16> %a, %b
   %d = bitcast <2 x i16> %c to i32
   ret i32 %d
@@ -388,7 +408,7 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    add x8, sp, #12
-; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    str s0, [sp, #12]
 ; CHECK-NEXT:    ld1 { v0.h }[0], [x8]
 ; CHECK-NEXT:    orr x8, x8, #0x2
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
index 7b8448de2331b..7cdb10e7159f0 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
@@ -1,23 +1,42 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
-; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve -o - | FileCheck %s
-; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve2 -o - | FileCheck %s
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve2 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s --global-isel --global-isel-abort=2 --mattr=+complxnum,+neon,+fullfp16 -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s --global-isel --global-isel-abort=2 --mattr=+complxnum,+neon,+fullfp16,+sve -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s --global-isel --global-isel-abort=2 --mattr=+complxnum,+neon,+fullfp16,+sve2 -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 target triple = "aarch64"
 
+; CHECK-GI:      warning: Instruction selection used fallback path for complex_add_v16f16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for complex_add_v32f16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for complex_add_v16f16_with_intrinsic
+
 ; Expected to not transform
 define <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) {
-; CHECK-LABEL: complex_add_v2f16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h2, v0.h[1]
-; CHECK-NEXT:    mov h3, v1.h[1]
-; CHECK-NEXT:    fsub h1, h1, h2
-; CHECK-NEXT:    fadd h0, h3, h0
-; CHECK-NEXT:    mov v1.h[1], v0.h[0]
-; CHECK-NEXT:    fmov d0, d1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: complex_add_v2f16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov h2, v0.h[1]
+; CHECK-SD-NEXT:    mov h3, v1.h[1]
+; CHECK-SD-NEXT:    fsub h1, h1, h2
+; CHECK-SD-NEXT:    fadd h0, h3, h0
+; CHECK-SD-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-SD-NEXT:    fmov d0, d1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: complex_add_v2f16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NEXT:    fsub h1, h1, h2
+; CHECK-GI-NEXT:    fadd h0, h3, h0
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT:    fmov d0, d1
+; CHECK-GI-NEXT:    ret
 entry:
   %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> <i32 0>
   %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> <i32 1>
@@ -162,17 +181,29 @@ entry:
 
 ; Expected not to transform as it is integer
 define <16 x i16> @complex_add_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: complex_add_v16i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v4.8h, v2.8h, v3.8h
-; CHECK-NEXT:    uzp1 v5.8h, v0.8h, v1.8h
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    uzp2 v1.8h, v2.8h, v3.8h
-; CHECK-NEXT:    sub v2.8h, v4.8h, v0.8h
-; CHECK-NEXT:    add v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    zip1 v0.8h, v2.8h, v1.8h
-; CHECK-NEXT:    zip2 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: complex_add_v16i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uzp1 v4.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    uzp1 v5.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    sub v2.8h, v4.8h, v0.8h
+; CHECK-SD-NEXT:    add v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT:    zip1 v0.8h, v2.8h, v1.8h
+; CHECK-SD-NEXT:    zip2 v1.8h, v2.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: complex_add_v16i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v4.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-NEXT:    uzp2 v2.8h, v2.8h, v3.8h
+; CHECK-GI-NEXT:    sub v1.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    add v2.8h, v2.8h, v4.8h
+; CHECK-GI-NEXT:    zip1 v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    zip2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %a.real = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %a.imag = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
new file mode 100644
index 0000000000000..c5c525a15ad9b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -0,0 +1,1126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:       warning: Instruction selection used fallback path for extract_v4i32_vector_insert
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for extract_v4i32_vector_insert_const
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for extract_v4i32_vector_extract
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for extract_v4i32_vector_extract_const
+
+define i64 @extract_v2i64_undef_index(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_undef_index:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_undef_index:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str q0, [sp, #-16]!
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    ldr x0, [sp], #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> %a, i32 undef
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_undef_vector(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_undef_vector:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_undef_vector:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> undef, i32 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_opaque(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_opaque:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_opaque:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> %a, i32 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_oob(<2 x i64> %a, i32 %c) {
+; CHECK-LABEL: extract_v2i64_oob:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> %a, i32 5
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_freeze(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_freeze:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_freeze:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %fvector = freeze <2 x i64> %a
+  %d = extractelement <2 x i64> %fvector, i32 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_extract_of_insert(<2 x i64> %a, i64 %element, i64 %c) {
+; CHECK-LABEL: extract_v2i64_extract_of_insert:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %vector = insertelement <2 x i64> %a, i64 %element, i64 %c
+  %d = extractelement <2 x i64> %vector, i64 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_extract_of_insert_different_const(<2 x i64> %a, i64 %element) {
+; CHECK-SD-LABEL: extract_v2i64_extract_of_insert_different_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x0, v0.d[1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_extract_of_insert_different_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = insertelement <2 x i64> %a, i64 %element, i64 0
+  %d = extractelement <2 x i64> %vector, i64 1
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_extract_build_vector_const(<2 x i64> %a, i32 %c) {
+; CHECK-LABEL: extract_v2i64_extract_build_vector_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, #11 // =0xb
+; CHECK-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> <i64 42, i64 11>, i32 1
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_extract_build_vector_opaque(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_extract_build_vector_opaque:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI8_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_extract_build_vector_opaque:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI8_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x1
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr x0, [x9, x8, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> <i64 42, i64 11>, i32 %c
+  ret i64 %d
+}
+
+
+define i64 @extract_v2i32_zext(<2 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i32_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %zvector = zext <2 x i32> %a to <2 x i64>
+  %d = extractelement <2 x i64> %zvector, i32 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2double_fptosi(<2 x double> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2double_fptosi:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2double_fptosi:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = fptosi <2 x double> %a to <2 x i64>
+  %d = extractelement <2 x i64> %vector, i32 %c
+  ret i64 %d
+}
+
+define double @extract_v2double_fneg(<2 x double> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2double_fneg:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    fneg v0.2d, v0.2d
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr d0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2double_fneg:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    fneg v0.2d, v0.2d
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr d0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = fneg <2 x double> %a
+  %d = extractelement <2 x double> %vector, i32 %c
+  ret double %d
+}
+
+define i32 @extract_v4i32_add(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_add:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI12_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI12_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_add:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI12_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = add <4 x i32> %a, <i32 42, i32 11, i32 17, i32 6>
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define float @extract_v4i32_minimum(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_minimum:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr s0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_minimum:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr s0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.minimum.v4float(<4 x float> %a, <4 x float> %b)
+  %d = extractelement <4 x float> %vector, i32 %c
+  ret float %d
+}
+
+define float @extract_v4i32_minimum_build_vector(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_minimum_build_vector:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI14_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr s0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_minimum_build_vector:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr s0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.minimum.v4float(<4 x float> %a, <4 x float> <float 42.0, float 11.0, float 17.0, float 6.0>)
+  %d = extractelement <4 x float> %vector, i32 %c
+  ret float %d
+}
+
+define float @extract_v4i32_minimum_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_minimum_build_vector_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI15_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.minimum.v4float(<4 x float> %a, <4 x float> <float 42.0, float 11.0, float 17.0, float 6.0>)
+  %d = extractelement <4 x float> %vector, i32 1
+  ret float %d
+}
+
+define float @extract_v4i32_copysign_build_vector(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
+; CHECK-SD-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr s0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_copysign_build_vector:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr s0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.copysign.v4float(<4 x float> %a, <4 x float> <float 42.0, float 11.0, float 17.0, float 6.0>)
+  %d = extractelement <4 x float> %vector, i32 %c
+  ret float %d
+}
+
+define float @extract_v4i32_copysign_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    adrp x8, .LCPI17_0
+; CHECK-SD-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI17_0]
+; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    mov s0, v0.s[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_copysign_build_vector_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.copysign.v4float(<4 x float> %a, <4 x float> <float 42.0, float 11.0, float 17.0, float 6.0>)
+  %d = extractelement <4 x float> %vector, i32 2
+  ret float %d
+}
+
+
+define i32 @extract_v4i32_icmp(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_icmp:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI18_0
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_icmp:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI18_0
+; CHECK-GI-NEXT:    movi v2.4s, #1
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = icmp sle <4 x i32> %a, <i32 42, i32 11, i32 17, i32 6>
+  %zvector = zext <4 x i1> %vector to <4 x i32>
+  %d = extractelement <4 x i32> %zvector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_icmp_const(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_icmp_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    adrp x8, .LCPI19_0
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-SD-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov w0, v0.s[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_icmp_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI19_0
+; CHECK-GI-NEXT:    movi v2.4s, #1
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-GI-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = icmp sle <4 x i32> %a, <i32 42, i32 11, i32 17, i32 6>
+  %zvector = zext <4 x i1> %vector to <4 x i32>
+  %d = extractelement <4 x i32> %zvector, i32 2
+  ret i32 %d
+}
+
+define i32 @extract_v4float_fcmp(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4float_fcmp:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    movi v1.4s, #1
+; CHECK-SD-NEXT:    fcmeq v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4float_fcmp:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    fmov v1.4s, #1.00000000
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    fcmge v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = fcmp uno <4 x float> %a, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %zvector = zext <4 x i1> %vector to <4 x i32>
+  %d = extractelement <4 x i32> %zvector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4float_fcmp_const(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4float_fcmp_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.4s, #1
+; CHECK-SD-NEXT:    fcmeq v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    mov w0, v0.s[1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4float_fcmp_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov v1.4s, #1.00000000
+; CHECK-GI-NEXT:    fcmge v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[1]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = fcmp uno <4 x float> %a, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %zvector = zext <4 x i1> %vector to <4 x i32>
+  %d = extractelement <4 x i32> %zvector, i32 1
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_select(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %cond) {
+; CHECK-SD-LABEL: extract_v4i32_select:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-SD-NEXT:    adrp x8, .LCPI22_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI22_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-SD-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_select:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI22_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI22_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #31
+; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = select <4 x i1> %cond, <4 x i32> %a, <4 x i32> <i32 42, i32 11, i32 17, i32 6>
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_select_const(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %cond) {
+; CHECK-SD-LABEL: extract_v4i32_select_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-SD-NEXT:    movi v2.4s, #17
+; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-SD-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    mov w0, v0.s[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_select_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI23_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #31
+; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = select <4 x i1> %cond, <4 x i32> %a, <4 x i32> <i32 42, i32 11, i32 17, i32 6>
+  %d = extractelement <4 x i32> %vector, i32 2
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_abs(<4 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_abs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    frintp v0.4s, v0.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    frintm v0.4s, v0.4s
+; CHECK-SD-NEXT:    fabs v0.4s, v0.4s
+; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT:    abs v0.4s, v0.4s
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_abs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    frintp v0.4s, v0.4s
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x3
+; CHECK-GI-NEXT:    frintm v0.4s, v0.4s
+; CHECK-GI-NEXT:    fabs v0.4s, v0.4s
+; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %ceil = call <4 x float> @llvm.ceil.v4float(<4 x float> %a)
+  %floor = call <4 x float> @llvm.floor.v4float(<4 x float> %ceil)
+  %fabs = call <4 x float> @llvm.fabs.v4float(<4 x float> %floor)
+  %abs = fptosi <4 x float> %fabs to <4 x i32>
+  %vector = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %abs, i1 0)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_abs_const(<4 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_abs_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w0, #4 // =0x4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_abs_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI25_0
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI25_0]
+; CHECK-GI-NEXT:    frintp v0.4s, v0.4s
+; CHECK-GI-NEXT:    frintm v0.4s, v0.4s
+; CHECK-GI-NEXT:    fabs v0.4s, v0.4s
+; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    mov s0, v0.s[1]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %ceil = call <4 x float> @llvm.ceil.v4float(<4 x float> <float 1.0, float 4.0, float 3.0, float 2.0>)
+  %floor = call <4 x float> @llvm.floor.v4float(<4 x float> %ceil)
+  %fabs = call <4 x float> @llvm.fabs.v4float(<4 x float> %floor)
+  %abs = fptosi <4 x float> %fabs to <4 x i32>
+  %vector = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %abs, i1 0)
+  %d = extractelement <4 x i32> %vector, i32 1
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_abs_half_const(<4 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_abs_half_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI26_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_abs_half_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI26_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    frintp v0.4s, v0.4s
+; CHECK-GI-NEXT:    frintm v0.4s, v0.4s
+; CHECK-GI-NEXT:    fabs v0.4s, v0.4s
+; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %ceil = call <4 x float> @llvm.ceil.v4float(<4 x float> <float 1.0, float 4.0, float 3.0, float 2.0>)
+  %floor = call <4 x float> @llvm.floor.v4float(<4 x float> %ceil)
+  %fabs = call <4 x float> @llvm.fabs.v4float(<4 x float> %floor)
+  %abs = fptosi <4 x float> %fabs to <4 x i32>
+  %vector = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %abs, i1 0)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vector_insert(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_vector_insert:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    bfi x8, x0, #2, #2
+; CHECK-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-NEXT:    str q1, [sp]
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %a, <2 x i32> %b, i64 0)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vector_insert_const(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_vector_insert_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov w0, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %a, <2 x i32> %b, i64 0)
+  %d = extractelement <4 x i32> %vector, i32 1
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vector_extract(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_vector_extract:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x8, x0, #2, #2
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %a, i64 0)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vector_extract_const(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_vector_extract_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %a, i64 0)
+  %d = extractelement <4 x i32> %vector, i32 0
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_load(<4 x i32> %a, <2 x i32> %b, i32 %c, ptr %arg) {
+; CHECK-SD-LABEL: extract_v4i32_load:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    and x8, x0, #0x3
+; CHECK-SD-NEXT:    ldr w0, [x1, x8, lsl #2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_load:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    ldr w0, [x1, x8, lsl #2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = load  <4 x i32>, ptr %arg
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_load_const(<4 x i32> %a, <2 x i32> %b, i32 %c, ptr %arg) {
+; CHECK-LABEL: extract_v4i32_load_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr w0, [x1]
+; CHECK-NEXT:    ret
+entry:
+  %vector = load  <4 x i32>, ptr %arg
+  %d = extractelement <4 x i32> %vector, i32 0
+  ret i32 %d
+}
+
+define double @extract_v4i32_bitcast(<4 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_bitcast:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    ldr d0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_bitcast:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    ldr d0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = bitcast <4 x i32> %a to <2 x double>
+  %d = extractelement <2 x double> %vector, i32 %c
+  ret double %d
+}
+
+define double @extract_v4i32_bitcast_const(<4 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_v4i32_bitcast_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+entry:
+  %vector = bitcast <4 x i32> %a to <2 x double>
+  %d = extractelement <2 x double> %vector, i32 0
+  ret double %d
+}
+
+define i32 @extract_v4i32_shuffle(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_shuffle:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    mov v1.s[3], v0.s[3]
+; CHECK-SD-NEXT:    str q1, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_shuffle:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 3>
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_shuffle_const(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_shuffle_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov w0, s1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_shuffle_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 3>
+  %d = extractelement <4 x i32> %vector, i32 2
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_splat(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_splat:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    movi v0.4s, #11
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_splat:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    movi v0.4s, #11
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <4 x i32> splat (i32 11), i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_splat_const(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_splat_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, #11 // =0xb
+; CHECK-NEXT:    ret
+entry:
+  %d = extractelement <4 x i32> splat (i32 11), i32 0
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vp_add(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %mask, i32 %evl) {
+; CHECK-SD-LABEL: extract_v4i32_vp_add:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_vp_add:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vp_add_const(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %mask, i32 %evl) {
+; CHECK-SD-LABEL: extract_v4i32_vp_add_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov w0, v0.s[3]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_vp_add_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov s0, v0.s[3]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+  %d = extractelement <4 x i32> %vector, i32 3
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_phi(i64 %val, i32  %limit, ptr %ptr) {
+; CHECK-SD-LABEL: extract_v4i32_phi:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    dup v1.2s, w0
+; CHECK-SD-NEXT:    adrp x8, .LCPI41_0
+; CHECK-SD-NEXT:    movi v0.2s, #16
+; CHECK-SD-NEXT:    ldr d2, [x8, :lo12:.LCPI41_0]
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:  .LBB41_1: // %loop
+; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT:    fmov w8, s1
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    cmp w8, w1
+; CHECK-SD-NEXT:    add w0, w8, #10
+; CHECK-SD-NEXT:    str w0, [x2, w8, sxtw #2]
+; CHECK-SD-NEXT:    b.lo .LBB41_1
+; CHECK-SD-NEXT:  // %bb.2: // %ret
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_phi:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI41_0
+; CHECK-GI-NEXT:    dup v0.2d, x0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI41_0]
+; CHECK-GI-NEXT:    add v1.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    movi v0.2s, #16
+; CHECK-GI-NEXT:    xtn v1.2s, v1.2d
+; CHECK-GI-NEXT:  .LBB41_1: // %loop
+; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    cmp w8, w1
+; CHECK-GI-NEXT:    add w0, w9, #10
+; CHECK-GI-NEXT:    str w0, [x2, w8, sxtw #2]
+; CHECK-GI-NEXT:    b.lo .LBB41_1
+; CHECK-GI-NEXT:  // %bb.2: // %ret
+; CHECK-GI-NEXT:    ret
+entry:
+  %tempvector = insertelement <2 x i64> undef, i64 %val, i32 0
+  %vector = shufflevector <2 x i64> %tempvector, <2 x i64> undef, <2 x i32> zeroinitializer
+  %0 = add <2 x i64> %vector, <i64 1, i64 2>
+  %1 = trunc <2 x i64> %0 to <2 x i32>
+  br label %loop
+
+loop:
+  %2 = phi <2 x i32> [ %1, %entry ], [ %inc, %loop ]
+  %elt = extractelement <2 x i32> %2, i32 0
+  %end = icmp ult i32 %elt, %limit
+  %3 = add i32 10, %elt
+  %4 = sext i32 %elt to i64
+  %5 = getelementptr i32, ptr %ptr, i64 %4
+  store i32 %3, ptr %5
+  %inc = add <2 x i32> %2, <i32 16, i32 16>
+  br i1 %end, label %loop, label %ret
+
+ret:
+  ret i32 %3
+}
+
+
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 2d0b5574cdd7b..9916aeeab1cad 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1108,61 +1108,54 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    mov w12, #31 // =0x1f
-; CHECK-GI-FP16-NEXT:    ldr s4, [sp]
-; CHECK-GI-FP16-NEXT:    fmov s2, w12
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov w10, #31 // =0x1f
+; CHECK-GI-FP16-NEXT:    ldr s3, [sp]
+; CHECK-GI-FP16-NEXT:    fmov s1, w10
 ; CHECK-GI-FP16-NEXT:    fmov s6, w0
-; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #8]
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #8]
 ; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #24]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #32]
-; CHECK-GI-FP16-NEXT:    umov w9, v1.h[4]
-; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
-; CHECK-GI-FP16-NEXT:    umov w11, v1.h[5]
-; CHECK-GI-FP16-NEXT:    umov w10, v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w12
-; CHECK-GI-FP16-NEXT:    umov w13, v1.h[2]
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[4]
+; CHECK-GI-FP16-NEXT:    umov w9, v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
 ; CHECK-GI-FP16-NEXT:    mov v6.s[1], w1
 ; CHECK-GI-FP16-NEXT:    mov v7.s[1], v16.s[0]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT:    fmov s3, w9
-; CHECK-GI-FP16-NEXT:    fmov s0, w8
-; CHECK-GI-FP16-NEXT:    umov w8, v1.h[6]
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w12
-; CHECK-GI-FP16-NEXT:    umov w9, v1.h[3]
+; CHECK-GI-FP16-NEXT:    fmov s2, w8
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    mov v6.s[2], w2
 ; CHECK-GI-FP16-NEXT:    mov v7.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w11
-; CHECK-GI-FP16-NEXT:    mov v0.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov w10, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    fmov s1, w10
-; CHECK-GI-FP16-NEXT:    neg v17.4s, v2.4s
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov w9, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT:    fmov s5, w9
+; CHECK-GI-FP16-NEXT:    neg v17.4s, v1.4s
+; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v6.s[3], w3
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s3
+; CHECK-GI-FP16-NEXT:    fmov s3, w7
+; CHECK-GI-FP16-NEXT:    mov v5.s[1], w9
+; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s4
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #16]
+; CHECK-GI-FP16-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-FP16-NEXT:    fmov s2, w4
+; CHECK-GI-FP16-NEXT:    mov v5.s[2], w9
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w5
 ; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
+; CHECK-GI-FP16-NEXT:    sshl v1.4s, v1.4s, v17.4s
 ; CHECK-GI-FP16-NEXT:    fmov w8, s4
-; CHECK-GI-FP16-NEXT:    fmov s4, w7
-; CHECK-GI-FP16-NEXT:    mov v0.s[2], w13
-; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s5
-; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #16]
-; CHECK-GI-FP16-NEXT:    ushl v2.4s, v3.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    fmov s3, w4
-; CHECK-GI-FP16-NEXT:    mov v0.s[3], w9
-; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w5
-; CHECK-GI-FP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-FP16-NEXT:    sshl v2.4s, v2.4s, v17.4s
-; CHECK-GI-FP16-NEXT:    fmov w8, s5
-; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    eor v1.16b, v2.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    mov v3.s[2], w6
-; CHECK-GI-FP16-NEXT:    mov v4.s[3], w8
-; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    and v1.16b, v7.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    and v2.16b, v3.16b, v2.16b
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-GI-FP16-NEXT:    orr v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    eor v4.16b, v1.16b, v5.16b
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w6
+; CHECK-GI-FP16-NEXT:    mov v3.s[3], w8
+; CHECK-GI-FP16-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    and v2.16b, v7.16b, v4.16b
+; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    orr v1.16b, v1.16b, v2.16b
 ; CHECK-GI-FP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-FP16-NEXT:    mov s4, v0.s[3]
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index 519a2978d8604..93d3d96d67b65 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -36,6 +36,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @exp_v1f64(<1 x double> %x) {
+; CHECK-LABEL: exp_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl exp
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.exp.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @exp_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: exp_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -1293,6 +1306,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @exp2_v1f64(<1 x double> %x) {
+; CHECK-LABEL: exp2_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl exp2
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.exp2.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @exp2_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: exp2_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -2550,6 +2576,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @log_v1f64(<1 x double> %x) {
+; CHECK-LABEL: log_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl log
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.log.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @log_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: log_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -3807,6 +3846,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @log2_v1f64(<1 x double> %x) {
+; CHECK-LABEL: log2_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl log2
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.log2.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @log2_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: log2_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -5064,6 +5116,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @log10_v1f64(<1 x double> %x) {
+; CHECK-LABEL: log10_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl log10
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.log10.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @log10_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: log10_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 1b1cfead0f97a..2ad5623b65517 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -1,29 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_v2f16_v4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v2.2s, v0.s[1]
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    mov v1.h[0], v0.h[1]
-; CHECK-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vector_deinterleave_v2f16_v4f16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    dup v2.2s, v0.s[1]
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.h[0], v0.h[1]
+; CHECK-SD-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    uzp2 v1.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    mov h0, v2.h[1]
+; CHECK-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT:    fmov d0, d2
+; CHECK-GI-NEXT:    ret
   %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
   ret {<2 x half>, <2 x half>}   %retval
 }
 
 define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_v4f16_v8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    uzp1 v2.4h, v0.4h, v1.4h
-; CHECK-NEXT:    uzp2 v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    fmov d0, d2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vector_deinterleave_v4f16_v8f16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    uzp1 v2.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    fmov d0, d2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_deinterleave_v4f16_v8f16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    uzp2 v1.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT:    fmov d0, d2
+; CHECK-GI-NEXT:    ret
   %retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
   ret {<4 x half>, <4 x half>}   %retval
 }
@@ -40,13 +61,21 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %v
 }
 
 define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) {
-; CHECK-LABEL: vector_deinterleave_v2f32_v4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    zip1 v2.2s, v0.2s, v1.2s
-; CHECK-NEXT:    zip2 v1.2s, v0.2s, v1.2s
-; CHECK-NEXT:    fmov d0, d2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vector_deinterleave_v2f32_v4f32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    zip1 v2.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    zip2 v1.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    fmov d0, d2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_deinterleave_v2f32_v4f32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uzp1 v2.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT:    fmov d0, d2
+; CHECK-GI-NEXT:    ret
   %retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
   ret {<2 x float>, <2 x float>}   %retval
 }
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index 071c1ffdbb45d..eb81aff33e496 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
 ; CHECK-LABEL: interleave2_v4f16:
@@ -11,15 +12,22 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
 }
 
 define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) {
-; CHECK-LABEL: interleave2_v8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    adrp x8, .LCPI1_0
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: interleave2_v8f16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    adrp x8, .LCPI1_0
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: interleave2_v8f16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    zip1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %retval = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
   ret <8 x half> %retval
 }
@@ -36,14 +44,21 @@ define <16 x half> @interleave2_v16f16(<8 x half> %vec0, <8 x half> %vec1) {
 }
 
 define <4 x float> @interleave2_v4f32(<2 x float> %vec0, <2 x float> %vec1) {
-; CHECK-LABEL: interleave2_v4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    rev64 v1.4s, v0.4s
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: interleave2_v4f32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    rev64 v1.4s, v0.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: interleave2_v4f32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %retval = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
   ret <4 x float> %retval
 }
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
index f80a8df18a03a..79502121f3689 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
@@ -1477,6 +1477,61 @@ define fp128 @fpext_f128_f64(double %x) #0 {
   ret fp128 %val
 }
 
+; CHECK-LABEL: sin_v1f64:
+; CHECK: bl sin
+define <1 x double> @sin_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.sin.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: cos_v1f64:
+; CHECK: bl cos
+define <1 x double> @cos_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.cos.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: pow_v1f64:
+; CHECK: bl pow
+define <1 x double> @pow_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.pow.v1f64(<1 x double> %x, <1 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: log_v1f64:
+; CHECK: bl log
+define <1 x double> @log_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.log.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: log2_v1f64:
+; CHECK: bl log2
+define <1 x double> @log2_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.log2.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: log10_v1f64:
+; CHECK: bl log10
+define <1 x double> @log10_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.log10.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: exp_v1f64:
+; CHECK: bl exp
+define <1 x double> @exp_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.exp.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: exp2_v1f64:
+; CHECK: bl exp2
+define <1 x double> @exp2_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.exp2.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
 
 attributes #0 = { strictfp }
 
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index c2ad1aafd65fc..8d40121ad4543 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -37,6 +37,21 @@ entry:
   ret half %c
 }
 
+define <1 x double> @pow_v1f64(<1 x double> %x) {
+; CHECK-LABEL: pow_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    bl pow
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.pow.v1f64(<1 x double> %x, <1 x double> <double 3.140000e+00>)
+  ret <1 x double> %c
+}
+
 define <2 x double> @pow_v2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SD-LABEL: pow_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index 2ab1610edcc7f..0b34f9570fa77 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -36,6 +36,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @sin_v1f64(<1 x double> %x) {
+; CHECK-LABEL: sin_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl sin
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.sin.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @sin_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: sin_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -1293,6 +1306,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @cos_v1f64(<1 x double> %x) {
+; CHECK-LABEL: cos_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl cos
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.cos.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @cos_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: cos_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index 2269d75cdbb9e..e12502980790d 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -329,9 +329,17 @@ define <8 x i16> @hadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
   ret <8 x i16> %result
 }
 
-
-
-
+define <8 x i16> @sub_fixedwidth_v4i32(<8 x i16> %a0, <8 x i16> %a1)  {
+; CHECK-LABEL: sub_fixedwidth_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %or = or <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a0, %a1
+  %srl = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <8 x i16> %or, %srl
+  ret <8 x i16> %res
+}
 
 define <8 x i16> @rhaddu_base(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: rhaddu_base:
@@ -859,6 +867,18 @@ define <4 x i32> @urhadd_v4i32(<4 x i32> %x) {
   ret <4 x i32> %r
 }
 
+define <8 x i16> @uhadd_fixedwidth_v4i32(<8 x i16> %a0, <8 x i16> %a1)  {
+; CHECK-LABEL: uhadd_fixedwidth_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %and = and <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a0, %a1
+  %srl = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <8 x i16> %and, %srl
+  ret <8 x i16> %res
+}
+
 declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
 declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
 declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll
index 458bd7eeba16c..834417b98743a 100644
--- a/llvm/test/CodeGen/AArch64/isinf.ll
+++ b/llvm/test/CodeGen/AArch64/isinf.ll
@@ -58,22 +58,14 @@ define i32 @replace_isinf_call_f64(double %x) {
 define i32 @replace_isinf_call_f128(fp128 %x) {
 ; CHECK-LABEL: replace_isinf_call_f128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    ldrb w8, [sp, #15]
-; CHECK-NEXT:    and w8, w8, #0x7f
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    ldr q0, [sp]
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT:    bl __eqtf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    str q0, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldp x9, x8, [sp], #16
+; CHECK-NEXT:    and x8, x8, #0x7fffffffffffffff
+; CHECK-NEXT:    eor x8, x8, #0x7fff000000000000
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
   %abs = tail call fp128 @llvm.fabs.f128(fp128 %x)
   %cmpinf = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 2164c2aad2011..ceac37f91238a 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -5521,7 +5521,8 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-FP16-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-GI-FP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
@@ -5580,12 +5581,13 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    fmov w9, s0
+; CHECK-GI-FP16-NEXT:    and w9, w9, #0xff
+; CHECK-GI-FP16-NEXT:    and w8, w8, #0xff
+; CHECK-GI-FP16-NEXT:    ucvtf h0, w9
+; CHECK-GI-FP16-NEXT:    ucvtf h1, w8
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 56f4272c4363c..51d17ad0644f1 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -532,11 +532,18 @@ define double @exp10_f64(double %x) {
   ret double %r
 }
 
-; FIXME: Broken
-; define <1 x double> @exp10_v1f64(<1 x double> %x) {
-;   %r = call <1 x double> @llvm.exp10.v1f64(<1 x double> %x)
-;   ret <1 x double> %r
-; }
+define <1 x double> @exp10_v1f64(<1 x double> %x) {
+; CHECK-LABEL: exp10_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl exp10
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %r = call <1 x double> @llvm.exp10.v1f64(<1 x double> %x)
+  ret <1 x double> %r
+}
 
 define <2 x double> @exp10_v2f64(<2 x double> %x) {
 ; SDAG-LABEL: exp10_v2f64:
diff --git a/llvm/test/CodeGen/AArch64/overflow.ll b/llvm/test/CodeGen/AArch64/overflow.ll
index 444aaeb0f3fe7..1fd60c0309790 100644
--- a/llvm/test/CodeGen/AArch64/overflow.ll
+++ b/llvm/test/CodeGen/AArch64/overflow.ll
@@ -19,20 +19,12 @@ entry:
 }
 
 define zeroext i1 @saddo1.i32.fold(i32 %v1, i32 %v2, ptr %res) {
-; SDAG-LABEL: saddo1.i32.fold:
-; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    mov w8, #20 // =0x14
-; SDAG-NEXT:    mov w0, wzr
-; SDAG-NEXT:    str w8, [x2]
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: saddo1.i32.fold:
-; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    mov w8, #9 // =0x9
-; GISEL-NEXT:    adds w8, w8, #11
-; GISEL-NEXT:    cset w0, vs
-; GISEL-NEXT:    str w8, [x2]
-; GISEL-NEXT:    ret
+; CHECK-LABEL: saddo1.i32.fold:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #20 // =0x14
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    str w8, [x2]
+; CHECK-NEXT:    ret
 entry:
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 9, i32 11)
   %val = extractvalue {i32, i1} %t, 0
@@ -123,18 +115,11 @@ entry:
 }
 
 define zeroext i1 @saddo.canon.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; SDAG-LABEL: saddo.canon.i32:
-; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    mov w0, wzr
-; SDAG-NEXT:    str w4, [x5]
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: saddo.canon.i32:
-; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    adds w8, wzr, w4
-; GISEL-NEXT:    cset w0, vs
-; GISEL-NEXT:    str w8, [x5]
-; GISEL-NEXT:    ret
+; CHECK-LABEL: saddo.canon.i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    str w4, [x5]
+; CHECK-NEXT:    ret
 entry:
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 0, i32 %v5)
   %val = extractvalue {i32, i1} %t, 0
@@ -143,13 +128,19 @@ entry:
   ret i1 %obit
 }
 define zeroext i1 @saddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; CHECK-LABEL: saddo.add.i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add w8, w4, #100
-; CHECK-NEXT:    subs w8, w8, #100
-; CHECK-NEXT:    cset w0, vs
-; CHECK-NEXT:    str w8, [x5]
-; CHECK-NEXT:    ret
+; SDAG-LABEL: saddo.add.i32:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    add w8, w4, #100
+; SDAG-NEXT:    subs w8, w8, #100
+; SDAG-NEXT:    cset w0, vs
+; SDAG-NEXT:    str w8, [x5]
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: saddo.add.i32:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    mov w0, wzr
+; GISEL-NEXT:    str w4, [x5]
+; GISEL-NEXT:    ret
 entry:
   %lhs = add nsw i32 %v5, 100
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %lhs, i32 -100)
@@ -160,13 +151,20 @@ entry:
 }
 
 define zeroext i1 @uaddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; CHECK-LABEL: uaddo.add.i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add w8, w4, #5
-; CHECK-NEXT:    adds w8, w8, #5
-; CHECK-NEXT:    cset w0, hs
-; CHECK-NEXT:    str w8, [x5]
-; CHECK-NEXT:    ret
+; SDAG-LABEL: uaddo.add.i32:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    add w8, w4, #5
+; SDAG-NEXT:    adds w8, w8, #5
+; SDAG-NEXT:    cset w0, hs
+; SDAG-NEXT:    str w8, [x5]
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: uaddo.add.i32:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    adds w8, w4, #10
+; GISEL-NEXT:    cset w0, hs
+; GISEL-NEXT:    str w8, [x5]
+; GISEL-NEXT:    ret
 entry:
   %lhs = add nuw i32 %v5, 5
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %lhs, i32 5)
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 61f04fbf0484f..3e0d5dd875097 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -280,13 +280,12 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    sxtb x8, w2
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
@@ -444,13 +443,12 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    sbfx x8, x2, #0, #10
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index d79f3ae11167f..b1131f287fe9a 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -202,7 +202,7 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-SD-NEXT:    ext v0.8b, v1.8b, v0.8b, #6
 ; CHECK-SD-NEXT:    zip1 v1.4h, v1.4h, v0.4h
 ; CHECK-SD-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    add sp, sp, #16
 ; CHECK-SD-NEXT:    ret
@@ -390,7 +390,7 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SD-NEXT:    dup v0.4h, v0.h[0]
-; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    add sp, sp, #16
 ; CHECK-SD-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 3fa1ee5b9b011..dba3227459b90 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -38,4 +38,43 @@ define void @streaming_compatible() #0 {
 
 declare void @non_streaming()
 
+
+; Verify that COALESCER_BARRIER is also supported without +sme.
+
+define void @streaming_compatible_arg(float %f) #0 {
+; CHECK-LABEL: streaming_compatible_arg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    tbz w19, #0, .LBB1_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT:    bl non_streaming
+; CHECK-NEXT:    tbz w19, #0, .LBB1_4
+; CHECK-NEXT:  // %bb.3:
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  call void @non_streaming(float %f)
+  ret void
+}
+
+
 attributes #0 = { nounwind "aarch64_pstate_sm_compatible" }
diff --git a/llvm/test/CodeGen/AArch64/sme-machine-licm-vg.mir b/llvm/test/CodeGen/AArch64/sme-machine-licm-vg.mir
new file mode 100644
index 0000000000000..e6cce9af0daf0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-machine-licm-vg.mir
@@ -0,0 +1,64 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=early-machinelicm %s -verify-machineinstrs -o - | FileCheck %s
+---
+name:            test_should_hoist_pfalse
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test_should_hoist_pfalse
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr64all = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr64all = COPY [[COPY]]
+  ; CHECK-NEXT:   [[PFALSE:%[0-9]+]]:ppr = PFALSE implicit $vg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr64common = PHI [[COPY2]], %bb.0, %5, %bb.1
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64sp = PHI [[COPY3]], %bb.0, %7, %bb.1
+  ; CHECK-NEXT:   STR_PXI [[PFALSE]], [[PHI]], 0
+  ; CHECK-NEXT:   [[SUBSXri:%[0-9]+]]:gpr64 = SUBSXri [[PHI1]], 1, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr64all = COPY [[SUBSXri]]
+  ; CHECK-NEXT:   [[INCD_XPiI:%[0-9]+]]:gpr64 = INCD_XPiI [[PHI]], 31, 1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gpr64all = COPY [[INCD_XPiI]]
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    successors: %bb.1
+    liveins: $x0, $x1
+
+    %5:gpr64 = COPY $x1
+    %4:gpr64 = COPY $x0
+    MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+    %6:gpr64all = COPY %4
+    %7:gpr64all = COPY %5
+
+  bb.1:
+    successors: %bb.2, %bb.1
+
+    %0:gpr64common = PHI %6, %bb.0, %3, %bb.1
+    %1:gpr64sp = PHI %7, %bb.0, %2, %bb.1
+    %8:ppr = PFALSE implicit $vg
+    STR_PXI killed %8, %0, 0
+    %9:gpr64 = SUBSXri %1, 1, 0, implicit-def $nzcv
+    %2:gpr64all = COPY %9
+    %10:gpr64 = INCD_XPiI %0, 31, 1
+    %3:gpr64all = COPY %10
+
+
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+    RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/soft-float-abi.ll b/llvm/test/CodeGen/AArch64/soft-float-abi.ll
new file mode 100644
index 0000000000000..291c3875c2488
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/soft-float-abi.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc --mtriple aarch64-none-eabi < %s -mattr=-fp-armv8 | FileCheck %s
+
+; See also clang/test/CodeGen/aarch64-soft-float-abi.c, which tests the clang
+; parts of the soft-float ABI.
+
+; FP types up to 64-bit are passed in a general purpose register.
+define half @test0(half %a, half %b)  {
+; CHECK-LABEL: test0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, w1
+; CHECK-NEXT:    ret
+entry:
+  ret half %b
+}
+
+define bfloat @test1(i32 %a, bfloat %b) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, w1
+; CHECK-NEXT:    ret
+entry:
+  ret bfloat %b
+}
+
+define float @test2(i64 %a, float %b) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, w1
+; CHECK-NEXT:    ret
+entry:
+  ret float %b
+}
+
+define double @test3(half %a, double %b) {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x0, x1
+; CHECK-NEXT:    ret
+entry:
+  ret double %b
+}
+
+; fp128 is passed in a pair of GPRs.
+define fp128 @test4(fp128 %a, fp128 %b) {
+; CHECK-LABEL: test4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x1, x3
+; CHECK-NEXT:    mov x0, x2
+; CHECK-NEXT:    ret
+entry:
+  ret fp128 %b
+}
+
+; fp128 is passed in an aligned pair of GPRs, leaving one register unused is
+; necessary.
+define fp128 @test5(float %a, fp128 %b) {
+; CHECK-LABEL: test5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x1, x3
+; CHECK-NEXT:    mov x0, x2
+; CHECK-NEXT:    ret
+entry:
+  ret fp128 %b
+}
+
+; If the alignment of an fp128 leaves a register unused, it remains unused even
+; if a later argument could fit in it.
+define i64 @test6(i64 %a, fp128 %b, i64 %c) {
+; CHECK-LABEL: test6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x0, x4
+; CHECK-NEXT:    ret
+entry:
+  ret i64 %c
+}
+
+; HFAs are all bit-casted to integer types in the frontend when using the
+; soft-float ABI, so they get passed in the same way as non-homeogeneous
+; aggregates. The IR is identical to the equivalent integer types, so nothing
+; to test here.
+
+; The PCS for vector and HVA types is not defined by the soft-float ABI because
+; these types are only defined by the ACLE when vector hardware is available,
+; so nothing to test here.
+
+; The front-end generates IR for va_arg which always reads from the integer
+; register save area, and never the floating-point register save area. The
+; layout of the va_list type remains the same, the floating-point related
+; fields are unused. The only change needed in the backend is  in va_start, to
+; not attempt to save the floating-point registers or set the FP fields in the
+; va_list.
+%struct.__va_list = type { ptr, ptr, ptr, i32, i32 }
+declare void @llvm.va_start(ptr)
+define double @test20(i32 %a, ...) {
+; CHECK-LABEL: test20:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    mov w8, #-56 // =0xffffffc8
+; CHECK-NEXT:    add x10, sp, #8
+; CHECK-NEXT:    add x9, sp, #96
+; CHECK-NEXT:    str x8, [sp, #88]
+; CHECK-NEXT:    add x10, x10, #56
+; CHECK-NEXT:    ldrsw x8, [sp, #88]
+; CHECK-NEXT:    stp x1, x2, [sp, #8]
+; CHECK-NEXT:    stp x3, x4, [sp, #24]
+; CHECK-NEXT:    stp x5, x6, [sp, #40]
+; CHECK-NEXT:    stp x7, x9, [sp, #56]
+; CHECK-NEXT:    str x10, [sp, #72]
+; CHECK-NEXT:    tbz w8, #31, .LBB7_3
+; CHECK-NEXT:  // %bb.1: // %vaarg.maybe_reg
+; CHECK-NEXT:    add w9, w8, #8
+; CHECK-NEXT:    cmn w8, #8
+; CHECK-NEXT:    str w9, [sp, #88]
+; CHECK-NEXT:    b.gt .LBB7_3
+; CHECK-NEXT:  // %bb.2: // %vaarg.in_reg
+; CHECK-NEXT:    ldr x9, [sp, #72]
+; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    b .LBB7_4
+; CHECK-NEXT:  .LBB7_3: // %vaarg.on_stack
+; CHECK-NEXT:    ldr x8, [sp, #64]
+; CHECK-NEXT:    add x9, x8, #8
+; CHECK-NEXT:    str x9, [sp, #64]
+; CHECK-NEXT:  .LBB7_4: // %vaarg.end
+; CHECK-NEXT:    ldr x0, [x8]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+entry:
+  %vl = alloca %struct.__va_list, align 8
+  call void @llvm.va_start(ptr nonnull %vl)
+  %gr_offs_p = getelementptr inbounds %struct.__va_list, ptr %vl, i64 0, i32 3
+  %gr_offs = load i32, ptr %gr_offs_p, align 8
+  %0 = icmp sgt i32 %gr_offs, -1
+  br i1 %0, label %vaarg.on_stack, label %vaarg.maybe_reg
+
+vaarg.maybe_reg:                                  ; preds = %entry
+  %new_reg_offs = add nsw i32 %gr_offs, 8
+  store i32 %new_reg_offs, ptr %gr_offs_p, align 8
+  %inreg = icmp slt i32 %gr_offs, -7
+  br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack
+
+vaarg.in_reg:                                     ; preds = %vaarg.maybe_reg
+  %reg_top_p = getelementptr inbounds %struct.__va_list, ptr %vl, i64 0, i32 1
+  %reg_top = load ptr, ptr %reg_top_p, align 8
+  %1 = sext i32 %gr_offs to i64
+  %2 = getelementptr inbounds i8, ptr %reg_top, i64 %1
+  br label %vaarg.end
+
+vaarg.on_stack:                                   ; preds = %vaarg.maybe_reg, %entry
+  %stack = load ptr, ptr %vl, align 8
+  %new_stack = getelementptr inbounds i8, ptr %stack, i64 8
+  store ptr %new_stack, ptr %vl, align 8
+  br label %vaarg.end
+
+vaarg.end:                                        ; preds = %vaarg.on_stack, %vaarg.in_reg
+  %vaargs.addr = phi ptr [ %2, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ]
+  %3 = load double, ptr %vaargs.addr, align 8
+  ret double %3
+}
+
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-no-scratch-reg.mir b/llvm/test/CodeGen/AArch64/stack-probing-no-scratch-reg.mir
index f2d79bd720690..a9c9b5ff60e45 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-no-scratch-reg.mir
+++ b/llvm/test/CodeGen/AArch64/stack-probing-no-scratch-reg.mir
@@ -29,6 +29,7 @@ tracksRegLiveness: true
 liveins:
   - { reg: '$w0', virtual-reg: '' }
 frameInfo:
+  adjustsStack:    true
   localFrameSize:  150000
 stack:
   - { id: 0, name: a, type: default, offset: 0, size: 150000, alignment: 8,
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir b/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir
index 83aa90d389a4a..985ec35213970 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir
+++ b/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir
@@ -31,6 +31,7 @@ tracksRegLiveness: true
 liveins:
   - { reg: '$w0', virtual-reg: '' }
 frameInfo:
+  adjustsStack:    true
   localFrameSize:  150000
 stack:
   - { id: 0, name: a, type: default, offset: 0, size: 150000, alignment: 8,
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll b/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll
index d8969fc9bebdb..22d177ca3267e 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll
@@ -20,10 +20,10 @@ entry:
 ; CHECK-LABEL: define void @OneVarNoInit(
 ; CHECK-DAG:  [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
 ; CHECK-DAG:  [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], {{.*}}, i64 0)
-; CHECK-DAG:  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TX]])
+; CHECK-DAG:  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[X]])
 ; CHECK-DAG:  call void @llvm.aarch64.settag(ptr [[TX]], i64 16)
 ; CHECK-DAG:  call void @use(ptr nonnull [[TX]])
-; CHECK-DAG:  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[TX]])
+; CHECK-DAG:  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[X]])
 
 define void @OneVarInitConst() sanitize_memtag {
 entry:
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-stack-coloring.ll b/llvm/test/CodeGen/AArch64/stack-tagging-stack-coloring.ll
index 6eb72013fb0ed..81349620fb772 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-stack-coloring.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-stack-coloring.ll
@@ -1,20 +1,20 @@
 ; Test that storage for allocas with disjoint lifetimes is reused with stack
 ; tagging.
 
-; RUN: opt -S -aarch64-stack-tagging %s -o - | \
-; RUN:   llc -no-stack-coloring=false -o - | \
+; RUN: opt -S -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 %s -o - | \
+; RUN:   llc --mattr=+mte -no-stack-coloring=false -o - | \
 ; RUN:   FileCheck %s --check-prefix=COLOR
-; RUN: opt -S -aarch64-stack-tagging %s -o - | \
-; RUN:   llc -no-stack-coloring=true -o - | \
+; RUN: opt -S -aarch64-stack-tagging %s -stack-tagging-use-stack-safety=0 -o - | \
+; RUN:   llc --mattr=+mte -no-stack-coloring=true -o - | \
 ; RUN:   FileCheck %s --check-prefix=NOCOLOR
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-android29"
+target triple = "aarch64"
 
-; COLOR: sub	sp, sp, #192
-; NOCOLOR: sub	sp, sp, #320
+; COLOR: sub	sp, sp, #208
+; NOCOLOR: sub	sp, sp, #336
 
-define i32 @myCall_w2(i32 %in) sanitize_hwaddress {
+define i32 @myCall_w2(i32 %in) sanitize_memtag {
 entry:
   %a = alloca [17 x ptr], align 8
   %a2 = alloca [16 x ptr], align 8
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
index 06f8cd5241ebf..aa9cccc58712d 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
@@ -27,7 +27,7 @@ S1:
 ; CHECK: call void @llvm.aarch64.settag(ptr %w, i64 48)
 ; CHECK-NOT: settag{{.*}}%v
   call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %w) #1
-; CHECK: call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %w.tag)
+; CHECK: call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %w)
   %b1 = icmp eq i32 %t1, 0
   br i1 %b1, label %S2, label %S3
 ; CHECK-NOT: settag
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 66b49466cc736..66ef436f48c63 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4,11 +4,6 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT
 
-; CHECK-GI-BASE:        warning: Instruction selection used fallback path for test_udot_v24i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_udot_v48i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_sdot_v24i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_sdot_v48i8
-
 define i32 @addv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: addv_v2i32:
 ; CHECK:       // %bb.0: // %entry
@@ -2070,126 +2065,50 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-BASE:       // %bb.0: // %entry
 ; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
 ; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
-; CHECK-GI-BASE-NEXT:    ldr d4, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr d5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
 ; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v4.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    umlal v6.4s, v4.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    umlal v6.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v6.4s, v5.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v5.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    umull v7.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull v0.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s2
+; CHECK-GI-BASE-NEXT:    fmov w9, s3
+; CHECK-GI-BASE-NEXT:    fmov w10, s4
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT:    fmov d3, d3
-; CHECK-GI-DOT-NEXT:    fmov d4, d4
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    udot v5.4s, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    udot v1.4s, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -2257,243 +2176,91 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldp q0, q4, [x1]
-; CHECK-GI-BASE-NEXT:    ldr q2, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldp q1, q3, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT:    ushll2 v16.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v17.8h, v7.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v5.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    umull2 v18.4s, v6.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    umull v19.4s, v0.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umull v5.4s, v6.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v7.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v3.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v7.8h, v4.16b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v18.4s, v17.8h, v16.8h
-; CHECK-GI-BASE-NEXT:    umlal v5.4s, v17.4h, v16.4h
-; CHECK-GI-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v18.4s, v7.8h, v6.8h
-; CHECK-GI-BASE-NEXT:    umlal v5.4s, v7.4h, v6.4h
-; CHECK-GI-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q17, [x0, #32]
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v16.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    umull v18.4s, v4.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    umull2 v19.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    umull v20.4s, v7.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    umull2 v7.4s, v7.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v17.8h, v17.16b, #0
+; CHECK-GI-BASE-NEXT:    addv s16, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    umull v18.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s3, v19.4s
+; CHECK-GI-BASE-NEXT:    umull v19.4s, v5.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v5.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    umull v20.4s, v6.4h, v17.4h
+; CHECK-GI-BASE-NEXT:    umull2 v6.4s, v6.8h, v17.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s16
+; CHECK-GI-BASE-NEXT:    fmov w9, s4
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    addv s3, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    addv s4, v19.4s
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    addv s0, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v20.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s2
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s7
+; CHECK-GI-BASE-NEXT:    add w9, w10, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w12
+; CHECK-GI-BASE-NEXT:    fmov w11, s4
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w10, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w9, w9, w10
+; CHECK-GI-BASE-NEXT:    fmov w10, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
-; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
-; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
-; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
-; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
-; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
-; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
-; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
-; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
-; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
-; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
-; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
-; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
-; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
-; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
-; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    udot v7.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    udot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
+; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    udot v1.4s, v6.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    udot v2.4s, v16.16b, v7.16b
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
-; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
 ; CHECK-GI-DOT-NEXT:    fmov w8, s0
 ; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    fmov w10, s2
 ; CHECK-GI-DOT-NEXT:    add w8, w8, w9
-; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    fmov w9, s2
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
 ; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1
@@ -2648,126 +2415,50 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-BASE:       // %bb.0: // %entry
 ; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
 ; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
-; CHECK-GI-BASE-NEXT:    ldr d4, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr d5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
 ; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    smull v6.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v4.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    smlal v6.4s, v4.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    smlal v6.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    smull v6.4s, v5.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v5.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    smull2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    smull v7.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s2
+; CHECK-GI-BASE-NEXT:    fmov w9, s3
+; CHECK-GI-BASE-NEXT:    fmov w10, s4
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT:    fmov d3, d3
-; CHECK-GI-DOT-NEXT:    fmov d4, d4
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
-; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    sdot v5.4s, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    sdot v1.4s, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -2835,243 +2526,91 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldp q0, q4, [x1]
-; CHECK-GI-BASE-NEXT:    ldr q2, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldp q1, q3, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT:    sshll2 v16.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v17.8h, v7.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v5.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    smull2 v18.4s, v6.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    smull v19.4s, v0.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smull v5.4s, v6.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v7.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v3.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v7.8h, v4.16b, #0
-; CHECK-GI-BASE-NEXT:    smlal2 v18.4s, v17.8h, v16.8h
-; CHECK-GI-BASE-NEXT:    smlal v5.4s, v17.4h, v16.4h
-; CHECK-GI-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    smlal2 v18.4s, v7.8h, v6.8h
-; CHECK-GI-BASE-NEXT:    smlal v5.4s, v7.4h, v6.4h
-; CHECK-GI-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q17, [x0, #32]
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v16.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    smull v18.4s, v4.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    smull2 v19.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    smull v20.4s, v7.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    smull2 v7.4s, v7.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v17.8h, v17.16b, #0
+; CHECK-GI-BASE-NEXT:    addv s16, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    smull v18.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s3, v19.4s
+; CHECK-GI-BASE-NEXT:    smull v19.4s, v5.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v5.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    smull v20.4s, v6.4h, v17.4h
+; CHECK-GI-BASE-NEXT:    smull2 v6.4s, v6.8h, v17.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s16
+; CHECK-GI-BASE-NEXT:    fmov w9, s4
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    addv s3, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    addv s4, v19.4s
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    addv s0, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v20.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s2
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s7
+; CHECK-GI-BASE-NEXT:    add w9, w10, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w12
+; CHECK-GI-BASE-NEXT:    fmov w11, s4
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w10, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w9, w9, w10
+; CHECK-GI-BASE-NEXT:    fmov w10, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
-; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
-; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
-; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
-; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
-; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
-; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
-; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
-; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
-; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
-; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
-; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
-; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
-; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
-; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
-; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    sdot v7.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    sdot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
+; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    sdot v1.4s, v6.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    sdot v2.4s, v16.16b, v7.16b
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
-; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
 ; CHECK-GI-DOT-NEXT:    fmov w8, s0
 ; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    fmov w10, s2
 ; CHECK-GI-DOT-NEXT:    add w8, w8, w9
-; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    fmov w9, s2
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
 ; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1
diff --git a/llvm/test/CodeGen/AArch64/wrong-callee-save-size-after-livedebugvariables.mir b/llvm/test/CodeGen/AArch64/wrong-callee-save-size-after-livedebugvariables.mir
index 53a8612a7fae7..8e1142474447e 100644
--- a/llvm/test/CodeGen/AArch64/wrong-callee-save-size-after-livedebugvariables.mir
+++ b/llvm/test/CodeGen/AArch64/wrong-callee-save-size-after-livedebugvariables.mir
@@ -64,6 +64,7 @@
 name:            foo
 tracksRegLiveness: true
 frameInfo:
+  adjustsStack:    true
   hasCalls:        true
 fixedStack:      []
 stack:
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 54b29be2132cd..716d2398996be 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -305,15 +305,14 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d1, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT:    movi v0.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    and x8, x2, #0xff
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v1.d[1], x1
-; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -470,15 +469,14 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI27_0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI27_0]
 ; CHECK-GI-NEXT:    and x8, x2, #0x3ff
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index d36f5c0ea89d9..a6f9bb7ee055d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4142,11 +4142,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4162,7 +4162,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4179,7 +4179,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX6-NEXT:    s_ashr_i32 s5, s7, 15
-; GFX6-NEXT:    s_add_u32 s2, s2, 0xffff8000
+; GFX6-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
@@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX8-NEXT:    s_ashr_i32 s5, s7, 15
-; GFX8-NEXT:    s_add_u32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
@@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX9-LABEL: saddsat_i48_vs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4529,11 +4529,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4546,7 +4546,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4560,7 +4560,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX6-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
@@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX8-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX8-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
@@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -4866,21 +4866,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v7, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[6:7]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, 0x80000000, v2
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4896,10 +4895,10 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[4:5]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v12
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT:    v_add_co_u32 v3, s7, 0x80000000, v4
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
@@ -4921,8 +4920,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, 0, v[6:7]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v12
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s2, s1
@@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
@@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s0
@@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
@@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
@@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
@@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s0
@@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX10-NEXT:    s_add_u32 s0, s2, s6
 ; GFX10-NEXT:    s_addc_u32 s1, s3, s7
@@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], 0
 ; GFX11-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX11-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX11-NEXT:    s_add_u32 s0, s2, s6
 ; GFX11-NEXT:    s_addc_u32 s1, s3, s7
@@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5132,7 +5131,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
@@ -5179,7 +5178,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
@@ -5226,7 +5225,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
@@ -5269,7 +5268,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
-; GFX10-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5310,7 +5309,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX11-NEXT:    s_ashr_i32 s0, s9, 31
-; GFX11-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5412,9 +5411,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX9-NEXT:    v_bfrev_b32_e32 v6, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v3, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_add_u32_e32 v6, 0x80000000, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -5440,7 +5438,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX10-NEXT:    v_add_co_u32 v6, s0, 0x80000000, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 0x80000000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5467,7 +5465,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-NEXT:    v_add_co_u32 v6, null, 0x80000000, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x80000000, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -5569,9 +5567,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5597,9 +5594,9 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v3, s0, 0x80000000, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5627,15 +5624,14 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5762,12 +5758,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
@@ -5786,11 +5781,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
+; GFX9-NEXT:    v_add_u32_e32 v7, 0x80000000, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 0x80000000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
@@ -5832,18 +5827,18 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v17
-; GFX10-NEXT:    v_add_co_u32 v7, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, s4, 0x80000000, v3
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v16, v3, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc_lo
+; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v17, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s4
@@ -5882,18 +5877,17 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v17
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_add_co_u32 v7, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_add_co_u32 v4, null, 0x80000000, v3
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v17, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s0
@@ -5927,7 +5921,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
@@ -5960,7 +5954,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
@@ -6011,7 +6005,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
@@ -6050,7 +6044,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
@@ -6101,7 +6095,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
@@ -6140,7 +6134,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -6184,7 +6178,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    s_ashr_i32 s10, s17, 31
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT:    s_add_u32 s11, s10, 0x80000000
+; GFX10-NEXT:    s_add_i32 s11, s10, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX10-NEXT:    s_add_u32 s0, s4, s12
@@ -6221,7 +6215,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s10, vcc_lo
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v4
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
@@ -6261,7 +6255,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    s_and_b32 s1, 1, s1
 ; GFX11-NEXT:    s_ashr_i32 s10, s17, 31
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT:    s_add_u32 s11, s10, 0x80000000
+; GFX11-NEXT:    s_add_i32 s11, s10, 0x80000000
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX11-NEXT:    s_add_u32 s0, s4, s12
@@ -6299,7 +6293,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0a6b7af2f78d4..84906c01a4698 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -3091,253 +3091,252 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0, v1
-; GISEL-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], 0, 0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v7
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v4
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 0, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v4
-; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT:    v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6]
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8]
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v10, v4
-; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], v11, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v1, v0, v[5:6]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v11, v7
+; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], 0, v2
-; GISEL-NEXT:    v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v13, v2
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v1, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v13
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v6, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v7
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], 0, v2
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v2
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v3
+; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v8, v9, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v6, v1
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v4
+; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v5
-; GISEL-NEXT:    v_sub_i32_e32 v15, vcc, 0, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0
-; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, 0, v2, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7]
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, 1, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7]
-; GISEL-NEXT:    v_addc_u32_e32 v18, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v1
-; GISEL-NEXT:    v_mul_hi_u32 v1, v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v14, v11, 0
+; GISEL-NEXT:    v_subb_u32_e32 v15, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v13, v[5:6]
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 1, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v11, v[5:6]
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v13, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v18, v1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v14, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v1
-; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v14, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v17
-; GISEL-NEXT:    v_mov_b32_e32 v1, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2]
-; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v18, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v17, v13, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v18, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_mul_lo_u32 v7, v11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v6
-; GISEL-NEXT:    v_mul_hi_u32 v13, v10, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], 0, v12
-; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v6, v13, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v1
+; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v13, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v14, v8, 0
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v16
+; GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v11, v[1:2]
+; GISEL-NEXT:    v_addc_u32_e32 v18, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v8, v[5:6]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v16, v13, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], 0, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v6
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v6
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v5
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v12, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; GISEL-NEXT:    v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v12, v5
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v11, v5
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v8, v4
+; GISEL-NEXT:    v_addc_u32_e64 v5, s[4:5], v11, v5, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v0, v9, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1]
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v4, v0
+; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[0:1]
 ; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v9, v5
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v12, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v2
-; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v6, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v4
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v2
-; GISEL-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v7, v9, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v8
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v2
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v8, v10, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v7
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v7
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v8
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v4, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index c455b24313ddc..83ebc84e1f84a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3034,253 +3034,251 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
+; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], 0, 0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
-; GISEL-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v1
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v1
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v1
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v7
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v4
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 0, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v4
-; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT:    v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1]
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v10, v4
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v11, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v11, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], 0, v0
-; GISEL-NEXT:    v_addc_u32_e64 v4, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v4
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT:    v_subb_u32_e32 v13, vcc, v5, v3, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v10
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v6, v9, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v10, vcc, v5, v3, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v4
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, v6, v7, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v7, v1
+; GISEL-NEXT:    v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v0
-; GISEL-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v13, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v16, s[4:5], 0, v2
-; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, v4, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v15, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v18, v7
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT:    v_mov_b32_e32 v0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v16, v18, v[0:1]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v1
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v4
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v0
+; GISEL-NEXT:    v_sub_i32_e64 v15, s[4:5], 0, v2
+; GISEL-NEXT:    v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v17, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v17, v15, v[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v19, v0, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v0, v18, v5
-; GISEL-NEXT:    v_mul_lo_u32 v19, v15, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v13, v3, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v13, v15, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v18, v0, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v4
+; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v5
+; GISEL-NEXT:    v_mul_hi_u32 v19, v14, v4
+; GISEL-NEXT:    v_subb_u32_e32 v10, vcc, v10, v3, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v18, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v18, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v19, v0
-; GISEL-NEXT:    v_mul_hi_u32 v19, v15, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v17, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v17, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT:    v_mul_hi_u32 v18, v14, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v19, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT:    v_mul_hi_u32 v6, v18, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v0
-; GISEL-NEXT:    v_addc_u32_e32 v15, vcc, v18, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v13, 0
-; GISEL-NEXT:    v_sub_i32_e32 v18, vcc, v10, v1
-; GISEL-NEXT:    v_mov_b32_e32 v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v15, v[0:1]
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v10, v18, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v14, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v15, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v0
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v5
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], 0, v12
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v11, v15, v0
-; GISEL-NEXT:    v_mul_hi_u32 v5, v15, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v0
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_mul_hi_u32 v5, v17, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v18, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v0
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v17, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
+; GISEL-NEXT:    v_sub_i32_e32 v18, vcc, v11, v1
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1]
+; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v11, v18, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v17, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v0
+; GISEL-NEXT:    v_mul_hi_u32 v10, v14, v4
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], 0, v12
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v6
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
-; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v15, v0, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v9, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v10, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v17, v0
+; GISEL-NEXT:    v_mul_hi_u32 v4, v17, v4
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v7, v14, v0
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v10, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v0, v17, v0
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v5
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v14, v4
+; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v0
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v0
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v9, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v0, v7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v2, v7, v[0:1]
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v0, v6
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1]
 ; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v9, v[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v8, v5
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v10, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v10, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v4
-; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v6, v4, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v3, v2
-; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v4, v2
+; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v4
-; GISEL-NEXT:    v_subb_u32_e32 v4, vcc, v6, v4, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 61e1e67b7ae36..320dfbb4980e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4142,11 +4142,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4162,7 +4162,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4179,7 +4179,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX6-NEXT:    s_ashr_i32 s5, s7, 15
-; GFX6-NEXT:    s_add_u32 s2, s2, 0xffff8000
+; GFX6-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
@@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX8-NEXT:    s_ashr_i32 s5, s7, 15
-; GFX8-NEXT:    s_add_u32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
@@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX9-LABEL: ssubsat_i48_vs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4529,11 +4529,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4546,7 +4546,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4560,7 +4560,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX6-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
@@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX8-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX8-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
@@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -4866,21 +4866,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v0, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v2, v6
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[6:7]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, 0x80000000, v2
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4896,10 +4895,10 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[4:5]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v12
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT:    v_add_co_u32 v3, s7, 0x80000000, v4
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
@@ -4921,8 +4920,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, 0, v[6:7]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v12
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s2, s1
@@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
@@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s0
@@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
@@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
@@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
@@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s0
@@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[4:5], 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX10-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX10-NEXT:    s_subb_u32 s1, s3, s7
@@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[4:5], 0
 ; GFX11-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX11-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX11-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX11-NEXT:    s_subb_u32 s1, s3, s7
@@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5134,7 +5133,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
@@ -5183,7 +5182,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
@@ -5232,7 +5231,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
@@ -5274,7 +5273,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s11
@@ -5317,7 +5316,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX11-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
@@ -5427,9 +5426,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5456,7 +5454,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v3, s0, 0x80000000, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5484,8 +5482,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5594,9 +5591,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5625,7 +5621,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v3, s0, 0x80000000, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5652,12 +5648,12 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s4
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v2
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5805,9 +5801,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc
@@ -5831,8 +5826,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 0x80000000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    v_add_u32_e32 v7, 0x80000000, v6
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
@@ -5877,18 +5872,18 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v21
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
-; GFX10-NEXT:    v_add_co_u32 v7, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, s4, 0x80000000, v3
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v3, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc_lo
+; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v20, v6, s4
@@ -5931,18 +5926,16 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v21
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6
+; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_add_co_u32 v7, null, 0x80000000, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_add_co_u32 v4, null, 0x80000000, v3
-; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v3, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v3 :: v_dual_and_b32 v5, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v9, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v20, v6, s0
@@ -5978,7 +5971,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s19, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s17
@@ -6013,7 +6006,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
@@ -6066,7 +6059,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s19, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s17
@@ -6107,7 +6100,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
@@ -6160,7 +6153,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s19, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s17
@@ -6201,7 +6194,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -6244,7 +6237,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    s_ashr_i32 s8, s17, 31
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
-; GFX10-NEXT:    s_add_u32 s9, s8, 0x80000000
+; GFX10-NEXT:    s_add_i32 s9, s8, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
@@ -6273,7 +6266,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX10-NEXT:    s_and_b32 s5, 1, s5
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
@@ -6326,7 +6319,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    s_ashr_i32 s8, s19, 31
 ; GFX11-NEXT:    s_and_b32 s1, 1, s1
-; GFX11-NEXT:    s_add_u32 s9, s8, 0x80000000
+; GFX11-NEXT:    s_add_i32 s9, s8, 0x80000000
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
@@ -6357,7 +6350,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s6
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s18
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 887c43f5fce59..d15551365707b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -2062,13 +2062,9 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v2, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
@@ -2077,10 +2073,6 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 5c6bb6dea1646..07480a0ce0c2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -2480,13 +2480,9 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v2, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
@@ -2495,10 +2491,6 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index 192bf7c249817..93b9aeac3cd3f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -1197,3 +1197,54 @@ reallyfinally:
   store <5 x double> %val, ptr %out, align 1
   ret void
 }
+
+define amdgpu_kernel void @pr85718(i1 %Bool, ptr %Ptr, <4 x float> %Vec1, <4 x float> %Vec2) {
+; OPT-LABEL: @pr85718(
+; OPT-NEXT:  BB0:
+; OPT-NEXT:    [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true
+; OPT-NEXT:    br label [[BB1:%.*]]
+; OPT:       BB1:
+; OPT-NEXT:    [[TMP0:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE0:%.*]], [[BB2:%.*]] ], [ [[LARGEPHI_EXTRACTSLICE1:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0:%.*]] ]
+; OPT-NEXT:    [[TMP1:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE3:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE4:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ]
+; OPT-NEXT:    [[TMP2:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE6:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE7:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ]
+; OPT-NEXT:    [[TMP3:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE9:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE10:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE0]], float [[TMP1]], i64 1
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE1]], float [[TMP2]], i64 2
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE2]], float [[TMP3]], i64 3
+; OPT-NEXT:    store <4 x float> [[LARGEPHI_INSERTSLICE3]], ptr [[PTR:%.*]], align 128
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1]] = extractelement <4 x float> [[VEC2:%.*]], i64 0
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE4]] = extractelement <4 x float> [[VEC2]], i64 1
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7]] = extractelement <4 x float> [[VEC2]], i64 2
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE10]] = extractelement <4 x float> [[VEC2]], i64 3
+; OPT-NEXT:    br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]]
+; OPT:       BB2:
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0]] = extractelement <4 x float> [[I]], i64 0
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3]] = extractelement <4 x float> [[I]], i64 1
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE6]] = extractelement <4 x float> [[I]], i64 2
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9]] = extractelement <4 x float> [[I]], i64 3
+; OPT-NEXT:    br label [[BB1]]
+;
+; NOOPT-LABEL: @pr85718(
+; NOOPT-NEXT:  BB0:
+; NOOPT-NEXT:    [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true
+; NOOPT-NEXT:    br label [[BB1:%.*]]
+; NOOPT:       BB1:
+; NOOPT-NEXT:    [[PHI:%.*]] = phi <4 x float> [ [[I]], [[BB2:%.*]] ], [ [[VEC2:%.*]], [[BB1]] ], [ zeroinitializer, [[BB0:%.*]] ]
+; NOOPT-NEXT:    store <4 x float> [[PHI]], ptr [[PTR:%.*]], align 128
+; NOOPT-NEXT:    br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]]
+; NOOPT:       BB2:
+; NOOPT-NEXT:    br label [[BB1]]
+;
+BB0:
+  %I = insertelement <4 x float> %Vec1, float 4.200000e+01, i1 true
+  br label %BB1
+
+BB1:                                              ; preds = %BB0, %BB1, %BB2
+  %PHI = phi <4 x float> [ %I, %BB2 ], [ %Vec2, %BB1 ], [ zeroinitializer, %BB0 ]
+  store <4 x float> %PHI, ptr %Ptr, align 128
+  br i1 %Bool, label %BB1, label %BB2
+
+BB2:                                               ; preds = %BB1
+  br label %BB1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
index c1da29ecc2c2f..3228962ed01f7 100644
--- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
@@ -14,6 +14,8 @@
 ---
 name:            test_av_spill_cross_bb_usage
 tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
 machineFunctionInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
new file mode 100644
index 0000000000000..1eb2771618dce
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -0,0 +1,344 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s
+
+; TODO: Add global-isel when it can support bf16
+define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
+; GCN-LABEL: v_test_cvt_bf16_f32_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %cvt = fpext bfloat %v to float
+  ret float %cvt
+}
+define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
+; GCN-LABEL: v_test_cvt_bf16_f32_s:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %cvt = fpext bfloat %v to float
+  ret float %cvt
+}
+define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
+; GCN-LABEL: v_test_cvt_v2f32_v2bf16_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v2, v2, v0, s0
+; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GCN-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GCN-NEXT:    v_add3_u32 v2, v2, v1, s0
+; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GCN-NEXT:    s_mov_b32 s0, 0x7060302
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %res = fptrunc <2 x float> %src to <2 x bfloat>
+  %cast = bitcast <2 x bfloat> %res to float
+  ret float %cast
+}
+define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
+; GCN-LABEL: v_test_cvt_v2f32_v2bf16_s:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_bfe_u32 s2, s1, 0x10010
+; GCN-NEXT:    s_add_i32 s2, s2, s1
+; GCN-NEXT:    s_or_b32 s4, s1, 0x400000
+; GCN-NEXT:    s_add_i32 s5, s2, 0x7fff
+; GCN-NEXT:    v_cmp_u_f32_e64 s[2:3], s1, s1
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    s_cselect_b32 s2, s4, s5
+; GCN-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GCN-NEXT:    s_add_i32 s1, s1, s0
+; GCN-NEXT:    s_or_b32 s3, s0, 0x400000
+; GCN-NEXT:    s_add_i32 s4, s1, 0x7fff
+; GCN-NEXT:    v_cmp_u_f32_e64 s[0:1], s0, s0
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s3, s4
+; GCN-NEXT:    s_pack_hh_b32_b16 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %res = fptrunc <2 x float> %src to <2 x bfloat>
+  %cast = bitcast <2 x bfloat> %res to float
+  ret float %cast
+}
+define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
+; GCN-LABEL: v_test_cvt_f32_bf16_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %trunc = fptrunc float %src to bfloat
+  %ext = fpext bfloat %trunc to float
+  ret float %ext
+}
+define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
+; GCN-LABEL: v_test_cvt_v2f64_v2bf16_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GCN-NEXT:    v_and_b32_e32 v7, 1, v6
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v4, v6, v4
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT:    s_brev_b32 s4, 1
+; GCN-NEXT:    v_and_or_b32 v5, v1, s4, v4
+; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT:    s_movk_i32 s5, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v5, s5
+; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
+; GCN-NEXT:    v_and_b32_e32 v6, 1, v5
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v0, v5, v0
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_and_or_b32 v1, v3, s4, v0
+; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; GCN-NEXT:    v_add3_u32 v0, v0, v1, s5
+; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; GCN-NEXT:    s_mov_b32 s0, 0x7060302
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %res = fptrunc <2 x double> %src to <2 x bfloat>
+  %cast = bitcast <2 x bfloat> %res to float
+  ret float %cast
+}
+define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
+; GCN-LABEL: fptrunc_f32_f32_to_v2bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v2, v2, v0, s0
+; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GCN-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GCN-NEXT:    v_add3_u32 v2, v2, v1, s0
+; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GCN-NEXT:    s_mov_b32 s0, 0x7060302
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %a.cvt = fptrunc float %a to bfloat
+  %b.cvt = fptrunc float %b to bfloat
+  %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0
+  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
+  %ret = bitcast <2 x bfloat> %v2.2 to float
+  ret float %ret
+}
+define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
+; GCN-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
+; GCN-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v3, v3, v2, s0
+; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GCN-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
+; GCN-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GCN-NEXT:    v_add3_u32 v3, v3, v2, s0
+; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GCN-NEXT:    v_cmp_u_f32_e64 vcc, |v1|, |v1|
+; GCN-NEXT:    s_mov_b32 s0, 0x7060302
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %a.neg = fneg float %a
+  %a.cvt = fptrunc float %a.neg to bfloat
+  %b.abs = call float @llvm.fabs.f32(float %b)
+  %b.cvt = fptrunc float %b.abs to bfloat
+  %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0
+  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
+  %ret = bitcast <2 x bfloat> %v2.2 to float
+  ret float %ret
+}
+define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
+; GCN-LABEL: fptrunc_f32_to_bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
+; GCN-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GCN-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.cvt = fptrunc float %a to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
+; GCN-LABEL: fptrunc_f32_to_bf16_abs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
+; GCN-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
+; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.abs = call float @llvm.fabs.f32(float %a)
+  %a.cvt = fptrunc float %a.abs to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
+; GCN-LABEL: fptrunc_f32_to_bf16_neg:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
+; GCN-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
+; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.neg = fneg float %a
+  %a.cvt = fptrunc float %a.neg to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
+; GCN-LABEL: fptrunc_f64_to_bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GCN-NEXT:    v_and_b32_e32 v7, 1, v6
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v4, v6, v4
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT:    s_brev_b32 s0, 1
+; GCN-NEXT:    v_and_or_b32 v5, v1, s0, v4
+; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.cvt = fptrunc double %a to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
+; GCN-LABEL: fptrunc_f64_to_bf16_neg:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    s_brev_b32 s4, 1
+; GCN-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT:    v_and_or_b32 v5, v6, s4, v4
+; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.neg = fneg double %a
+  %a.cvt = fptrunc double %a.neg to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
+; GCN-LABEL: fptrunc_f64_to_bf16_abs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT:    s_brev_b32 s0, 1
+; GCN-NEXT:    v_and_or_b32 v5, v6, s0, v4
+; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.abs = call double @llvm.fabs.f64(double %a)
+  %a.cvt = fptrunc double %a.abs to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+
diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
index 2ed6d7fd0f598..6beccce9400e5 100644
--- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
@@ -1,10 +1,12 @@
 ; RUN: llc --amdgpu-disable-structurizer -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
 ; RUN: llc --amdgpu-disable-structurizer -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s
+; RUN: llc --amdgpu-disable-structurizer -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL
 
 ; CHECK-LABEL: name:            basic_call
-;       CHECK:    [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY
+;       CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
 ;        ISEL:    {{.*}} SI_CALL_ISEL {{.*}}, @foo, [[TOKEN]], csr_amdgpu, {{.*}}
 ;      DEADMI:    {{.*}} SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]]
+;       GISEL:    {{.*}} G_SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]]
 define i32 @basic_call(i32 %src) #0 {
   %t = call token @llvm.experimental.convergence.entry()
   %r = call i32 @foo(i32 %src) [ "convergencectrl"(token %t) ]
@@ -12,10 +14,11 @@ define i32 @basic_call(i32 %src) #0 {
 }
 
 ; CHECK-LABEL: name:            basic_intrinsic
-;       CHECK:    [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR
+;       CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
 ;        ISEL:    CONVERGENCECTRL_GLUE [[TOKEN]]
 ;  DEADMI-NOT:    CONVERGENCECTRL_GLUE
-;       CHECK:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]]
 define i32 @basic_intrinsic(i32 %src) #0 {
   %t = call token @llvm.experimental.convergence.anchor()
   %r = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t) ]
@@ -30,12 +33,13 @@ define i32 @uncontrolled_call(i32 %src) #0 {
 }
 
 ; CHECK-LABEL: name:            basic_branch
-;       CHECK:  bb.0.entry:
-;       CHECK:    [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR
-;       CHECK:  bb.1.then:
+;       CHECK:  bb.[[#]].entry:
+;       CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+;       CHECK:  bb.[[#]].then:
 ;        ISEL:    CONVERGENCECTRL_GLUE [[TOKEN]]
 ;  DEADMI-NOT:    CONVERGENCECTRL_GLUE
-;       CHECK:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]]
 define i32 @basic_branch(i32 %src, i1 %cond) #0 {
 entry:
   %t = call token @llvm.experimental.convergence.anchor()
@@ -52,12 +56,13 @@ else:
 }
 
 ; CHECK-LABEL: name:            basic_loop
-;       CHECK:    [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR
-;       CHECK:  bb.1.loop:
-;       CHECK:    [[LOOP:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_LOOP [[TOKEN]]
+;       CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+;       CHECK:  bb.[[#]].loop:
+;       CHECK:    [[LOOP:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_LOOP [[TOKEN]]
 ;        ISEL:    CONVERGENCECTRL_GLUE [[LOOP]]
 ;  DEADMI-NOT:    CONVERGENCECTRL_GLUE
-;       CHECK:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]]
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[LOOP]]
 define i32 @basic_loop(i32 %src, i1 %cond) #0 {
   %t1 = call token @llvm.experimental.convergence.anchor()
   br label %loop
@@ -71,6 +76,38 @@ end:
   ret i32 %r
 }
 
+; CHECK-LABEL: name: nested
+;       CHECK:    [[ENTRY:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
+;       CHECK:    [[ANCHOR:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ANCHOR]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ANCHOR]]
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ENTRY]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ENTRY]]
+define i32 @nested(i32 %src) #0 {
+  %t1 = call token @llvm.experimental.convergence.entry()
+  %t2 = call token @llvm.experimental.convergence.anchor()
+  %r2 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t2) ]
+  %r1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t1) ]
+  %sum = add i32 %r1, %r2
+  ret i32 %sum
+}
+
+; COM: FIXME: Tokens on tail-call have not been implemented for SelectionDAG
+; COM:        yet; the corresponding checks have been commented out.
+;
+; CHECK-LABEL: name:            tail_call_void_func_void
+;       GISEL:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
+; COM:  CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
+; COM:   ISEL:    {{.*}} SI_CALL_ISEL {{.*}}, @external_void_func_void, [[TOKEN]], csr_amdgpu, {{.*}}
+; COM: DEADMI:    {{.*}} SI_CALL {{.*}}, @external_void_func_void, csr_amdgpu, {{.*}}, implicit [[TOKEN]]
+;       GISEL:    {{.*}} SI_TCRETURN {{.*}}, @external_void_func_void, 0, csr_amdgpu, implicit [[TOKEN]]
+define void @tail_call_void_func_void() #0 {
+  %t1 = call token @llvm.experimental.convergence.entry()
+  tail call void @external_void_func_void() [ "convergencectrl"(token %t1) ]
+  ret void
+}
+
+declare hidden void @external_void_func_void() #0
 declare i32 @foo(i32 %x) #0
 
 declare i32 @llvm.amdgcn.readfirstlane(i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 6fa7df913812a..18d2e52e8f900 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -618,16 +618,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
 define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
 ; SI-LABEL: test_isinf_pattern_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s1, 0x7f800000
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
-; SI-NEXT:    v_cmp_eq_f32_e32 vcc, s1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    s_cmpk_eq_i32 s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isinf_pattern_f16:
@@ -667,16 +667,19 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
 define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_0_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s1, 0x1f8
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT:    s_cmpk_lg_i32 s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isfinite_pattern_0_f16:
@@ -718,16 +721,19 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
 define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_4_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s1, 0x1f8
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT:    s_cmpk_lt_i32 s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isfinite_pattern_4_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
new file mode 100644
index 0000000000000..b2311a87059c3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -0,0 +1,1510 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define i128 @fptosi_f64_to_i128(double %x) {
+; SDAG-LABEL: fptosi_f64_to_i128:
+; SDAG:       ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; SDAG-NEXT:    v_bfe_u32 v6, v5, 20, 11
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    s_mov_b64 s[4:5], 0x3fe
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB0_10
+; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0xffffff7f
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_7
+; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v9, vcc, -1, v0
+; SDAG-NEXT:    v_addc_co_u32_e64 v10, s[6:7], 0, -1, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0x432
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xfffff, v5
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, -1, 1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v5, 0x100000, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_4
+; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0x473, v6
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xfffffb8d, v6
+; SDAG-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v1, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, 0, v1, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v12, v11, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    v_mul_lo_u32 v13, v8, v2
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v11, v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; SDAG-NEXT:    v_mul_lo_u32 v6, v11, v6
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v7, v11, v[2:3]
+; SDAG-NEXT:    v_mul_lo_u32 v10, v10, v12
+; SDAG-NEXT:    v_add3_u32 v5, v5, v6, v13
+; SDAG-NEXT:    v_mov_b32_e32 v6, v2
+; SDAG-NEXT:    v_mov_b32_e32 v2, v3
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[1:2]
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v9, v12, v[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v7, v8, v[5:6]
+; SDAG-NEXT:    ; implicit-def: $vgpr11
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    v_add3_u32 v4, v10, v4, v9
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:  .LBB0_4: ; %Flow
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[12:13], s[12:13]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_6
+; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x433, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v0, v4, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v1, v5, s[6:7]
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v6, v11, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v5, v11, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v7, v4
+; SDAG-NEXT:    v_mov_b32_e32 v4, v2
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[3:4]
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v10, v6, v[3:4]
+; SDAG-NEXT:    v_mad_i32_i24 v3, v9, v5, v3
+; SDAG-NEXT:  .LBB0_6: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB0_7: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-NEXT:  ; %bb.9: ; %Flow3
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB0_10: ; %fp-to-i-cleanup
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptosi_f64_to_i128:
+; GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v5, v1
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 20, v5
+; GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x3ff
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB0_10
+; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_7
+; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 3, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 4, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 5, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 6, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 7, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 8, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 9, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 10, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 11, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 12, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 13, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 14, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v20, 15, v0
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v20
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v20
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v9, v0, v2, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x433
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff, v5
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    v_or_b32_e32 v5, 0x100000, v2
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_4
+; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
+; GISEL-NEXT:    v_mov_b32_e32 v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr6
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:  .LBB0_4: ; %Flow
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[16:17]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_6
+; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT:    v_sub_co_u32_e32 v6, vcc, 0x433, v6
+; GISEL-NEXT:    v_subrev_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, 0
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
+; GISEL-NEXT:    v_mul_lo_u32 v6, v5, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:  .LBB0_6: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:  .LBB0_7: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_9
+; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT:    v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GISEL-NEXT:  .LBB0_9: ; %Flow3
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB0_10: ; %fp-to-i-cleanup
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptosi double %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptoui_f64_to_i128(double %x) {
+; SDAG-LABEL: fptoui_f64_to_i128:
+; SDAG:       ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; SDAG-NEXT:    v_bfe_u32 v6, v5, 20, 11
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    s_mov_b64 s[4:5], 0x3fe
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB1_10
+; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0xffffff7f
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_7
+; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v9, vcc, -1, v0
+; SDAG-NEXT:    v_addc_co_u32_e64 v10, s[6:7], 0, -1, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0x432
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xfffff, v5
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, -1, 1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v5, 0x100000, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_4
+; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0x473, v6
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xfffffb8d, v6
+; SDAG-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v1, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, 0, v1, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v12, v11, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    v_mul_lo_u32 v13, v8, v2
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v11, v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; SDAG-NEXT:    v_mul_lo_u32 v6, v11, v6
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v7, v11, v[2:3]
+; SDAG-NEXT:    v_mul_lo_u32 v10, v10, v12
+; SDAG-NEXT:    v_add3_u32 v5, v5, v6, v13
+; SDAG-NEXT:    v_mov_b32_e32 v6, v2
+; SDAG-NEXT:    v_mov_b32_e32 v2, v3
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[1:2]
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v9, v12, v[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v7, v8, v[5:6]
+; SDAG-NEXT:    ; implicit-def: $vgpr11
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    v_add3_u32 v4, v10, v4, v9
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:  .LBB1_4: ; %Flow
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[12:13], s[12:13]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_6
+; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x433, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v0, v4, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v1, v5, s[6:7]
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v6, v11, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v5, v11, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v7, v4
+; SDAG-NEXT:    v_mov_b32_e32 v4, v2
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[3:4]
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v10, v6, v[3:4]
+; SDAG-NEXT:    v_mad_i32_i24 v3, v9, v5, v3
+; SDAG-NEXT:  .LBB1_6: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB1_7: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-NEXT:  ; %bb.9: ; %Flow3
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB1_10: ; %fp-to-i-cleanup
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptoui_f64_to_i128:
+; GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v5, v1
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 20, v5
+; GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x3ff
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB1_10
+; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_7
+; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 3, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 4, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 5, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 6, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 7, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 8, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 9, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 10, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 11, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 12, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 13, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 14, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v20, 15, v0
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v20
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v20
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v9, v0, v2, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x433
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff, v5
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    v_or_b32_e32 v5, 0x100000, v2
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_4
+; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
+; GISEL-NEXT:    v_mov_b32_e32 v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr6
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:  .LBB1_4: ; %Flow
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[16:17]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_6
+; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT:    v_sub_co_u32_e32 v6, vcc, 0x433, v6
+; GISEL-NEXT:    v_subrev_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, 0
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
+; GISEL-NEXT:    v_mul_lo_u32 v6, v5, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:  .LBB1_6: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:  .LBB1_7: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_9
+; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT:    v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GISEL-NEXT:  .LBB1_9: ; %Flow3
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB1_10: ; %fp-to-i-cleanup
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptoui double %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptosi_f32_to_i128(float %x) {
+; SDAG-LABEL: fptosi_f32_to_i128:
+; SDAG:       ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_bfe_u32 v5, v4, 23, 8
+; SDAG-NEXT:    s_movk_i32 s4, 0x7e
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB2_10
+; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0xffffff7f
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_7
+; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v9, vcc, -1, v0
+; SDAG-NEXT:    v_addc_co_u32_e64 v11, s[6:7], 0, -1, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0x95
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v6, 0x800000, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_4
+; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v5
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v5
+; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff6a, v5
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v13, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v13, v10, 0
+; SDAG-NEXT:    v_mul_lo_u32 v14, v8, v2
+; SDAG-NEXT:    v_mul_lo_u32 v15, v10, v3
+; SDAG-NEXT:    v_mov_b32_e32 v6, v1
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v12, v10, v[6:7]
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v6, v5
+; SDAG-NEXT:    v_mov_b32_e32 v5, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v13, v8, v[4:5]
+; SDAG-NEXT:    v_add3_u32 v3, v3, v15, v14
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v9, v13, v[2:3]
+; SDAG-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mul_lo_u32 v3, v9, v12
+; SDAG-NEXT:    v_mul_lo_u32 v7, v11, v13
+; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[5:6]
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v1
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:  .LBB2_4: ; %Flow
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_6
+; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v5
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; SDAG-NEXT:    v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; SDAG-NEXT:  .LBB2_6: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB2_7: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-NEXT:  ; %bb.9: ; %Flow3
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB2_10: ; %fp-to-i-cleanup
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptosi_f32_to_i128:
+; GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 23, v[4:5]
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB2_10
+; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_7
+; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v5, 3, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 4, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 5, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 6, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 7, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 8, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 9, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 10, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 11, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 12, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 13, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 14, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 15, v0
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v5
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v9, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v8, v0, v2, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x96
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    v_or_b32_e32 v4, 0x800000, v2
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_4
+; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0
+; GISEL-NEXT:    v_mov_b32_e32 v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr6
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:  .LBB2_4: ; %Flow
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[16:17]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT:    v_sub_co_u32_e32 v3, vcc, 0x96, v6
+; GISEL-NEXT:    v_subrev_u32_e32 v2, 64, v3
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v3, v[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[1:2], v2, 0
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT:  .LBB2_6: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB2_7: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_9
+; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT:    v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GISEL-NEXT:  .LBB2_9: ; %Flow3
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB2_10: ; %fp-to-i-cleanup
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptosi float %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptoui_f32_to_i128(float %x) {
+; SDAG-LABEL: fptoui_f32_to_i128:
+; SDAG:       ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_bfe_u32 v5, v4, 23, 8
+; SDAG-NEXT:    s_movk_i32 s4, 0x7e
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB3_10
+; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0xffffff7f
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_7
+; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v9, vcc, -1, v0
+; SDAG-NEXT:    v_addc_co_u32_e64 v11, s[6:7], 0, -1, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0x95
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v6, 0x800000, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_4
+; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v5
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v5
+; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff6a, v5
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v13, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v13, v10, 0
+; SDAG-NEXT:    v_mul_lo_u32 v14, v8, v2
+; SDAG-NEXT:    v_mul_lo_u32 v15, v10, v3
+; SDAG-NEXT:    v_mov_b32_e32 v6, v1
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v12, v10, v[6:7]
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v6, v5
+; SDAG-NEXT:    v_mov_b32_e32 v5, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v13, v8, v[4:5]
+; SDAG-NEXT:    v_add3_u32 v3, v3, v15, v14
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v9, v13, v[2:3]
+; SDAG-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mul_lo_u32 v3, v9, v12
+; SDAG-NEXT:    v_mul_lo_u32 v7, v11, v13
+; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[5:6]
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v1
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:  .LBB3_4: ; %Flow
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_6
+; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v5
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; SDAG-NEXT:    v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; SDAG-NEXT:  .LBB3_6: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB3_7: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-NEXT:  ; %bb.9: ; %Flow3
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB3_10: ; %fp-to-i-cleanup
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptoui_f32_to_i128:
+; GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 23, v[4:5]
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB3_10
+; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_7
+; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v5, 3, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 4, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 5, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 6, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 7, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 8, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 9, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 10, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 11, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 12, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 13, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 14, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 15, v0
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v5
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v9, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v8, v0, v2, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x96
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    v_or_b32_e32 v4, 0x800000, v2
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0
+; GISEL-NEXT:    v_mov_b32_e32 v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr6
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:  .LBB3_4: ; %Flow
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[16:17]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT:    v_sub_co_u32_e32 v3, vcc, 0x96, v6
+; GISEL-NEXT:    v_subrev_u32_e32 v2, 64, v3
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v3, v[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[1:2], v2, 0
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT:  .LBB3_6: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB3_7: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_9
+; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT:    v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GISEL-NEXT:  .LBB3_9: ; %Flow3
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB3_10: ; %fp-to-i-cleanup
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptoui float %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptosi_f16_to_i128(half %x) {
+; GCN-LABEL: fptosi_f16_to_i128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptosi half %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptoui_f16_to_i128(half %x) {
+; GCN-LABEL: fptoui_f16_to_i128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptoui half %x to i128
+  ret i128 %cvt
+}
+
+; FIXME: ExpandLargeFpConvert asserts on bfloat
+; define i128 @fptosi_bf16_to_i128(bfloat %x) {
+;   %cvt = fptosi bfloat %x to i128
+;   ret i128 %cvt
+; }
+
+; define i128 @fptoui_bf16_to_i128(bfloat %x) {
+;   %cvt = fptoui bfloat %x to i128
+;   ret i128 %cvt
+; }
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 3a0b8259d0849..e361aa4db2aa9 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1705,16 +1705,16 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX6-NEXT:    v_min_f32_e32 v7, 0x3f7fffff, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v1, vcc
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX6-NEXT:    s_movk_i32 s10, 0x204
+; GFX6-NEXT:    v_mov_b32_e32 v8, 0x204
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
-; GFX6-NEXT:    v_cmp_class_f32_e64 s[8:9], v0, s10
+; GFX6-NEXT:    v_cmp_class_f32_e32 vcc, v0, v8
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v7, 0, s[8:9]
-; GFX6-NEXT:    v_cmp_class_f32_e64 s[8:9], v1, s10
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v7, 0, vcc
+; GFX6-NEXT:    v_cmp_class_f32_e32 vcc, v1, v8
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v6, 0, s[8:9]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v6, 0, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1722,19 +1722,19 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX7-LABEL: safe_math_fract_v2f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0x204
 ; GFX7-NEXT:    v_fract_f32_e32 v6, v0
-; GFX7-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s8
+; GFX7-NEXT:    v_cmp_class_f32_e32 vcc, v0, v8
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_floor_f32_e32 v4, v0
 ; GFX7-NEXT:    v_fract_f32_e32 v7, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX7-NEXT:    v_cmp_neq_f32_e64 vcc, |v1|, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e32 vcc, v1, v8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    v_floor_f32_e32 v5, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v7, 0, vcc
 ; GFX7-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -1742,15 +1742,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX8-LABEL: safe_math_fract_v2f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0x204
 ; GFX8-NEXT:    v_fract_f32_e32 v6, v0
-; GFX8-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s4
+; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v0, v8
 ; GFX8-NEXT:    v_floor_f32_e32 v4, v0
 ; GFX8-NEXT:    v_fract_f32_e32 v7, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX8-NEXT:    v_cmp_neq_f32_e64 vcc, |v1|, s4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v1, v8
 ; GFX8-NEXT:    v_floor_f32_e32 v5, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v7, 0, vcc
 ; GFX8-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1759,14 +1759,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v6, v0
-; GFX11-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX11-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x204
 ; GFX11-NEXT:    v_fract_f32_e32 v7, v1
 ; GFX11-NEXT:    v_floor_f32_e32 v4, v0
 ; GFX11-NEXT:    v_floor_f32_e32 v5, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX11-NEXT:    v_cmp_class_f32_e64 s0, v1, 0x204
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v7, 0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
@@ -1937,21 +1938,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX6-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    v_floor_f32_e32 v3, v0
-; GFX6-NEXT:    v_sub_f32_e32 v4, v0, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_min_f32_e32 v4, 0x3f7fe000, v4
-; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX6-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s8
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; GFX6-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GFX6-NEXT:    v_floor_f32_e32 v4, v3
+; GFX6-NEXT:    v_sub_f32_e32 v5, v3, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT:    v_min_f32_e32 v5, 0x3f7fe000, v5
+; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-NEXT:    buffer_store_short v4, v[1:2], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1959,21 +1961,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX7-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    v_floor_f32_e32 v3, v0
-; GFX7-NEXT:    v_sub_f32_e32 v4, v0, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_min_f32_e32 v4, 0x3f7fe000, v4
-; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX7-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s8
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; GFX7-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    v_floor_f32_e32 v4, v3
+; GFX7-NEXT:    v_sub_f32_e32 v5, v3, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_min_f32_e32 v5, 0x3f7fe000, v5
+; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX7-NEXT:    buffer_store_short v4, v[1:2], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2062,12 +2065,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX6-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX6-NEXT:    v_floor_f32_e32 v6, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v6
 ; GFX6-NEXT:    v_floor_f32_e32 v8, v5
@@ -2080,10 +2083,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX6-NEXT:    v_cmp_neq_f32_e32 vcc, s8, v0
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
-; GFX6-NEXT:    v_cmp_neq_f32_e32 vcc, s8, v1
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
@@ -2098,12 +2101,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX7-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX7-NEXT:    v_floor_f32_e32 v6, v4
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v6
 ; GFX7-NEXT:    v_floor_f32_e32 v8, v5
@@ -2116,10 +2119,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX7-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, s8, v0
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
-; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, s8, v1
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
@@ -2133,16 +2136,16 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    s_movk_i32 s6, 0x204
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x204
 ; GFX8-NEXT:    v_floor_f16_e32 v4, v3
 ; GFX8-NEXT:    v_floor_f16_e32 v5, v0
 ; GFX8-NEXT:    v_fract_f16_e32 v6, v3
-; GFX8-NEXT:    v_cmp_class_f16_e64 s[4:5], v3, s6
+; GFX8-NEXT:    v_cmp_class_f16_e32 vcc, v3, v7
 ; GFX8-NEXT:    v_pack_b32_f16 v4, v5, v4
 ; GFX8-NEXT:    v_fract_f16_e32 v5, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, 0, s[4:5]
-; GFX8-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, s6
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, 0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, 0, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e32 vcc, v0, v7
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
 ; GFX8-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; GFX8-NEXT:    global_store_dword v[1:2], v4, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -2237,19 +2240,19 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
 ; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GFX6-NEXT:    s_movk_i32 s10, 0x204
-; GFX6-NEXT:    v_cmp_class_f64_e64 s[8:9], v[0:1], s10
+; GFX6-NEXT:    v_mov_b32_e32 v14, 0x204
 ; GFX6-NEXT:    v_cndmask_b32_e32 v13, v13, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v12, 0, s[8:9]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v13, 0, s[8:9]
-; GFX6-NEXT:    v_cmp_class_f64_e64 s[8:9], v[2:3], s10
+; GFX6-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v14
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v13, 0, vcc
+; GFX6-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v14
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v10, 0, s[8:9]
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v11, 0, s[8:9]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v11, 0, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2257,39 +2260,39 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX7-LABEL: safe_math_fract_v2f64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0x204
+; GFX7-NEXT:    v_mov_b32_e32 v6, 0x204
 ; GFX7-NEXT:    v_fract_f64_e32 v[10:11], v[0:1]
-; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[0:1], s4
+; GFX7-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v6
 ; GFX7-NEXT:    v_fract_f64_e32 v[12:13], v[2:3]
-; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[2:3], s4
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v6
 ; GFX7-NEXT:    v_floor_f64_e32 v[8:9], v[2:3]
 ; GFX7-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v10, 0, s[8:9]
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v11, 0, s[8:9]
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[10:11]
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[10:11]
-; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s8, s10
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v11, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[4:5]
+; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_v2f64:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s6, 0x204
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x204
 ; GFX8-NEXT:    v_fract_f64_e32 v[10:11], v[0:1]
-; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], s6
+; GFX8-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v6
 ; GFX8-NEXT:    v_fract_f64_e32 v[12:13], v[2:3]
-; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[2:3], s6
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v6
 ; GFX8-NEXT:    v_floor_f64_e32 v[8:9], v[2:3]
 ; GFX8-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v10, 0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v11, 0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[6:7]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v11, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[4:5]
 ; GFX8-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index e3fada3459a07..538ef42121b83 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -1,71 +1,43 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
-; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-DPP %s
+
+; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations.
+; Optimization remains same for Iterative and DPP strategies when value in uniform. These different scan/reduction
+; strategies are valid for only divergent values. This optimization is valid for divergent addresses. Test also covers different scopes.
 
 define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
-; IR-ITERATIVE:       2:
-; IR-ITERATIVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-ITERATIVE-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-ITERATIVE-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-ITERATIVE:       14:
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    br label [[TMP16]]
-; IR-ITERATIVE:       16:
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
-; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
-; IR-ITERATIVE-NEXT:    br label [[TMP24]]
-; IR-ITERATIVE:       24:
-; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-ITERATIVE-NEXT:    ret float [[TMP25]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
-; IR-DPP:       2:
-; IR-DPP-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-DPP-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-DPP-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-DPP-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-DPP-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-DPP:       14:
-; IR-DPP-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    br label [[TMP16]]
-; IR-DPP:       16:
-; IR-DPP-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-DPP-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
-; IR-DPP-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-DPP-NEXT:    [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-DPP-NEXT:    [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
-; IR-DPP-NEXT:    [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
-; IR-DPP-NEXT:    br label [[TMP24]]
-; IR-DPP:       24:
-; IR-DPP-NEXT:    [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-DPP-NEXT:    ret float [[TMP25]]
+; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR:       2:
+; IR-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
+; IR-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
+; IR-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
+; IR-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR:       14:
+; IR-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    br label [[TMP16]]
+; IR:       16:
+; IR-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
+; IR-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
+; IR-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
+; IR-NEXT:    [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT:    [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
+; IR-NEXT:    [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
+; IR-NEXT:    br label [[TMP24]]
+; IR:       24:
+; IR-NEXT:    [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
+; IR-NEXT:    ret float [[TMP25]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret float %result
@@ -411,7 +383,6 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
   ret float %result
 }
 
-
 define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
 ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
 ; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
@@ -514,61 +485,33 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
 }
 
 define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
-; IR-ITERATIVE:       2:
-; IR-ITERATIVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-ITERATIVE:       10:
-; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    br label [[TMP12]]
-; IR-ITERATIVE:       12:
-; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
-; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
-; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
-; IR-ITERATIVE-NEXT:    br label [[TMP20]]
-; IR-ITERATIVE:       20:
-; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT:    ret float [[TMP21]]
-;
-; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
-; IR-DPP:       2:
-; IR-DPP-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-DPP:       10:
-; IR-DPP-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    br label [[TMP12]]
-; IR-DPP:       12:
-; IR-DPP-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-DPP-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-DPP-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
-; IR-DPP-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-DPP-NEXT:    [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-DPP-NEXT:    [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
-; IR-DPP-NEXT:    [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
-; IR-DPP-NEXT:    br label [[TMP20]]
-; IR-DPP:       20:
-; IR-DPP-NEXT:    [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
-; IR-DPP-NEXT:    ret float [[TMP21]]
+; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; IR:       2:
+; IR-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR:       10:
+; IR-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    br label [[TMP12]]
+; IR:       12:
+; IR-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
+; IR-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
+; IR-NEXT:    [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT:    [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
+; IR-NEXT:    [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
+; IR-NEXT:    br label [[TMP20]]
+; IR:       20:
+; IR-NEXT:    [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
+; IR-NEXT:    ret float [[TMP21]]
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
@@ -1007,159 +950,109 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
   ret float %result
 }
 
-
 define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
-
 define amdgpu_ps float @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
   ret float %result
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index f87932b9361ed..cc7a45cbb6e37 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -1,55 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
-; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-DPP %s
+
+; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations.
+; Optimization remains same for Iterative and DPP strategies when value in uniform. These different scan/reduction
+; strategies are valid for only divergent values. This optimization is valid for divergent addresses. Test also covers different scopes.
 
 define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
-; IR-ITERATIVE:       2:
-; IR-ITERATIVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-ITERATIVE-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-ITERATIVE-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-ITERATIVE:       14:
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    br label [[TMP16]]
-; IR-ITERATIVE:       16:
-; IR-ITERATIVE-NEXT:    br label [[TMP17]]
-; IR-ITERATIVE:       17:
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
-; IR-DPP:       2:
-; IR-DPP-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-DPP-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-DPP-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-DPP-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-DPP-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-DPP:       14:
-; IR-DPP-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    br label [[TMP16]]
-; IR-DPP:       16:
-; IR-DPP-NEXT:    br label [[TMP17]]
-; IR-DPP:       17:
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR:       2:
+; IR-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
+; IR-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
+; IR-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
+; IR-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR:       14:
+; IR-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    br label [[TMP16]]
+; IR:       16:
+; IR-NEXT:    br label [[TMP17]]
+; IR:       17:
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret void
@@ -325,7 +305,6 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_uni_value_agent_scope_stri
   ret void
 }
 
-
 define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
 ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
 ; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
@@ -409,45 +388,25 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
 }
 
 define amdgpu_ps void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
-; IR-ITERATIVE:       2:
-; IR-ITERATIVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-ITERATIVE:       10:
-; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    br label [[TMP12]]
-; IR-ITERATIVE:       12:
-; IR-ITERATIVE-NEXT:    br label [[TMP13]]
-; IR-ITERATIVE:       13:
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
-; IR-DPP:       2:
-; IR-DPP-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-DPP:       10:
-; IR-DPP-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    br label [[TMP12]]
-; IR-DPP:       12:
-; IR-DPP-NEXT:    br label [[TMP13]]
-; IR-DPP:       13:
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR:       2:
+; IR-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR:       10:
+; IR-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    br label [[TMP12]]
+; IR:       12:
+; IR-NEXT:    br label [[TMP13]]
+; IR:       13:
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
@@ -797,159 +756,109 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
   ret void
 }
 
-
 define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
-
 define amdgpu_ps void @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
new file mode 100644
index 0000000000000..bfeb214c5af8f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -0,0 +1,1618 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define float @sitofp_i128_to_f32(i128 %x) {
+; SDAG-LABEL: sitofp_i128_to_f32:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB0_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
+; SDAG-NEXT:    v_xor_b32_e32 v0, v5, v0
+; SDAG-NEXT:    v_xor_b32_e32 v1, v5, v1
+; SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
+; SDAG-NEXT:    v_xor_b32_e32 v2, v5, v2
+; SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v6, v5, v3
+; SDAG-NEXT:    v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v2, v4
+; SDAG-NEXT:    v_add_u32_e32 v2, 32, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v5
+; SDAG-NEXT:    v_min_u32_e32 v2, v2, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v0
+; SDAG-NEXT:    v_add_u32_e32 v6, 32, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v7, v1
+; SDAG-NEXT:    v_min_u32_e32 v6, v6, v7
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_add_u32_e32 v6, 64, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v6, 0x80, v7
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x7f, v7
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v6
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff98, v7
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr6
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 25, v6
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v6
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB0_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v12, 0x66, v7
+; SDAG-NEXT:    v_sub_u32_e32 v10, 64, v12
+; SDAG-NEXT:    v_lshrrev_b64 v[8:9], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[10:11], v10, v[4:5]
+; SDAG-NEXT:    v_sub_u32_e32 v13, 38, v7
+; SDAG-NEXT:    v_or_b32_e32 v11, v9, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT:    v_lshrrev_b64 v[8:9], v13, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; SDAG-NEXT:    v_add_u32_e32 v14, 26, v7
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[10:11], v13, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[12:13], v14, v[4:5]
+; SDAG-NEXT:    v_subrev_u32_e32 v7, 38, v7
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, v8, v0, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[7:8], v7, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v11, v13, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v14, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v5
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v8, v15, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v1, v9
+; SDAG-NEXT:  .LBB0_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB0_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v4, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_and_b32_e32 v4, 0x4000000, v0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; SDAG-NEXT:    v_alignbit_b32 v8, v1, v0, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_alignbit_b32 v8, v1, v0, 3
+; SDAG-NEXT:    v_mov_b32_e32 v2, v6
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB0_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x80000000, v3
+; SDAG-NEXT:    v_lshl_add_u32 v1, v2, 23, 1.0
+; SDAG-NEXT:    v_and_b32_e32 v2, 0x7fffff, v8
+; SDAG-NEXT:    v_or3_b32 v4, v2, v0, v1
+; SDAG-NEXT:  .LBB0_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sitofp_i128_to_f32:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB0_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v0, v6, v0
+; GISEL-NEXT:    v_xor_b32_e32 v1, v6, v1
+; GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_xor_b32_e32 v2, v6, v2
+; GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v3, v6, v3
+; GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v7, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v7, 32, v7
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v8, 0x80, v5
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x7f, v5
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 24, v8
+; GISEL-NEXT:    ; implicit-def: $vgpr4
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr2
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 26, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB0_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v4
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v4, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v4
+; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v12, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v13, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v9, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v10, v1, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v5, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v14, 64, v5
+; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v16, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v14, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v11, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v12, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v9, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v10, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v11, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v13, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB0_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB0_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v7, v8
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB0_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x80000000, v6
+; GISEL-NEXT:    v_lshl_add_u32 v1, v7, 23, 1.0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT:    v_or3_b32 v4, v2, v0, v1
+; GISEL-NEXT:  .LBB0_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = sitofp i128 %x to float
+  ret float %cvt
+}
+
+define float @uitofp_i128_to_f32(i128 %x) {
+; SDAG-LABEL: uitofp_i128_to_f32:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB1_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ffbh_u32_e32 v4, v2
+; SDAG-NEXT:    v_add_u32_e32 v4, 32, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v3
+; SDAG-NEXT:    v_min_u32_e32 v4, v4, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v0
+; SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v1
+; SDAG-NEXT:    v_min_u32_e32 v5, v5, v6
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_add_u32_e32 v5, 64, v5
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v5, 0x80, v6
+; SDAG-NEXT:    v_sub_u32_e32 v4, 0x7f, v6
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v5
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff98, v6
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr6
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 25, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB1_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v11, 0x66, v6
+; SDAG-NEXT:    v_sub_u32_e32 v9, 64, v11
+; SDAG-NEXT:    v_lshrrev_b64 v[7:8], v11, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[9:10], v9, v[2:3]
+; SDAG-NEXT:    v_sub_u32_e32 v12, 38, v6
+; SDAG-NEXT:    v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v7, v9
+; SDAG-NEXT:    v_lshrrev_b64 v[7:8], v12, v[2:3]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
+; SDAG-NEXT:    v_add_u32_e32 v13, 26, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[9:10], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[11:12], v13, v[2:3]
+; SDAG-NEXT:    v_subrev_u32_e32 v6, 38, v6
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v7, v0, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[6:7], v6, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v11, v9
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v13, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v7, v14, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v7
+; SDAG-NEXT:    v_mov_b32_e32 v1, v8
+; SDAG-NEXT:  .LBB1_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB1_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v2, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SDAG-NEXT:    v_alignbit_b32 v7, v1, v0, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_alignbit_b32 v7, v1, v0, 3
+; SDAG-NEXT:    v_mov_b32_e32 v4, v5
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB1_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v7
+; SDAG-NEXT:    v_lshl_or_b32 v0, v4, 23, v0
+; SDAG-NEXT:    v_add_u32_e32 v4, 1.0, v0
+; SDAG-NEXT:  .LBB1_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: uitofp_i128_to_f32:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB1_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v6, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v6, 32, v6
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x80, v5
+; GISEL-NEXT:    v_sub_u32_e32 v6, 0x7f, v5
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 24, v7
+; GISEL-NEXT:    ; implicit-def: $vgpr4
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr7
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr2
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 26, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB1_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v4
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v4, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v12, 64, v4
+; GISEL-NEXT:    v_or_b32_e32 v10, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v12, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v1, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v5, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v5
+; GISEL-NEXT:    v_or_b32_e32 v14, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[10:11], v13, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v10, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v11, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v8, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v9, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v10, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v12, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB1_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB1_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v7
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB1_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_add_u32 v0, v6, 23, 1.0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fffff
+; GISEL-NEXT:    v_and_or_b32 v4, v4, v1, v0
+; GISEL-NEXT:  .LBB1_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = uitofp i128 %x to float
+  ret float %cvt
+}
+
+define double @sitofp_i128_to_f64(i128 %x) {
+; SDAG-LABEL: sitofp_i128_to_f64:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_or_b32_e32 v1, v5, v3
+; SDAG-NEXT:    v_or_b32_e32 v0, v4, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB2_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; SDAG-NEXT:    v_xor_b32_e32 v4, v0, v4
+; SDAG-NEXT:    v_xor_b32_e32 v5, v0, v5
+; SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v0
+; SDAG-NEXT:    v_xor_b32_e32 v2, v0, v2
+; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v0, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v1, v0, v3
+; SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, v2, v0, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, v1, v0, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v0, v6
+; SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v7
+; SDAG-NEXT:    v_min_u32_e32 v0, v0, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v4
+; SDAG-NEXT:    v_add_u32_e32 v1, 32, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v2, v5
+; SDAG-NEXT:    v_min_u32_e32 v1, v1, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT:    v_add_u32_e32 v1, 64, v1
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v1, v0, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v8, 0x80, v9
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x7f, v9
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 54, v8
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v6, 0xffffffb5, v9
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 54, v8
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 55, v8
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB2_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v12, 0x49, v9
+; SDAG-NEXT:    v_sub_u32_e32 v10, 64, v12
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v12, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
+; SDAG-NEXT:    v_sub_u32_e32 v13, 9, v9
+; SDAG-NEXT:    v_or_b32_e32 v11, v1, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v0, v10
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v13, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; SDAG-NEXT:    v_add_u32_e32 v16, 55, v9
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[10:11], v12, v[6:7]
+; SDAG-NEXT:    v_lshrrev_b64 v[12:13], v13, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[14:15], v16, v[6:7]
+; SDAG-NEXT:    v_add_u32_e32 v9, -9, v9
+; SDAG-NEXT:    v_or_b32_e32 v15, v15, v13
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v12
+; SDAG-NEXT:    v_lshlrev_b64 v[12:13], v9, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v11, 0, v11, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v13, v15, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; SDAG-NEXT:    v_lshlrev_b64 v[4:5], v16, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v12, v14, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_or_b32_e32 v5, v5, v7
+; SDAG-NEXT:    v_or_b32_e32 v4, v4, v6
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v6, v10
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_mov_b32_e32 v7, v11
+; SDAG-NEXT:  .LBB2_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB2_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v0, 31, v5
+; SDAG-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v6, v6, v0
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v0, 2, v4
+; SDAG-NEXT:    v_and_or_b32 v0, v0, 1, v4
+; SDAG-NEXT:    v_add_co_u32_e32 v4, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], 2, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v7, 30, v6
+; SDAG-NEXT:    v_or_b32_e32 v10, v1, v7
+; SDAG-NEXT:    v_and_b32_e32 v1, 0x800000, v5
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], 3, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 29, v6
+; SDAG-NEXT:    v_or_b32_e32 v10, v1, v2
+; SDAG-NEXT:    v_mov_b32_e32 v2, v8
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB2_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v1, 0x80000000, v3
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
+; SDAG-NEXT:    v_lshl_add_u32 v2, v2, 20, v3
+; SDAG-NEXT:    v_and_b32_e32 v3, 0xfffff, v10
+; SDAG-NEXT:    v_or3_b32 v1, v3, v1, v2
+; SDAG-NEXT:  .LBB2_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sitofp_i128_to_f64:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v1
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_or_b32_e32 v0, v4, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v5, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB2_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v0, v6, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v6, v5
+; GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_xor_b32_e32 v2, v6, v2
+; GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v3, v6, v3
+; GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v7, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v7, 32, v7
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v8, 0x80, v9
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x7f, v9
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 53, v8
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffffb5, v9
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 55, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 55, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB2_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v14, 0x49, v9
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v14
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v14, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GISEL-NEXT:    v_or_b32_e32 v10, v4, v10
+; GISEL-NEXT:    v_or_b32_e32 v11, v5, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v15, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v14, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GISEL-NEXT:    v_add_u32_e32 v14, 55, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v4, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v12, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v14, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GISEL-NEXT:    v_or_b32_e32 v16, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v17, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v15, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, v12, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v9, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v10, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v11, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v12, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v13, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB2_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB2_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[9:10], 1, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
+; GISEL-NEXT:    v_or_b32_e32 v11, v2, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v9
+; GISEL-NEXT:    v_mov_b32_e32 v1, v10
+; GISEL-NEXT:    v_mov_b32_e32 v2, v11
+; GISEL-NEXT:    v_mov_b32_e32 v3, v12
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v3, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0
+; GISEL-NEXT:    v_and_b32_e32 v10, 0x800000, v1
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
+; GISEL-NEXT:    v_lshl_or_b32 v10, v2, 30, v5
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v7, v8
+; GISEL-NEXT:    v_lshl_or_b32 v10, v2, 29, v5
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB2_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x80000000, v6
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xfffff
+; GISEL-NEXT:    v_lshl_add_u32 v1, v7, 20, v1
+; GISEL-NEXT:    v_and_or_b32 v2, v10, v2, v0
+; GISEL-NEXT:    v_and_or_b32 v0, v4, -1, 0
+; GISEL-NEXT:    v_or3_b32 v1, v2, v1, 0
+; GISEL-NEXT:  .LBB2_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = sitofp i128 %x to double
+  ret double %cvt
+}
+
+define double @uitofp_i128_to_f64(i128 %x) {
+; SDAG-LABEL: uitofp_i128_to_f64:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB3_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ffbh_u32_e32 v4, v2
+; SDAG-NEXT:    v_add_u32_e32 v4, 32, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v3
+; SDAG-NEXT:    v_min_u32_e32 v4, v4, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v0
+; SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v1
+; SDAG-NEXT:    v_min_u32_e32 v5, v5, v6
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_add_u32_e32 v5, 64, v5
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v5, v4, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v7, 0x80, v8
+; SDAG-NEXT:    v_sub_u32_e32 v6, 0x7f, v8
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 54, v7
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffffb5, v8
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 54, v7
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 55, v7
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB3_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v11, 0x49, v8
+; SDAG-NEXT:    v_sub_u32_e32 v9, 64, v11
+; SDAG-NEXT:    v_lshrrev_b64 v[4:5], v11, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[9:10], v9, v[2:3]
+; SDAG-NEXT:    v_sub_u32_e32 v12, 9, v8
+; SDAG-NEXT:    v_or_b32_e32 v10, v5, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v4, v9
+; SDAG-NEXT:    v_lshrrev_b64 v[4:5], v12, v[2:3]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
+; SDAG-NEXT:    v_add_u32_e32 v15, 55, v8
+; SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[9:10], v11, v[2:3]
+; SDAG-NEXT:    v_lshrrev_b64 v[11:12], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[13:14], v15, v[2:3]
+; SDAG-NEXT:    v_add_u32_e32 v8, -9, v8
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v12
+; SDAG-NEXT:    v_or_b32_e32 v13, v13, v11
+; SDAG-NEXT:    v_lshlrev_b64 v[11:12], v8, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v12, v14, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v15, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v11, v13, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_mov_b32_e32 v2, v9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v4, v4, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; SDAG-NEXT:    v_mov_b32_e32 v3, v10
+; SDAG-NEXT:  .LBB3_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB3_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 31, v1
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v3, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; SDAG-NEXT:    v_and_b32_e32 v3, 0x800000, v1
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; SDAG-NEXT:    v_alignbit_b32 v9, v2, v1, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; SDAG-NEXT:    v_alignbit_b32 v9, v2, v1, 3
+; SDAG-NEXT:    v_mov_b32_e32 v6, v7
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB3_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xfffff, v9
+; SDAG-NEXT:    v_lshl_or_b32 v0, v6, 20, v0
+; SDAG-NEXT:    v_add_u32_e32 v5, 0x3ff00000, v0
+; SDAG-NEXT:  .LBB3_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: uitofp_i128_to_f64:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB3_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v6, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v6, 32, v6
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x80, v8
+; GISEL-NEXT:    v_sub_u32_e32 v6, 0x7f, v8
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 53, v7
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffffb5, v8
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v1, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr7
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 55, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 55, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB3_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v13, 0x49, v8
+; GISEL-NEXT:    v_sub_u32_e32 v9, 64, v13
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v13, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[9:10], v9, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v14, 64, v13
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v13, v[2:3]
+; GISEL-NEXT:    v_or_b32_e32 v9, v4, v9
+; GISEL-NEXT:    v_or_b32_e32 v10, v5, v10
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v14, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
+; GISEL-NEXT:    v_add_u32_e32 v8, 55, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v12, 64, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, v4, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v5, v1, s[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v8, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[12:13], v12, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v8
+; GISEL-NEXT:    v_or_b32_e32 v16, v4, v12
+; GISEL-NEXT:    v_or_b32_e32 v17, v5, v13
+; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v15, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v12, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, v13, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v4, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v5, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v8, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v12, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v8, v14, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GISEL-NEXT:    v_mov_b32_e32 v1, v9
+; GISEL-NEXT:    v_mov_b32_e32 v2, v10
+; GISEL-NEXT:    v_mov_b32_e32 v3, v11
+; GISEL-NEXT:  .LBB3_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB3_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], 1, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
+; GISEL-NEXT:    v_or_b32_e32 v10, v10, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GISEL-NEXT:    v_mov_b32_e32 v1, v9
+; GISEL-NEXT:    v_mov_b32_e32 v2, v10
+; GISEL-NEXT:    v_mov_b32_e32 v3, v11
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v4, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GISEL-NEXT:    v_and_b32_e32 v9, 0x800000, v1
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_lshlrev_b64 v[8:9], 30, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 2, v1
+; GISEL-NEXT:    v_or_b32_e32 v9, v5, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshlrev_b64 v[2:3], 29, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v9, v0, v2
+; GISEL-NEXT:    v_mov_b32_e32 v6, v7
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB3_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
+; GISEL-NEXT:    v_lshl_add_u32 v0, v6, 20, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff, v9
+; GISEL-NEXT:    v_and_or_b32 v4, v4, -1, 0
+; GISEL-NEXT:    v_or3_b32 v5, v1, v0, 0
+; GISEL-NEXT:  .LBB3_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = uitofp i128 %x to double
+  ret double %cvt
+}
+
+define half @sitofp_i128_to_f16(i128 %x) {
+; SDAG-LABEL: sitofp_i128_to_f16:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB4_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
+; SDAG-NEXT:    v_xor_b32_e32 v0, v5, v0
+; SDAG-NEXT:    v_xor_b32_e32 v1, v5, v1
+; SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
+; SDAG-NEXT:    v_xor_b32_e32 v2, v5, v2
+; SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v6, v5, v3
+; SDAG-NEXT:    v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v2, v4
+; SDAG-NEXT:    v_add_u32_e32 v2, 32, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v5
+; SDAG-NEXT:    v_min_u32_e32 v2, v2, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v0
+; SDAG-NEXT:    v_add_u32_e32 v6, 32, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v7, v1
+; SDAG-NEXT:    v_min_u32_e32 v6, v6, v7
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_add_u32_e32 v6, 64, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v6, 0x80, v7
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x7f, v7
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v6
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff98, v7
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr6
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB4_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 25, v6
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB4_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v6
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB4_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v12, 0x66, v7
+; SDAG-NEXT:    v_sub_u32_e32 v10, 64, v12
+; SDAG-NEXT:    v_lshrrev_b64 v[8:9], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[10:11], v10, v[4:5]
+; SDAG-NEXT:    v_sub_u32_e32 v13, 38, v7
+; SDAG-NEXT:    v_or_b32_e32 v11, v9, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT:    v_lshrrev_b64 v[8:9], v13, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; SDAG-NEXT:    v_add_u32_e32 v14, 26, v7
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[10:11], v13, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[12:13], v14, v[4:5]
+; SDAG-NEXT:    v_subrev_u32_e32 v7, 38, v7
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, v8, v0, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[7:8], v7, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v11, v13, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v14, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v5
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v8, v15, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v1, v9
+; SDAG-NEXT:  .LBB4_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB4_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v4, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_and_b32_e32 v4, 0x4000000, v0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; SDAG-NEXT:    v_alignbit_b32 v8, v1, v0, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_alignbit_b32 v8, v1, v0, 3
+; SDAG-NEXT:    v_mov_b32_e32 v2, v6
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB4_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x80000000, v3
+; SDAG-NEXT:    v_lshl_add_u32 v1, v2, 23, 1.0
+; SDAG-NEXT:    v_and_b32_e32 v2, 0x7fffff, v8
+; SDAG-NEXT:    v_or3_b32 v0, v2, v0, v1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; SDAG-NEXT:  .LBB4_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sitofp_i128_to_f16:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB4_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v0, v6, v0
+; GISEL-NEXT:    v_xor_b32_e32 v1, v6, v1
+; GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_xor_b32_e32 v2, v6, v2
+; GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v3, v6, v3
+; GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v7, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v7, 32, v7
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v8, 0x80, v5
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x7f, v5
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 24, v8
+; GISEL-NEXT:    ; implicit-def: $vgpr4
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr2
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB4_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 26, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB4_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB4_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v4
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v4, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v4
+; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v12, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v13, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v9, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v10, v1, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v5, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v14, 64, v5
+; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v16, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v14, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v11, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v12, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v9, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v10, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v11, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v13, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB4_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB4_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v7, v8
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB4_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x80000000, v6
+; GISEL-NEXT:    v_lshl_add_u32 v1, v7, 23, 1.0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT:    v_or3_b32 v0, v2, v0, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; GISEL-NEXT:  .LBB4_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = sitofp i128 %x to half
+  ret half %cvt
+}
+
+define half @uitofp_i128_to_f16(i128 %x) {
+; SDAG-LABEL: uitofp_i128_to_f16:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB5_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ffbh_u32_e32 v4, v2
+; SDAG-NEXT:    v_add_u32_e32 v4, 32, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v3
+; SDAG-NEXT:    v_min_u32_e32 v4, v4, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v0
+; SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v1
+; SDAG-NEXT:    v_min_u32_e32 v5, v5, v6
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_add_u32_e32 v5, 64, v5
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v5, 0x80, v6
+; SDAG-NEXT:    v_sub_u32_e32 v4, 0x7f, v6
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v5
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff98, v6
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr6
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB5_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 25, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB5_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB5_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v11, 0x66, v6
+; SDAG-NEXT:    v_sub_u32_e32 v9, 64, v11
+; SDAG-NEXT:    v_lshrrev_b64 v[7:8], v11, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[9:10], v9, v[2:3]
+; SDAG-NEXT:    v_sub_u32_e32 v12, 38, v6
+; SDAG-NEXT:    v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v7, v9
+; SDAG-NEXT:    v_lshrrev_b64 v[7:8], v12, v[2:3]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
+; SDAG-NEXT:    v_add_u32_e32 v13, 26, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[9:10], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[11:12], v13, v[2:3]
+; SDAG-NEXT:    v_subrev_u32_e32 v6, 38, v6
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v7, v0, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[6:7], v6, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v11, v9
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v13, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v7, v14, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v7
+; SDAG-NEXT:    v_mov_b32_e32 v1, v8
+; SDAG-NEXT:  .LBB5_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB5_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v2, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SDAG-NEXT:    v_alignbit_b32 v7, v1, v0, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_alignbit_b32 v7, v1, v0, 3
+; SDAG-NEXT:    v_mov_b32_e32 v4, v5
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB5_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v7
+; SDAG-NEXT:    v_lshl_or_b32 v0, v4, 23, v0
+; SDAG-NEXT:    v_add_u32_e32 v0, 1.0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; SDAG-NEXT:  .LBB5_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: uitofp_i128_to_f16:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB5_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v6, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v6, 32, v6
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x80, v5
+; GISEL-NEXT:    v_sub_u32_e32 v6, 0x7f, v5
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 24, v7
+; GISEL-NEXT:    ; implicit-def: $vgpr4
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr7
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr2
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB5_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 26, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB5_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB5_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v4
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v4, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v12, 64, v4
+; GISEL-NEXT:    v_or_b32_e32 v10, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v12, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v1, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v5, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v5
+; GISEL-NEXT:    v_or_b32_e32 v14, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[10:11], v13, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v10, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v11, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v8, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v9, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v10, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v12, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB5_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB5_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v7
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB5_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_add_u32 v0, v6, 23, 1.0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fffff
+; GISEL-NEXT:    v_and_or_b32 v0, v4, v1, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; GISEL-NEXT:  .LBB5_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = uitofp i128 %x to half
+  ret half %cvt
+}
+
+; FIXME: ExpandLargeFpConvert asserts on bfloat
+; define bfloat @sitofp_i128_to_bf16(i128 %x) {
+;   %cvt = sitofp i128 %x to bfloat
+;   ret bfloat %cvt
+; }
+
+; define bfloat @uitofp_i128_to_bf16(i128 %x) {
+;   %cvt = uitofp i128 %x to bfloat
+;   ret bfloat %cvt
+; }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 7d8650e30ba25..dc840a8dc2160 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -199,13 +199,13 @@
 ; GCN-O1-NEXT:      Uniformity Analysis
 ; GCN-O1-NEXT:      AMDGPU atomic optimizations
 ; GCN-O1-NEXT:      Expand Atomic instructions
-; GCN-O1-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-NEXT:      Dominator Tree Construction
+; GCN-O1-NEXT:      Natural Loop Information
+; GCN-O1-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-NEXT:      Cycle Info Analysis
 ; GCN-O1-NEXT:      Uniformity Analysis
 ; GCN-O1-NEXT:      AMDGPU IR optimizations
 ; GCN-O1-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT:      Natural Loop Information
 ; GCN-O1-NEXT:      Canonicalize natural loops
 ; GCN-O1-NEXT:      Scalar Evolution Analysis
 ; GCN-O1-NEXT:      Loop Pass Manager
@@ -478,9 +478,9 @@
 ; GCN-O1-OPTS-NEXT:      Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:      AMDGPU atomic optimizations
 ; GCN-O1-OPTS-NEXT:      Expand Atomic instructions
-; GCN-O1-OPTS-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:      Natural Loop Information
+; GCN-O1-OPTS-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:      Canonicalize natural loops
 ; GCN-O1-OPTS-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O1-OPTS-NEXT:      Lazy Block Frequency Analysis
@@ -787,9 +787,9 @@
 ; GCN-O2-NEXT:      Uniformity Analysis
 ; GCN-O2-NEXT:      AMDGPU atomic optimizations
 ; GCN-O2-NEXT:      Expand Atomic instructions
-; GCN-O2-NEXT:      AMDGPU Promote Alloca
 ; GCN-O2-NEXT:      Dominator Tree Construction
 ; GCN-O2-NEXT:      Natural Loop Information
+; GCN-O2-NEXT:      AMDGPU Promote Alloca
 ; GCN-O2-NEXT:      Split GEPs to a variadic base and a constant offset for better CSE
 ; GCN-O2-NEXT:      Scalar Evolution Analysis
 ; GCN-O2-NEXT:      Straight line strength reduction
@@ -1100,9 +1100,9 @@
 ; GCN-O3-NEXT:      Uniformity Analysis
 ; GCN-O3-NEXT:      AMDGPU atomic optimizations
 ; GCN-O3-NEXT:      Expand Atomic instructions
-; GCN-O3-NEXT:      AMDGPU Promote Alloca
 ; GCN-O3-NEXT:      Dominator Tree Construction
 ; GCN-O3-NEXT:      Natural Loop Information
+; GCN-O3-NEXT:      AMDGPU Promote Alloca
 ; GCN-O3-NEXT:      Split GEPs to a variadic base and a constant offset for better CSE
 ; GCN-O3-NEXT:      Scalar Evolution Analysis
 ; GCN-O3-NEXT:      Straight line strength reduction
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
new file mode 100644
index 0000000000000..fdcb1773d0a3f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
+
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16(<8 x half>, <16 x half>, <8 x half>, i16)
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
+; GCN-NEXT:    v_mov_b32_e32 v48, 0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT:    v_add_nc_u32_e32 v0, s0, v28
+; GCN-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
+; GCN-NEXT:    ds_load_b128 v[8:11], v0
+; GCN-NEXT:    ds_load_b128 v[12:15], v0 offset:512
+; GCN-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
+; GCN-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
+; GCN-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
+; GCN-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
+; GCN-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x6
+; GCN-NEXT:    v_mov_b32_e32 v31, v11
+; GCN-NEXT:    s_wait_dscnt 0x5
+; GCN-NEXT:    v_mov_b32_e32 v35, v15
+; GCN-NEXT:    s_wait_dscnt 0x4
+; GCN-NEXT:    v_mov_b32_e32 v39, v19
+; GCN-NEXT:    s_wait_dscnt 0x3
+; GCN-NEXT:    v_mov_b32_e32 v43, v23
+; GCN-NEXT:    s_wait_dscnt 0x2
+; GCN-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
+; GCN-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
+; GCN-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
+; GCN-NEXT:    v_mov_b32_e32 v32, v12
+; GCN-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
+; GCN-NEXT:    v_mov_b32_e32 v36, v16
+; GCN-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
+; GCN-NEXT:    v_mov_b32_e32 v40, v20
+; GCN-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
+; GCN-NEXT:    v_mov_b32_e32 v44, v24
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
+; GCN-NEXT:    ds_store_b128 v49, v[28:31]
+; GCN-NEXT:    ds_store_b128 v50, v[32:35] offset:512
+; GCN-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
+; GCN-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
+; GCN-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
+; GCN-NEXT:    s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
+; EXACTCUTOFF:       ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v48, 0
+; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v0, s0, v28
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v0
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v0 offset:512
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x6
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v31, v11
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x5
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v35, v15
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x4
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v39, v19
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x3
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v43, v23
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v32, v12
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v36, v16
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v40, v20
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v44, v24
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
+; EXACTCUTOFF-NEXT:    ds_store_b128 v49, v[28:31]
+; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[32:35] offset:512
+; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
+; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
+; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx
+  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
+  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32
+  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
+  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64
+  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
+  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96
+  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
+  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128
+  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
+  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192
+  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
+  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0)
+  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0)
+  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0)
+  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0)
+  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0)
+  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
+  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
+  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
+  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
+  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
+  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
+  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
+  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
+  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
+  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
+  ; 7 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0)
+  ; 5 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
+  ; 5 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v18, 0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
+; GCN-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
+; GCN-NEXT:    ds_load_b128 v[1:4], v17
+; GCN-NEXT:    ds_load_b128 v[5:8], v17 offset:16
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x2
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16]
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:512
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:4608
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:1024
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:7168
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:1536
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:10240
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:2048
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
+; EXACTCUTOFF:       ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v18, 0
+; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
+; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
+; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[1:4], v17
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[5:8], v17 offset:16
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16]
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:512
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:4608
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:1024
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:7168
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:1536
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:10240
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:2048
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
+  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
+  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64
+  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
+  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96
+  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
+  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128
+  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
+  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160
+  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
+  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192
+  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
+  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0)
+  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0)
+  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0)
+  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0)
+  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0)
+  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
+  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
+  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
+  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
+  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
+  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
+  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
+  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
+  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
+  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
+  ; 3 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ; 1 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ; 1 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ; 1 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ; 1 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
new file mode 100644
index 0000000000000..538ce15979de8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -0,0 +1,305 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK:           .amdgpu_pal_metadata
+; CHECK-NEXT: ---
+; CHECK-NEXT: amdpal.pipelines:
+; CHECK-NEXT:  - .api:            Vulkan
+; CHECK-NEXT:    .compute_registers:
+; CHECK-NEXT:      .tg_size_en:     true
+; CHECK-NEXT:      .tgid_x_en:      false
+; CHECK-NEXT:      .tgid_y_en:      false
+; CHECK-NEXT:      .tgid_z_en:      false
+; CHECK-NEXT:      .tidig_comp_cnt: 0x1
+; CHECK-NEXT:    .hardware_stages:
+; CHECK-NEXT:      .cs:
+; CHECK-NEXT:        .checksum_value: 0x9444d7d0
+; CHECK-NEXT:        .debug_mode:     0
+; CHECK-NEXT:        .excp_en:        0
+; CHECK-NEXT:        .float_mode:     0xc0
+; CHECK-NEXT:        .ieee_mode:      true
+; CHECK-NEXT:        .image_op:       false
+; CHECK-NEXT:        .lds_size:       0x200
+; CHECK-NEXT:        .mem_ordered:    true
+; CHECK-NEXT:        .sgpr_limit:     0x6a
+; CHECK-NEXT:        .threadgroup_dimensions:
+; CHECK-NEXT:          - 0x1
+; CHECK-NEXT:          - 0x400
+; CHECK-NEXT:          - 0x1
+; CHECK-NEXT:        .trap_present:   false
+; CHECK-NEXT:        .user_data_reg_map:
+; CHECK-NEXT:          - 0x10000000
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:        .user_sgprs:     0x3
+; CHECK-NEXT:        .vgpr_limit:     0x100
+; CHECK-NEXT:        .wavefront_size: 0x40
+; CHECK-NEXT:        .wgp_mode:       true
+; CHECK:    .registers:      {}
+; CHECK-NEXT:    .shader_functions:
+; CHECK-NEXT:      dynamic_stack:
+; CHECK-NEXT:        .backend_stack_size: 0x10
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x22
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT:        .vgpr_count:     0x2
+; CHECK-NEXT:      dynamic_stack_loop:
+; CHECK-NEXT:        .backend_stack_size: 0x10
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x22
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT:        .vgpr_count:     0x3
+; CHECK-NEXT:      multiple_stack:
+; CHECK-NEXT:        .backend_stack_size: 0x24
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x21
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x24
+; CHECK-NEXT:        .vgpr_count:     0x3
+; CHECK-NEXT:      no_stack:
+; CHECK-NEXT:        .backend_stack_size: 0
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x20
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
+; CHECK-NEXT:        .vgpr_count:     0x1
+; CHECK-NEXT:      no_stack_call:
+; CHECK-NEXT:        .backend_stack_size: 0x10
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x22
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT:        .vgpr_count:     0x3
+; CHECK-NEXT:      no_stack_extern_call:
+; CHECK-NEXT:        .backend_stack_size: 0x10
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x29
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT:        .vgpr_count:     0x58
+; CHECK-NEXT:      no_stack_extern_call_many_args:
+; CHECK-NEXT:        .backend_stack_size: 0x90
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x29
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x90
+; CHECK-NEXT:        .vgpr_count:     0x58
+; CHECK-NEXT:      no_stack_indirect_call:
+; CHECK-NEXT:        .backend_stack_size: 0x10
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x29
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT:        .vgpr_count:     0x58
+; CHECK-NEXT:      simple_lds:
+; CHECK-NEXT:        .backend_stack_size: 0
+; CHECK-NEXT:        .lds_size:       0x100
+; CHECK-NEXT:        .sgpr_count:     0x20
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
+; CHECK-NEXT:        .vgpr_count:     0x1
+; CHECK-NEXT:      simple_lds_recurse:
+; CHECK-NEXT:        .backend_stack_size: 0x10
+; CHECK-NEXT:        .lds_size:       0x100
+; CHECK-NEXT:        .sgpr_count:     0x24
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT:        .vgpr_count:     0x29
+; CHECK-NEXT:      simple_stack:
+; CHECK-NEXT:        .backend_stack_size: 0x14
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x21
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x14
+; CHECK-NEXT:        .vgpr_count:     0x2
+; CHECK-NEXT:      simple_stack_call:
+; CHECK-NEXT:        .backend_stack_size: 0x20
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x22
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT:        .vgpr_count:     0x4
+; CHECK-NEXT:      simple_stack_extern_call:
+; CHECK-NEXT:        .backend_stack_size: 0x20
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x29
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT:        .vgpr_count:     0x58
+; CHECK-NEXT:      simple_stack_indirect_call:
+; CHECK-NEXT:        .backend_stack_size: 0x20
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x29
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT:        .vgpr_count:     0x58
+; CHECK-NEXT:      simple_stack_recurse:
+; CHECK-NEXT:        .backend_stack_size: 0x20
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .sgpr_count:     0x24
+; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT:        .vgpr_count:     0x2a
+; CHECK:amdpal.version:
+; CHECK-NEXT:  - 0x3
+; CHECK-NEXT:  - 0
+; CHECK-NEXT:...
+; CHECK-NEXT:        .end_amdgpu_pal_metadata
+
+declare amdgpu_gfx float @extern_func(float) #0
+declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
+
+@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define amdgpu_gfx float @no_stack(float %arg0) #0 {
+  %add = fadd float %arg0, 1.0
+  ret float %add
+}
+
+define amdgpu_gfx float @simple_stack(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
+  %add = fadd float %arg0, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
+  %add = fadd float %arg0, %val
+  %stack2 = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, ptr addrspace(5) %stack2
+  %val2 = load volatile float, ptr addrspace(5) %stack2
+  %add2 = fadd float %add, %val2
+  ret float %add2
+}
+
+define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
+bb0:
+  %cmp = fcmp ogt float %arg0, 0.0
+  br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
+  %add = fadd float %arg0, %val
+  br label %bb2
+
+bb2:
+  %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
+  ret float %res
+}
+
+define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
+bb0:
+  br label %bb1
+
+bb1:
+  %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
+  %add = fadd float %arg0, %val
+  %cmp = icmp sgt i32 %ctr, 0
+  %newctr = sub i32 %ctr, 1
+  br i1 %cmp, label %bb1, label %bb2
+
+bb2:
+  ret float %add
+}
+
+define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
+  %res = call amdgpu_gfx float @simple_stack(float %arg0)
+  ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
+  %res = call amdgpu_gfx float @simple_stack(float %arg0)
+  %add = fadd float %res, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
+  %res = call amdgpu_gfx float @extern_func(float %arg0)
+  ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
+  %res = call amdgpu_gfx float @extern_func(float %arg0)
+  %add = fadd float %res, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
+  %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
+  ret float %res
+}
+
+define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
+  %fptr = load ptr, ptr addrspace(4) @funcptr
+  call amdgpu_gfx void %fptr()
+  ret float %arg0
+}
+
+define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
+  %fptr = load ptr, ptr addrspace(4) @funcptr
+  call amdgpu_gfx void %fptr()
+  %add = fadd float %arg0, %val
+  ret float %add
+}
+
+define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
+  %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
+  %add = fadd float %res, %val
+  ret float %add
+}
+
+@lds = internal addrspace(3) global [64 x float] undef
+
+define amdgpu_gfx float @simple_lds(float %arg0) #0 {
+  %val = load float, ptr addrspace(3) @lds
+  ret float %val
+}
+
+define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
+  %val = load float, ptr addrspace(3) @lds
+  %res = call amdgpu_gfx float @simple_lds_recurse(float %val)
+  ret float %res
+}
+
+attributes #0 = { nounwind }
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
+!1 = !{i32 7}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
new file mode 100644
index 0000000000000..ab03177d1edc5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK:      Scoring:   %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:   [+1]:   store i32 42, ptr addrspace(5) %simpleuser, align 4
+; CHECK-NEXT:   => Final Score:1
+; CHECK-NEXT: Scoring:   %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:   [+1]:   store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
+; CHECK-NEXT:   [+1]:   %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
+; CHECK-NEXT:   [+1]:   store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
+; CHECK-NEXT:   [+1]:   %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
+; CHECK-NEXT:   => Final Score:4
+; CHECK-NEXT: Sorted Worklist:
+; CHECK-NEXT:     %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:     %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+define amdgpu_kernel void @simple_users_scores() #0 {
+entry:
+  ; should get a score of 1
+  %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+  ; should get a score of 4
+  %manyusers = alloca [4 x i64], align 4, addrspace(5)
+
+  store i32 42, ptr addrspace(5) %simpleuser
+
+  %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+  %v0 = load i8, ptr addrspace(5)  %manyusers.1
+  %v0.ext = zext i8 %v0 to i32
+  store i32 %v0.ext, ptr addrspace(5) %manyusers.1
+
+  %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
+  %v1 = load i8, ptr addrspace(5)  %manyusers.2
+  %v1.ext = zext i8 %v0 to i32
+  store i32 %v1.ext, ptr addrspace(5) %manyusers.2
+
+  ret void
+}
+
+; CHECK:      Scoring:   %stack = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:   [+5]:   store i32 32, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT:   [+1]:   store i32 42, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT:   [+9]:   store i32 32, ptr addrspace(5) %stack.1, align 4
+; CHECK-NEXT:   [+5]:   %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
+; CHECK-NEXT:   [+1]:   store i32 64, ptr addrspace(5) %stack.2, align 4
+; CHECK-NEXT:   [+9]:   %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
+; CHECK-NEXT:   => Final Score:30
+define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
+entry:
+  ; should get a score of 1
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
+  %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8
+
+  store i32 42, ptr addrspace(5) %stack
+  br label %loop.outer
+
+loop.outer:
+  store i32 32, ptr addrspace(5) %stack
+  %outer.cmp = load i1, ptr addrspace(5) %stack.1
+  br label %loop.inner
+
+loop.inner:
+  store i32 32, ptr addrspace(5) %stack.1
+  %inner.cmp = load i1, ptr addrspace(5) %stack.2
+  br i1 %inner.cmp, label %loop.inner, label %loop.outer
+
+exit:
+  store i32 64, ptr addrspace(5) %stack.2
+  ret void
+}
diff --git a/llvm/test/CodeGen/AVR/bug-81911.ll b/llvm/test/CodeGen/AVR/bug-81911.ll
new file mode 100644
index 0000000000000..2a22666a1ff92
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/bug-81911.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=avr -mcpu=atmega328 -O1 -verify-machineinstrs | FileCheck %s
+
+define internal i8 @main() {
+; CHECK-LABEL: main:
+; CHECK:       ; %bb.0: ; %bb0
+; CHECK-NEXT:    push r2
+; CHECK-NEXT:    push r3
+; CHECK-NEXT:    push r4
+; CHECK-NEXT:    push r5
+; CHECK-NEXT:    push r6
+; CHECK-NEXT:    push r7
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    push r11
+; CHECK-NEXT:    push r12
+; CHECK-NEXT:    push r13
+; CHECK-NEXT:    push r14
+; CHECK-NEXT:    push r15
+; CHECK-NEXT:    push r16
+; CHECK-NEXT:    push r17
+; CHECK-NEXT:    push r28
+; CHECK-NEXT:    push r29
+; CHECK-NEXT:    in r28, 61
+; CHECK-NEXT:    in r29, 62
+; CHECK-NEXT:    sbiw r28, 13
+; CHECK-NEXT:    in r0, 63
+; CHECK-NEXT:    cli
+; CHECK-NEXT:    out 62, r29
+; CHECK-NEXT:    out 63, r0
+; CHECK-NEXT:    out 61, r28
+; CHECK-NEXT:    ldi r16, 0
+; CHECK-NEXT:    ldi r17, 0
+; CHECK-NEXT:    ldi r18, -1
+; CHECK-NEXT:    ;APP
+; CHECK-NEXT:    ldi r24, 123
+; CHECK-NEXT:    ;NO_APP
+; CHECK-NEXT:    std Y+1, r24 ; 1-byte Folded Spill
+; CHECK-NEXT:    movw r24, r28
+; CHECK-NEXT:    adiw r24, 6
+; CHECK-NEXT:    std Y+3, r25 ; 2-byte Folded Spill
+; CHECK-NEXT:    std Y+2, r24 ; 2-byte Folded Spill
+; CHECK-NEXT:    movw r8, r16
+; CHECK-NEXT:    movw r6, r16
+; CHECK-NEXT:    movw r4, r16
+; CHECK-NEXT:    movw r2, r16
+; CHECK-NEXT:    rjmp .LBB0_2
+; CHECK-NEXT:  .LBB0_1: ; %bb1
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    andi r30, 1
+; CHECK-NEXT:    ldd r31, Y+4 ; 1-byte Folded Reload
+; CHECK-NEXT:    dec r31
+; CHECK-NEXT:    cpi r30, 0
+; CHECK-NEXT:    movw r8, r18
+; CHECK-NEXT:    movw r6, r20
+; CHECK-NEXT:    movw r4, r22
+; CHECK-NEXT:    movw r2, r24
+; CHECK-NEXT:    mov r18, r31
+; CHECK-NEXT:    brne .LBB0_2
+; CHECK-NEXT:    rjmp .LBB0_4
+; CHECK-NEXT:  .LBB0_2: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    std Y+4, r18 ; 1-byte Folded Spill
+; CHECK-NEXT:    movw r18, r8
+; CHECK-NEXT:    movw r20, r6
+; CHECK-NEXT:    movw r22, r4
+; CHECK-NEXT:    movw r24, r2
+; CHECK-NEXT:    ldi r26, 10
+; CHECK-NEXT:    ldi r27, 0
+; CHECK-NEXT:    movw r10, r26
+; CHECK-NEXT:    movw r12, r16
+; CHECK-NEXT:    movw r14, r16
+; CHECK-NEXT:    call __udivdi3
+; CHECK-NEXT:    std Y+13, r25
+; CHECK-NEXT:    std Y+12, r24
+; CHECK-NEXT:    std Y+11, r23
+; CHECK-NEXT:    std Y+10, r22
+; CHECK-NEXT:    std Y+9, r21
+; CHECK-NEXT:    std Y+8, r20
+; CHECK-NEXT:    std Y+7, r19
+; CHECK-NEXT:    std Y+6, r18
+; CHECK-NEXT:    ldd r30, Y+2 ; 2-byte Folded Reload
+; CHECK-NEXT:    ldd r31, Y+3 ; 2-byte Folded Reload
+; CHECK-NEXT:    ;APP
+; CHECK-NEXT:    ;NO_APP
+; CHECK-NEXT:    ldi r30, 1
+; CHECK-NEXT:    cp r8, r1
+; CHECK-NEXT:    cpc r9, r1
+; CHECK-NEXT:    cpc r6, r16
+; CHECK-NEXT:    cpc r7, r17
+; CHECK-NEXT:    cpc r4, r16
+; CHECK-NEXT:    cpc r5, r17
+; CHECK-NEXT:    cpc r2, r16
+; CHECK-NEXT:    cpc r3, r17
+; CHECK-NEXT:    breq .LBB0_3
+; CHECK-NEXT:    rjmp .LBB0_1
+; CHECK-NEXT:  .LBB0_3: ; %bb1
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    mov r30, r1
+; CHECK-NEXT:    rjmp .LBB0_1
+; CHECK-NEXT:  .LBB0_4: ; %bb3
+; CHECK-NEXT:    ldd r24, Y+1 ; 1-byte Folded Reload
+; CHECK-NEXT:    std Y+5, r24
+; CHECK-NEXT:    movw r24, r28
+; CHECK-NEXT:    adiw r24, 5
+; CHECK-NEXT:    ;APP
+; CHECK-NEXT:    ;NO_APP
+; CHECK-NEXT:    ldd r24, Y+5
+; CHECK-NEXT:    adiw r28, 13
+; CHECK-NEXT:    in r0, 63
+; CHECK-NEXT:    cli
+; CHECK-NEXT:    out 62, r29
+; CHECK-NEXT:    out 63, r0
+; CHECK-NEXT:    out 61, r28
+; CHECK-NEXT:    pop r29
+; CHECK-NEXT:    pop r28
+; CHECK-NEXT:    pop r17
+; CHECK-NEXT:    pop r16
+; CHECK-NEXT:    pop r15
+; CHECK-NEXT:    pop r14
+; CHECK-NEXT:    pop r13
+; CHECK-NEXT:    pop r12
+; CHECK-NEXT:    pop r11
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    pop r7
+; CHECK-NEXT:    pop r6
+; CHECK-NEXT:    pop r5
+; CHECK-NEXT:    pop r4
+; CHECK-NEXT:    pop r3
+; CHECK-NEXT:    pop r2
+; CHECK-NEXT:    ret
+bb0:
+  %0 = alloca i64
+  %1 = alloca i8
+  %2 = tail call i8 asm sideeffect "ldi ${0}, 123", "=&r,~{sreg},~{memory}"()
+
+  br label %bb1
+
+bb1:
+  %3 = phi i64 [ %5, %bb1 ], [ 0, %bb0 ]
+  %4 = phi i8 [ %6, %bb1 ], [ 0, %bb0 ]
+  %5 = udiv i64 %3, 10
+  %6 = add i8 %4, 1
+
+  store i64 %5, ptr %0
+  call void asm sideeffect "", "r,~{memory}"(ptr %0)
+
+  %7 = icmp eq i64 %3, 0
+  %8 = icmp eq i8 %6, 0
+
+  br i1 %7, label %bb3, label %bb1
+
+bb3:
+  store i8 %2, ptr %1
+  call void asm sideeffect "", "r,~{memory}"(ptr %1)
+
+  %9 = load i8, ptr %1
+
+  ret i8 %9
+}
diff --git a/llvm/test/CodeGen/BPF/addr-space-globals.ll b/llvm/test/CodeGen/BPF/addr-space-globals.ll
index 878ba0dfce6cd..73e80b7a0400b 100644
--- a/llvm/test/CodeGen/BPF/addr-space-globals.ll
+++ b/llvm/test/CodeGen/BPF/addr-space-globals.ll
@@ -18,7 +18,7 @@
 
 ; Verify that a,b,c reside in the same section
 
-; CHECK:     .section .arena.272,"aw",@progbits
+; CHECK:     .section .addr_space.272,"aw",@progbits
 ; CHECK-NOT: .section
 ; CHECK:     .globl  a
 ; CHECK:     .ascii  "\001\002"
diff --git a/llvm/test/CodeGen/BPF/addr-space-globals2.ll b/llvm/test/CodeGen/BPF/addr-space-globals2.ll
index d1e2318948751..5944cb27ee18a 100644
--- a/llvm/test/CodeGen/BPF/addr-space-globals2.ll
+++ b/llvm/test/CodeGen/BPF/addr-space-globals2.ll
@@ -14,12 +14,12 @@
 
 ; Verify that a,b reside in separate sections
 
-; CHECK:     .section .arena.1,"aw",@progbits
+; CHECK:     .section .addr_space.1,"aw",@progbits
 ; CHECK-NOT: .section
 ; CHECK:     .globl  a
 ; CHECK:     .ascii  "\001\002"
 
-; CHECK:     .section .arena.2,"aw",@progbits
+; CHECK:     .section .addr_space.2,"aw",@progbits
 ; CHECK-NOT: .section
 ; CHECK:     .globl  b
 ; CHECK:     .ascii  "\003\004"
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll
index 865fefeac335d..d027216e4213d 100644
--- a/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll
@@ -3,10 +3,11 @@
 
 target triple = "dxil-pc-shadermodel6.7-library"
 
-; CHECK: ; Shader Flags Value: 0x00000021
+; CHECK: ; Shader Flags Value: 0x00000044
 ; CHECK: ; Note: shader requires additional functionality:
 ; CHECK-NEXT: ;       Double-precision floating point
 ; CHECK-NEXT: ;       Double-precision extensions for 11.1
+; CHECK-NEXT: ; Note: extra DXIL module flags:
 ; CHECK-NEXT: {{^;$}}
 define double @div(double %a, double %b) {
   %res = fdiv double %a, %b
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll
index f90db61661f09..c1a4c219a1695 100644
--- a/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll
@@ -3,10 +3,12 @@
 
 target triple = "dxil-pc-shadermodel6.7-library"
 
-; CHECK: ; Shader Flags Value: 0x00000001
+; CHECK: ; Shader Flags Value: 0x00000004
 ; CHECK: ; Note: shader requires additional functionality:
 ; CHECK-NEXT: ;       Double-precision floating point
+; CHECK-NEXT: ; Note: extra DXIL module flags:
 ; CHECK-NEXT: {{^;$}}
+
 define double @add(double %a, double %b) {
   %sum = fadd double %a, %b
   ret double %sum
diff --git a/llvm/test/CodeGen/DirectX/any.ll b/llvm/test/CodeGen/DirectX/any.ll
new file mode 100644
index 0000000000000..e8d87075d65cf
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/any.ll
@@ -0,0 +1,113 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for any are generated for float and half.
+
+; CHECK-LABEL: any_bool
+; CHECK: icmp ne i1 %{{.*}}, false
+define noundef i1 @any_bool(i1 noundef %p0) {
+entry:
+  %p0.addr = alloca i8, align 1
+  %frombool = zext i1 %p0 to i8
+  store i8 %frombool, ptr %p0.addr, align 1
+  %0 = load i8, ptr %p0.addr, align 1
+  %tobool = trunc i8 %0 to i1
+  %dx.any = call i1 @llvm.dx.any.i1(i1 %tobool)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_int64_t
+; CHECK: icmp ne i64 %{{.*}}, 0
+define noundef i1 @any_int64_t(i64 noundef %p0) {
+entry:
+  %p0.addr = alloca i64, align 8
+  store i64 %p0, ptr %p0.addr, align 8
+  %0 = load i64, ptr %p0.addr, align 8
+  %dx.any = call i1 @llvm.dx.any.i64(i64 %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_int
+; CHECK: icmp ne i32 %{{.*}}, 0
+define noundef i1 @any_int(i32 noundef %p0) {
+entry:
+  %p0.addr = alloca i32, align 4
+  store i32 %p0, ptr %p0.addr, align 4
+  %0 = load i32, ptr %p0.addr, align 4
+  %dx.any = call i1 @llvm.dx.any.i32(i32 %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_int16_t
+; CHECK: icmp ne i16 %{{.*}}, 0
+define noundef i1 @any_int16_t(i16 noundef %p0) {
+entry:
+  %p0.addr = alloca i16, align 2
+  store i16 %p0, ptr %p0.addr, align 2
+  %0 = load i16, ptr %p0.addr, align 2
+  %dx.any = call i1 @llvm.dx.any.i16(i16 %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_double
+; CHECK: fcmp une double %{{.*}}, 0.000000e+00
+define noundef i1 @any_double(double noundef %p0) {
+entry:
+  %p0.addr = alloca double, align 8
+  store double %p0, ptr %p0.addr, align 8
+  %0 = load double, ptr %p0.addr, align 8
+  %dx.any = call i1 @llvm.dx.any.f64(double %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_float
+; CHECK: fcmp une float %{{.*}}, 0.000000e+00
+define noundef i1 @any_float(float noundef %p0) {
+entry:
+  %p0.addr = alloca float, align 4
+  store float %p0, ptr %p0.addr, align 4
+  %0 = load float, ptr %p0.addr, align 4
+  %dx.any = call i1 @llvm.dx.any.f32(float %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_half
+; CHECK: fcmp une half %{{.*}}, 0xH0000
+define noundef i1 @any_half(half noundef %p0) {
+entry:
+  %p0.addr = alloca half, align 2
+  store half %p0, ptr %p0.addr, align 2
+  %0 = load half, ptr %p0.addr, align 2
+  %dx.any = call i1 @llvm.dx.any.f16(half %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_bool4
+; CHECK: icmp ne <4 x i1> %extractvec, zeroinitialize
+; CHECK: extractelement <4 x i1> %{{.*}}, i64 0
+; CHECK: extractelement <4 x i1> %{{.*}}, i64 1
+; CHECK: or i1  %{{.*}}, %{{.*}}
+; CHECK: extractelement <4 x i1> %{{.*}}, i64 2
+; CHECK: or i1  %{{.*}}, %{{.*}}
+; CHECK: extractelement <4 x i1> %{{.*}}, i64 3
+; CHECK: or i1  %{{.*}}, %{{.*}}
+define noundef i1 @any_bool4(<4 x i1> noundef %p0) {
+entry:
+  %p0.addr = alloca i8, align 1
+  %insertvec = shufflevector <4 x i1> %p0, <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %0 = bitcast <8 x i1> %insertvec to i8
+  store i8 %0, ptr %p0.addr, align 1
+  %load_bits = load i8, ptr %p0.addr, align 1
+  %1 = bitcast i8 %load_bits to <8 x i1>
+  %extractvec = shufflevector <8 x i1> %1, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %dx.any = call i1 @llvm.dx.any.v4i1(<4 x i1> %extractvec)
+  ret i1 %dx.any
+}
+
+declare i1 @llvm.dx.any.v4i1(<4 x i1>)
+declare i1 @llvm.dx.any.i1(i1)
+declare i1 @llvm.dx.any.i16(i16)
+declare i1 @llvm.dx.any.i32(i32)
+declare i1 @llvm.dx.any.i64(i64)
+declare i1 @llvm.dx.any.f16(half)
+declare i1 @llvm.dx.any.f32(float)
+declare i1 @llvm.dx.any.f64(double)
diff --git a/llvm/test/CodeGen/DirectX/clamp-vec.ll b/llvm/test/CodeGen/DirectX/clamp-vec.ll
new file mode 100644
index 0000000000000..d4f33a18b7157
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/clamp-vec.ll
@@ -0,0 +1,74 @@
+; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s
+
+; Make sure dxil operation function calls for clamp are generated for float/int/uint vectors.
+
+; CHECK-LABEL: clamp_half3
+define noundef <3 x half> @clamp_half3(<3 x half> noundef %a, <3 x half> noundef %b, <3 x half> noundef %c) {
+entry:
+  ; CHECK: call <3 x half> @llvm.maxnum.v3f16(<3 x half>  %a, <3 x half>  %b)
+  ; CHECK: call <3 x half> @llvm.minnum.v3f16(<3 x half>  %{{.*}}, <3 x half>  %c)
+  %dx.clamp = call <3 x half> @llvm.dx.clamp.v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c)
+  ret <3 x half> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_float4
+define noundef <4 x float> @clamp_float4(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c) {
+entry:
+  ; CHECK: call <4 x float> @llvm.maxnum.v4f32(<4 x float>  %a, <4 x float>  %b)
+  ; CHECK: call <4 x float> @llvm.minnum.v4f32(<4 x float>  %{{.*}}, <4 x float>  %c)
+  %dx.clamp = call <4 x float> @llvm.dx.clamp.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  ret <4 x float> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_double2
+define noundef <2 x double> @clamp_double2(<2 x double> noundef %a, <2 x double> noundef %b, <2 x double> noundef %c) {
+entry:
+  ; CHECK: call <2 x double> @llvm.maxnum.v2f64(<2 x double>  %a, <2 x double>  %b)
+  ; CHECK: call <2 x double> @llvm.minnum.v2f64(<2 x double>  %{{.*}}, <2 x double>  %c)
+  %dx.clamp = call <2 x double> @llvm.dx.clamp.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_int4
+define noundef <4 x i32> @clamp_int4(<4 x i32> noundef %a, <4 x i32> noundef %b, <4 x i32> noundef %c) {
+entry:
+  ; CHECK: call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
+  ; CHECK: call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %c)
+  %dx.clamp = call <4 x i32> @llvm.dx.clamp.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+  ret <4 x i32> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_uint16_t3
+define noundef <3 x i16> @clamp_uint16_t3(<3 x i16> noundef %a, <3 x i16> noundef %b, <3 x i16> noundef %c) {
+entry:
+  ; CHECK: call <3 x i16> @llvm.umax.v3i16(<3 x i16>  %a, <3 x i16>  %b)
+  ; CHECK: call <3 x i16> @llvm.umin.v3i16(<3 x i16>  %{{.*}}, <3 x i16>  %c)
+  %dx.clamp = call <3 x i16> @llvm.dx.uclamp.v3i16(<3 x i16> %a, <3 x i16> %b, <3 x i16> %c)
+  ret <3 x i16> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_uint4
+define noundef <4 x i32> @clamp_uint4(<4 x i32> noundef %a, <4 x i32> noundef %b, <4 x i32> noundef %c) {
+entry:
+  ; CHECK: call <4 x i32> @llvm.umax.v4i32(<4 x i32>  %a, <4 x i32>  %b)
+  ; CHECK: call <4 x i32> @llvm.umin.v4i32(<4 x i32>  %{{.*}}, <4 x i32>  %c)
+  %dx.clamp = call <4 x i32> @llvm.dx.uclamp.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+  ret <4 x i32> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_uint64_t4
+define noundef <2 x i64> @clamp_uint64_t4(<2 x i64> noundef %a, <2 x i64> noundef %b, <2 x i64> noundef %c) {
+entry:
+  ; CHECK: call <2 x i64> @llvm.umax.v2i64(<2 x i64>  %a, <2 x i64>  %b)
+  ; CHECK: call <2 x i64> @llvm.umin.v2i64(<2 x i64>  %{{.*}}, <2 x i64>  %c)
+  %dx.clamp = call <2 x i64> @llvm.dx.uclamp.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
+  ret <2 x i64> %dx.clamp
+}
+
+declare <3 x half> @llvm.dx.clamp.v3f16(<3 x half>, <3 x half>, <3 x half>)
+declare <4 x float> @llvm.dx.clamp.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.dx.clamp.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x i32> @llvm.dx.clamp.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <3 x i16> @llvm.dx.uclamp.v3i32(<3 x i16>, <3 x i32>, <3 x i16>)
+declare <4 x i32> @llvm.dx.uclamp.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.dx.uclamp.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
diff --git a/llvm/test/CodeGen/DirectX/clamp.ll b/llvm/test/CodeGen/DirectX/clamp.ll
new file mode 100644
index 0000000000000..f122313b8d7dc
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/clamp.ll
@@ -0,0 +1,94 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for clamp/uclamp are generated for half/float/double/i16/i32/i64.
+
+; CHECK-LABEL:test_clamp_i16
+define noundef i16 @test_clamp_i16(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 37, i16 %{{.*}}, i16 %{{.*}})
+; CHECK: call i16 @dx.op.binary.i16(i32 38, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.dx.clamp.i16(i16 %a, i16 %b, i16 %c)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_clamp_i32
+define noundef i32 @test_clamp_i32(i32 noundef %a, i32 noundef %b, i32 noundef %c) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 37, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: call i32 @dx.op.binary.i32(i32 38, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.dx.clamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_clamp_i64
+define noundef i64 @test_clamp_i64(i64 noundef %a, i64 noundef %b, i64 noundef %c) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 37, i64 %a, i64 %b)
+; CHECK: call i64 @dx.op.binary.i64(i32 38, i64 %{{.*}}, i64 %c)
+  %0 = call i64 @llvm.dx.clamp.i64(i64 %a, i64 %b, i64 %c)
+  ret i64 %0
+}
+
+; CHECK-LABEL:test_clamp_half
+define noundef half @test_clamp_half(half noundef %a, half noundef %b, half noundef %c) {
+entry:
+; CHECK: call half @dx.op.binary.f16(i32 35, half %{{.*}}, half %{{.*}})
+; CHECK: call half @dx.op.binary.f16(i32 36, half %{{.*}}, half %{{.*}})
+  %0 = call half @llvm.dx.clamp.f16(half %a, half %b, half %c)
+  ret half %0
+}
+
+; CHECK-LABEL:test_clamp_float
+define noundef float @test_clamp_float(float noundef %a, float noundef %b, float noundef %c) {
+entry:
+; CHECK: call float @dx.op.binary.f32(i32 35, float %{{.*}}, float %{{.*}})
+; CHECK: call float @dx.op.binary.f32(i32 36, float %{{.*}}, float %{{.*}})
+  %0 = call float @llvm.dx.clamp.f32(float %a, float %b, float %c)
+  ret float %0
+}
+
+; CHECK-LABEL:test_clamp_double
+define noundef double @test_clamp_double(double noundef %a, double noundef %b, double noundef %c) {
+entry:
+; CHECK: call double @dx.op.binary.f64(i32 35, double %{{.*}}, double %{{.*}})
+; CHECK: call double @dx.op.binary.f64(i32 36, double %{{.*}}, double %{{.*}})
+  %0 = call double @llvm.dx.clamp.f64(double %a, double %b, double %c)
+  ret double %0
+}
+
+; CHECK-LABEL:test_uclamp_i16
+define noundef i16 @test_uclamp_i16(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 39, i16 %{{.*}}, i16 %{{.*}})
+; CHECK: call i16 @dx.op.binary.i16(i32 40, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.dx.uclamp.i16(i16 %a, i16 %b, i16 %c)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_uclamp_i32
+define noundef i32 @test_uclamp_i32(i32 noundef %a, i32 noundef %b, i32 noundef %c) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 39, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: call i32 @dx.op.binary.i32(i32 40, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.dx.uclamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_uclamp_i64
+define noundef i64 @test_uclamp_i64(i64 noundef %a, i64 noundef %b, i64 noundef %c) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 39, i64 %a, i64 %b)
+; CHECK: call i64 @dx.op.binary.i64(i32 40, i64 %{{.*}}, i64 %c)
+  %0 = call i64 @llvm.dx.uclamp.i64(i64 %a, i64 %b, i64 %c)
+  ret i64 %0
+}
+
+declare half @llvm.dx.clamp.f16(half, half, half)
+declare float @llvm.dx.clamp.f32(float, float, float)
+declare double @llvm.dx.clamp.f64(double, double, double)
+declare i16 @llvm.dx.clamp.i16(i16, i16, i16)
+declare i32 @llvm.dx.clamp.i32(i32, i32, i32)
+declare i64 @llvm.dx.clamp.i64(i64, i64, i64)
+declare i16 @llvm.dx.uclamp.i16(i16, i16, i16)
+declare i32 @llvm.dx.uclamp.i32(i32, i32, i32)
+declare i64 @llvm.dx.uclamp.i64(i64, i64, i64)
diff --git a/llvm/test/CodeGen/DirectX/exp-vec.ll b/llvm/test/CodeGen/DirectX/exp-vec.ll
new file mode 100644
index 0000000000000..c937155719054
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/exp-vec.ll
@@ -0,0 +1,17 @@
+; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s
+
+; Make sure dxil operation function calls for exp are generated for float and half.
+
+; CHECK-LABEL: exp_float4
+; CHECK: fmul <4 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>,  %{{.*}}
+; CHECK: call <4 x float> @llvm.exp2.v4f32(<4 x float>  %{{.*}})
+define noundef <4 x float> @exp_float4(<4 x float> noundef %p0) {
+entry:
+  %p0.addr = alloca <4 x float>, align 16
+  store <4 x float> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x float>, ptr %p0.addr, align 16
+  %elt.exp = call <4 x float> @llvm.exp.v4f32(<4 x float> %0)
+  ret <4 x float> %elt.exp
+}
+
+declare <4 x float> @llvm.exp.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/exp.ll b/llvm/test/CodeGen/DirectX/exp.ll
new file mode 100644
index 0000000000000..fdafc1438cf0e
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/exp.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for exp are generated for float and half.
+
+; CHECK-LABEL: exp_float
+; CHECK: fmul float 0x3FF7154760000000, %{{.*}}
+; CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}})
+define noundef float @exp_float(float noundef %a) {
+entry:
+  %a.addr = alloca float, align 4
+  store float %a, ptr %a.addr, align 4
+  %0 = load float, ptr %a.addr, align 4
+  %elt.exp = call float @llvm.exp.f32(float %0)
+  ret float %elt.exp
+}
+
+; CHECK-LABEL: exp_half
+; CHECK: fmul half 0xH3DC5, %{{.*}}
+; CHECK: call half @dx.op.unary.f16(i32 21, half %{{.*}})
+; Function Attrs: noinline nounwind optnone
+define noundef half @exp_half(half noundef %a) {
+entry:
+  %a.addr = alloca half, align 2
+  store half %a, ptr %a.addr, align 2
+  %0 = load half, ptr %a.addr, align 2
+  %elt.exp = call half @llvm.exp.f16(half %0)
+  ret half %elt.exp
+}
+
+declare half @llvm.exp.f16(half)
+declare float @llvm.exp.f32(float)
diff --git a/llvm/test/CodeGen/DirectX/fmax.ll b/llvm/test/CodeGen/DirectX/fmax.ll
new file mode 100644
index 0000000000000..aff722c29309c
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/fmax.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for fmax are generated for half/float/double.
+
+; CHECK-LABEL:test_fmax_half
+define noundef half @test_fmax_half(half noundef %a, half noundef %b) {
+entry:
+; CHECK: call half @dx.op.binary.f16(i32 35, half %{{.*}}, half %{{.*}})
+  %0 = call half @llvm.maxnum.f16(half %a, half %b)
+  ret half %0
+}
+
+; CHECK-LABEL:test_fmax_float
+define noundef float @test_fmax_float(float noundef %a, float noundef %b) {
+entry:
+; CHECK: call float @dx.op.binary.f32(i32 35, float %{{.*}}, float %{{.*}})
+  %0 = call float @llvm.maxnum.f32(float %a, float %b)
+  ret float %0
+}
+
+; CHECK-LABEL:test_fmax_double
+define noundef double @test_fmax_double(double noundef %a, double noundef %b) {
+entry:
+; CHECK: call double @dx.op.binary.f64(i32 35, double %{{.*}}, double %{{.*}})
+  %0 = call double @llvm.maxnum.f64(double %a, double %b)
+  ret double %0
+}
+
+declare half @llvm.maxnum.f16(half, half)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/CodeGen/DirectX/fmin.ll b/llvm/test/CodeGen/DirectX/fmin.ll
new file mode 100644
index 0000000000000..2f7c209f0278a
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/fmin.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for fmin are generated for half/float/double.
+
+; CHECK-LABEL:test_fmin_half
+define noundef half @test_fmin_half(half noundef %a, half noundef %b) {
+entry:
+; CHECK: call half @dx.op.binary.f16(i32 36, half %{{.*}}, half %{{.*}})
+  %0 = call half @llvm.minnum.f16(half %a, half %b)
+  ret half %0
+}
+
+; CHECK-LABEL:test_fmin_float
+define noundef float @test_fmin_float(float noundef %a, float noundef %b) {
+entry:
+; CHECK: call float @dx.op.binary.f32(i32 36, float %{{.*}}, float %{{.*}})
+  %0 = call float @llvm.minnum.f32(float %a, float %b)
+  ret float %0
+}
+
+; CHECK-LABEL:test_fmin_double
+define noundef double @test_fmin_double(double noundef %a, double noundef %b) {
+entry:
+; CHECK: call double @dx.op.binary.f64(i32 36, double %{{.*}}, double %{{.*}})
+  %0 = call double @llvm.minnum.f64(double %a, double %b)
+  ret double %0
+}
+
+declare half @llvm.minnum.f16(half, half)
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
diff --git a/llvm/test/CodeGen/DirectX/idot.ll b/llvm/test/CodeGen/DirectX/idot.ll
new file mode 100644
index 0000000000000..9f89a8d6d340d
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/idot.ll
@@ -0,0 +1,100 @@
+; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S  -dxil-op-lower  < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+
+; Make sure dxil operation function calls for dot are generated for int/uint vectors.
+
+; CHECK-LABEL: dot_int16_t2
+define noundef i16 @dot_int16_t2(<2 x i16> noundef %a, <2 x i16> noundef %b) {
+entry:
+; CHECK: extractelement <2 x i16> %a, i64 0
+; CHECK: extractelement <2 x i16> %b, i64 0
+; CHECK: mul i16 %{{.*}}, %{{.*}}
+; CHECK: extractelement <2 x i16> %a, i64 1
+; CHECK: extractelement <2 x i16> %b, i64 1
+; EXPCHECK: call i16 @llvm.dx.imad.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; DOPCHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+  %dx.dot = call i16 @llvm.dx.sdot.v3i16(<2 x i16> %a, <2 x i16> %b)
+  ret i16 %dx.dot
+}
+
+; CHECK-LABEL: sdot_int4
+define noundef i32 @sdot_int4(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+entry:
+; CHECK: extractelement <4 x i32> %a, i64 0
+; CHECK: extractelement <4 x i32> %b, i64 0
+; CHECK: mul i32 %{{.*}}, %{{.*}}
+; CHECK: extractelement <4 x i32> %a, i64 1
+; CHECK: extractelement <4 x i32> %b, i64 1
+; EXPCHECK: call i32 @llvm.dx.imad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: extractelement <4 x i32> %a, i64 2
+; CHECK: extractelement <4 x i32> %b, i64 2
+; EXPCHECK: call i32 @llvm.dx.imad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: extractelement <4 x i32> %a, i64 3
+; CHECK: extractelement <4 x i32> %b, i64 3
+; EXPCHECK: call i32 @llvm.dx.imad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  %dx.dot = call i32 @llvm.dx.sdot.v4i32(<4 x i32> %a, <4 x i32> %b)
+  ret i32 %dx.dot
+}
+
+; CHECK-LABEL: dot_uint16_t3
+define noundef i16 @dot_uint16_t3(<3 x i16> noundef %a, <3 x i16> noundef %b) {
+entry:
+; CHECK: extractelement <3 x i16> %a, i64 0
+; CHECK: extractelement <3 x i16> %b, i64 0
+; CHECK: mul i16 %{{.*}}, %{{.*}}
+; CHECK: extractelement <3 x i16> %a, i64 1
+; CHECK: extractelement <3 x i16> %b, i64 1
+; EXPCHECK: call i16 @llvm.dx.umad.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; DOPCHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; CHECK: extractelement <3 x i16> %a, i64 2
+; CHECK: extractelement <3 x i16> %b, i64 2
+; EXPCHECK: call i16 @llvm.dx.umad.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; DOPCHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+  %dx.dot = call i16 @llvm.dx.udot.v3i16(<3 x i16> %a, <3 x i16> %b)
+  ret i16 %dx.dot
+}
+
+; CHECK-LABEL: dot_uint4
+define noundef i32 @dot_uint4(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+entry:
+; CHECK: extractelement <4 x i32> %a, i64 0
+; CHECK: extractelement <4 x i32> %b, i64 0
+; CHECK: mul i32 %{{.*}}, %{{.*}}
+; CHECK: extractelement <4 x i32> %a, i64 1
+; CHECK: extractelement <4 x i32> %b, i64 1
+; EXPCHECK: call i32 @llvm.dx.umad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: extractelement <4 x i32> %a, i64 2
+; CHECK: extractelement <4 x i32> %b, i64 2
+; EXPCHECK: call i32 @llvm.dx.umad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: extractelement <4 x i32> %a, i64 3
+; CHECK: extractelement <4 x i32> %b, i64 3
+; EXPCHECK: call i32 @llvm.dx.umad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  %dx.dot = call i32 @llvm.dx.udot.v4i32(<4 x i32> %a, <4 x i32> %b)
+  ret i32 %dx.dot
+}
+
+; CHECK-LABEL: dot_uint64_t4
+define noundef i64 @dot_uint64_t4(<2 x i64> noundef %a, <2 x i64> noundef %b) {
+entry:
+; CHECK: extractelement <2 x i64> %a, i64 0
+; CHECK: extractelement <2 x i64> %b, i64 0
+; CHECK: mul i64 %{{.*}}, %{{.*}}
+; CHECK: extractelement <2 x i64> %a, i64 1
+; CHECK: extractelement <2 x i64> %b, i64 1
+; EXPCHECK: call i64 @llvm.dx.umad.i64(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+; DOPCHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+  %dx.dot = call i64 @llvm.dx.udot.v2i64(<2 x i64> %a, <2 x i64> %b)
+  ret i64 %dx.dot
+}
+
+declare i16 @llvm.dx.sdot.v2i16(<2 x i16>, <2 x i16>)
+declare i32 @llvm.dx.sdot.v4i32(<4 x i32>, <4 x i32>)
+declare i16 @llvm.dx.udot.v3i32(<3 x i16>, <3 x i16>)
+declare i32 @llvm.dx.udot.v4i32(<4 x i32>, <4 x i32>)
+declare i64 @llvm.dx.udot.v2i64(<2 x i64>, <2 x i64>)
diff --git a/llvm/test/CodeGen/DirectX/lerp.ll b/llvm/test/CodeGen/DirectX/lerp.ll
new file mode 100644
index 0000000000000..ebd7e133b5163
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/lerp.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for lerp are generated for float and half.
+
+; CHECK-LABEL: lerp_half
+; CHECK: fsub half %{{.*}}, %{{.*}}
+; CHECK: fmul half %{{.*}}, %{{.*}}
+; CHECK: fadd half %{{.*}}, %{{.*}}
+define noundef half @lerp_half(half noundef %p0) {
+entry:
+  %p0.addr = alloca half, align 2
+  store half %p0, ptr %p0.addr, align 2
+  %0 = load half, ptr %p0.addr, align 2
+  %1 = load half, ptr %p0.addr, align 2
+  %2 = load half, ptr %p0.addr, align 2
+  %dx.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
+  ret half %dx.lerp
+}
+
+; CHECK-LABEL: lerp_float
+; CHECK: fsub float %{{.*}}, %{{.*}}
+; CHECK: fmul float %{{.*}}, %{{.*}}
+; CHECK: fadd float %{{.*}}, %{{.*}}
+define noundef float @lerp_float(float noundef %p0, float noundef %p1) {
+entry:
+  %p1.addr = alloca float, align 4
+  %p0.addr = alloca float, align 4
+  store float %p1, ptr %p1.addr, align 4
+  store float %p0, ptr %p0.addr, align 4
+  %0 = load float, ptr %p0.addr, align 4
+  %1 = load float, ptr %p0.addr, align 4
+  %2 = load float, ptr %p0.addr, align 4
+  %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
+  ret float %dx.lerp
+}
+
+; CHECK-LABEL: lerp_float4
+; CHECK: fsub <4 x float> %{{.*}}, %{{.*}}
+; CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
+; CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
+define noundef <4 x float> @lerp_float4(<4 x float> noundef %p0, <4 x float> noundef %p1) {
+entry:
+  %p1.addr = alloca <4 x float>, align 16
+  %p0.addr = alloca <4 x float>, align 16
+  store <4 x float> %p1, ptr %p1.addr, align 16
+  store <4 x float> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x float>, ptr %p0.addr, align 16
+  %1 = load <4 x float>, ptr %p0.addr, align 16
+  %2 = load <4 x float>, ptr %p0.addr, align 16
+  %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+  ret <4 x float> %dx.lerp
+}
+
+declare half @llvm.dx.lerp.f16(half, half, half)
+declare float @llvm.dx.lerp.f32(float, float, float)
+declare <4 x float> @llvm.dx.lerp.v4f32(<4 x float>, <4 x float>, <4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/lib_entry.ll b/llvm/test/CodeGen/DirectX/lib_entry.ll
index 9208d6d3f3224..5254a08805588 100644
--- a/llvm/test/CodeGen/DirectX/lib_entry.ll
+++ b/llvm/test/CodeGen/DirectX/lib_entry.ll
@@ -7,7 +7,7 @@ target triple = "dxil-unknown-shadermodel6.7-library"
 ; Make sure generate empty entry for lib profile.
 ;CHECK:![[empty_entry]] = !{null, !"", null, null, ![[shader_flags:[0-9]+]]}
 ; Make sure double is marked for shader flags.
-;CHECK:![[shader_flags]] = !{i32 0, i64 1}
+;CHECK:![[shader_flags]] = !{i32 0, i64 4}
 ;CHECK:![[entry]] = !{ptr @entry, !"entry", null, null, ![[extra:[0-9]+]]}
 ;CHECK:![[extra]] = !{i32 8, i32 5, i32 4, ![[numthreads:[0-9]+]]}
 ;CHECK:![[numthreads]] = !{i32 1, i32 2, i32 1}
diff --git a/llvm/test/CodeGen/DirectX/rcp.ll b/llvm/test/CodeGen/DirectX/rcp.ll
new file mode 100644
index 0000000000000..65abe832db53f
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rcp.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for rcp are generated for float, double, and half.
+
+; CHECK-LABEL: rcp_float4
+; CHECK: fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %{{.*}}
+define noundef <4 x float> @rcp_float4(<4 x float> noundef %p0) {
+entry:
+  %p0.addr = alloca <4 x float>, align 16
+  store <4 x float> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x float>, ptr %p0.addr, align 16
+  %dx.rcp = call <4 x float> @llvm.dx.rcp.v4f32(<4 x float> %0)
+  ret <4 x float> %dx.rcp
+}
+
+; CHECK-LABEL: rcp_double4
+; CHECK: fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %{{.*}}
+define noundef <4 x double> @rcp_double4(<4 x double> noundef %p0) {
+entry:
+  %p0.addr = alloca <4 x double>, align 16
+  store <4 x double> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x double>, ptr %p0.addr, align 16
+  %dx.rcp = call <4 x double> @llvm.dx.rcp.v4f64(<4 x double> %0)
+  ret <4 x double> %dx.rcp
+}
+
+; CHECK-LABEL: rcp_half4
+; CHECK: fdiv <4 x half> <half  0xH3C00, half  0xH3C00, half  0xH3C00, half  0xH3C00>, %{{.*}} 
+define noundef <4 x half> @rcp_half4(<4 x half> noundef %p0) {
+entry:
+  %p0.addr = alloca <4 x half>, align 16
+  store <4 x half> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x half>, ptr %p0.addr, align 16
+  %dx.rcp = call <4 x half> @llvm.dx.rcp.v4f16(<4 x half> %0)
+  ret <4 x half> %dx.rcp
+}
+
+; CHECK-LABEL: rcp_half
+; CHECK: fdiv half 0xH3C00, %{{.*}} 
+define noundef half @rcp_half(half noundef %p0) {
+entry:
+  %p0.addr = alloca half, align 2
+  store half %p0, ptr %p0.addr, align 2
+  %0 = load half, ptr %p0.addr, align 2
+  %dx.rcp = call half @llvm.dx.rcp.f16(half %0)
+  ret half %dx.rcp
+}
+
+declare half @llvm.dx.rcp.f16(half)
+declare <4 x half> @llvm.dx.rcp.v4f16(<4 x half>)
+declare <4 x float> @llvm.dx.rcp.v4f32(<4 x float>)
+declare <4 x double> @llvm.dx.rcp.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/DirectX/rsqrt.ll b/llvm/test/CodeGen/DirectX/rsqrt.ll
new file mode 100644
index 0000000000000..52af0e62220b3
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rsqrt.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for rsqrt are generated for float and half.
+
+; CHECK-LABEL: rsqrt_float
+; CHECK: call float @dx.op.unary.f32(i32 25, float %{{.*}})
+define noundef float @rsqrt_float(float noundef %a) {
+entry:
+  %a.addr = alloca float, align 4
+  store float %a, ptr %a.addr, align 4
+  %0 = load float, ptr %a.addr, align 4
+  %dx.rsqrt = call float @llvm.dx.rsqrt.f32(float %0)
+  ret float %dx.rsqrt
+}
+
+; CHECK-LABEL: rsqrt_half
+; CHECK: call half @dx.op.unary.f16(i32 25, half %{{.*}})
+define noundef half @rsqrt_half(half noundef %a) {
+entry:
+  %a.addr = alloca half, align 2
+  store half %a, ptr %a.addr, align 2
+  %0 = load half, ptr %a.addr, align 2
+  %dx.rsqrt = call half @llvm.dx.rsqrt.f16(half %0)
+  ret half %dx.rsqrt
+}
+
+declare half @llvm.dx.rsqrt.f16(half)
+declare float @llvm.dx.rsqrt.f32(float)
diff --git a/llvm/test/CodeGen/DirectX/rsqrt_error.ll b/llvm/test/CodeGen/DirectX/rsqrt_error.ll
new file mode 100644
index 0000000000000..9cd5002c20f7e
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rsqrt_error.ll
@@ -0,0 +1,14 @@
+; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+
+; DXIL operation rsqrt does not support double overload type
+; CHECK: LLVM ERROR: Invalid Overload Type
+
+; Function Attrs: noinline nounwind optnone
+define noundef double @rsqrt_double(double noundef %a) #0 {
+entry:
+  %a.addr = alloca double, align 8
+  store double %a, ptr %a.addr, align 8
+  %0 = load double, ptr %a.addr, align 8
+  %dx.rsqrt = call double @llvm.dx.rsqrt.f64(double %0)
+  ret double %dx.rsqrt
+}
diff --git a/llvm/test/CodeGen/DirectX/smax.ll b/llvm/test/CodeGen/DirectX/smax.ll
new file mode 100644
index 0000000000000..8b2406782c093
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/smax.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for smax are generated for i16/i32/i64.
+
+; CHECK-LABEL:test_smax_i16
+define noundef i16 @test_smax_i16(i16 noundef %a, i16 noundef %b) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 37, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.smax.i16(i16 %a, i16 %b)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_smax_i32
+define noundef i32 @test_smax_i32(i32 noundef %a, i32 noundef %b) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 37, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_smax_i64
+define noundef i64 @test_smax_i64(i64 noundef %a, i64 noundef %b) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 37, i64 %{{.*}}, i64 %{{.*}})
+  %0 = call i64 @llvm.smax.i64(i64 %a, i64 %b)
+  ret i64 %0
+}
+
+declare i16 @llvm.smax.i16(i16, i16)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i64 @llvm.smax.i64(i64, i64)
diff --git a/llvm/test/CodeGen/DirectX/smin.ll b/llvm/test/CodeGen/DirectX/smin.ll
new file mode 100644
index 0000000000000..b2b40a1b62433
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/smin.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for smin are generated for i16/i32/i64.
+
+; CHECK-LABEL:test_smin_i16
+define noundef i16 @test_smin_i16(i16 noundef %a, i16 noundef %b) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 38, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.smin.i16(i16 %a, i16 %b)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_smin_i32
+define noundef i32 @test_smin_i32(i32 noundef %a, i32 noundef %b) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 38, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.smin.i32(i32 %a, i32 %b)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_smin_i64
+define noundef i64 @test_smin_i64(i64 noundef %a, i64 noundef %b) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 38, i64 %{{.*}}, i64 %{{.*}})
+  %0 = call i64 @llvm.smin.i64(i64 %a, i64 %b)
+  ret i64 %0
+}
+
+declare i16 @llvm.smin.i16(i16, i16)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i64 @llvm.smin.i64(i64, i64)
diff --git a/llvm/test/CodeGen/DirectX/umax.ll b/llvm/test/CodeGen/DirectX/umax.ll
index c7b6a87599279..be0f557fc8da6 100644
--- a/llvm/test/CodeGen/DirectX/umax.ll
+++ b/llvm/test/CodeGen/DirectX/umax.ll
@@ -1,30 +1,31 @@
 ; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
 
-; Make sure dxil operation function calls for umax are generated for i32/i64.
+; Make sure dxil operation function calls for umax are generated for i16/i32/i64.
 
-target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-target triple = "dxil-pc-shadermodel6.7-library"
+; CHECK-LABEL:test_umax_i16
+define noundef i16 @test_umax_i16(i16 noundef %a, i16 noundef %b) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 39, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.umax.i16(i16 %a, i16 %b)
+  ret i16 %0
+}
 
 ; CHECK-LABEL:test_umax_i32
-; Function Attrs: noinline nounwind optnone
-define noundef i32 @test_umax_i32(i32 noundef %a, i32 noundef %b) #0 {
+define noundef i32 @test_umax_i32(i32 noundef %a, i32 noundef %b) {
 entry:
-; CHECK:call i32 @dx.op.binary.i32(i32 39, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: call i32 @dx.op.binary.i32(i32 39, i32 %{{.*}}, i32 %{{.*}})
   %0 = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   ret i32 %0
 }
 
 ; CHECK-LABEL:test_umax_i64
-define noundef i64 @test_umax_i64(i64 noundef %a, i64 noundef %b) #0 {
+define noundef i64 @test_umax_i64(i64 noundef %a, i64 noundef %b) {
 entry:
-; CHECK:call i64 @dx.op.binary.i64(i32 39, i64 %{{.*}}, i64 %{{.*}})
+; CHECK: call i64 @dx.op.binary.i64(i32 39, i64 %{{.*}}, i64 %{{.*}})
   %0 = call i64 @llvm.umax.i64(i64 %a, i64 %b)
   ret i64 %0
 }
 
-; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
-declare i32 @llvm.umax.i32(i32, i32) #1
-declare i64 @llvm.umax.i64(i64, i64) #1
-
-attributes #0 = { noinline nounwind }
-attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
+declare i16 @llvm.umax.i16(i16, i16)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i64 @llvm.umax.i64(i64, i64)
diff --git a/llvm/test/CodeGen/DirectX/umin.ll b/llvm/test/CodeGen/DirectX/umin.ll
new file mode 100644
index 0000000000000..5051c71174489
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/umin.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for umin are generated for i16/i32/i64.
+
+; CHECK-LABEL:test_umin_i16
+define noundef i16 @test_umin_i16(i16 noundef %a, i16 noundef %b) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 40, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.umin.i16(i16 %a, i16 %b)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_umin_i32
+define noundef i32 @test_umin_i32(i32 noundef %a, i32 noundef %b) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 40, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.umin.i32(i32 %a, i32 %b)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_umin_i64
+define noundef i64 @test_umin_i64(i64 noundef %a, i64 noundef %b) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 40, i64 %{{.*}}, i64 %{{.*}})
+  %0 = call i64 @llvm.umin.i64(i64 %a, i64 %b)
+  ret i64 %0
+}
+
+declare i16 @llvm.umin.i16(i16, i16)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i64 @llvm.umin.i64(i64, i64)
diff --git a/llvm/test/CodeGen/Generic/ForceStackAlign.ll b/llvm/test/CodeGen/Generic/ForceStackAlign.ll
index 2c35ad350f047..7993b3eff65b6 100644
--- a/llvm/test/CodeGen/Generic/ForceStackAlign.ll
+++ b/llvm/test/CodeGen/Generic/ForceStackAlign.ll
@@ -8,7 +8,7 @@
 ; Stack realignment not supported.
 ; XFAIL: target=sparc{{.*}}
 
-; NVPTX cannot select dynamic_stackalloc
+; NVPTX can only select dynamic_stackalloc on sm_52+ and with ptx73+
 ; XFAIL: target=nvptx{{.*}}
 
 define i32 @f(ptr %p) nounwind {
diff --git a/llvm/test/CodeGen/Generic/gc-lowering.ll b/llvm/test/CodeGen/Generic/gc-lowering.ll
new file mode 100644
index 0000000000000..fa2e92ab495f4
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/gc-lowering.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes='require<collector-metadata>,function(gc-lowering)' < %s | FileCheck %s
+
+declare ptr @llvm_gc_allocate(i32)
+declare void @llvm_gc_initialize(i32)
+
+declare void @llvm.gcroot(ptr, ptr)
+declare void @llvm.gcwrite(ptr, ptr, ptr)
+
+define i32 @main() gc "shadow-stack" {
+; CHECK-LABEL: define i32 @main() gc "shadow-stack" {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr null, ptr [[A]], align 8
+; CHECK-NEXT:    [[B:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr null, ptr [[B]], align 8
+; CHECK-NEXT:    call void @llvm_gc_initialize(i32 1048576)
+; CHECK-NEXT:    call void @llvm.gcroot(ptr [[A]], ptr null)
+; CHECK-NEXT:    [[APTR:%.*]] = call ptr @llvm_gc_allocate(i32 10)
+; CHECK-NEXT:    store ptr [[APTR]], ptr [[A]], align 8
+; CHECK-NEXT:    call void @llvm.gcroot(ptr [[B]], ptr null)
+; CHECK-NEXT:    [[B_UPGRD_1:%.*]] = call ptr @llvm_gc_allocate(i32 8)
+; CHECK-NEXT:    store ptr [[B_UPGRD_1]], ptr [[B]], align 8
+; CHECK-NEXT:    [[B_1:%.*]] = load ptr, ptr [[B]], align 8
+; CHECK-NEXT:    [[A_1:%.*]] = load ptr, ptr [[A]], align 8
+; CHECK-NEXT:    store ptr [[A_1]], ptr [[B_1]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %A = alloca ptr
+  %B = alloca ptr
+
+  call void @llvm_gc_initialize(i32 1048576)  ; Start with 1MB heap
+
+  ;; ptr A;
+  call void @llvm.gcroot(ptr %A, ptr null)
+
+  ;; A = gcalloc(10);
+  %Aptr = call ptr @llvm_gc_allocate(i32 10)
+  store ptr %Aptr, ptr %A
+
+  ;; ptr B;
+  call void @llvm.gcroot(ptr %B, ptr null)
+
+  ;; B = gcalloc(4);
+  %B.upgrd.1 = call ptr @llvm_gc_allocate(i32 8)
+  store ptr %B.upgrd.1, ptr %B
+
+  ;; *B = A;
+  %B.1 = load ptr, ptr %B
+  %A.1 = load ptr, ptr %A
+  call void @llvm.gcwrite(ptr %A.1, ptr %B.upgrd.1, ptr %B.1)
+
+  ret i32 0
+}
+
+define void @no_gc() {
+; CHECK-LABEL: define void @no_gc() {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/build-attributes.ll b/llvm/test/CodeGen/Hexagon/build-attributes.ll
new file mode 100644
index 0000000000000..48ee31a4d19bd
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/build-attributes.ll
@@ -0,0 +1,16 @@
+;; Generate build attributes from llc.
+
+; RUN: llc -mtriple=hexagon-unknown-elf \
+; RUN:  -mattr=+hvxv73,+cabac,+v71,+hvx-ieee-fp,+hvx-length128b %s -o - | FileCheck  %s
+
+;      CHECK: .attribute      4, 71  // Tag_arch
+; CHECK-NEXT: .attribute      5, 73  // Tag_hvx_arch
+; CHECK-NEXT: .attribute      6, 1   // Tag_hvx_ieeefp
+; CHECK-NEXT: .attribute      7, 1   // Tag_hvx_qfloat
+; CHECK-NEXT: .attribute      8, 1   // Tag_zreg
+; CHECK-NEXT: .attribute      10, 1   // Tag_cabac
+
+define i32 @addi(i32 %a) {
+  %1 = add i32 %a, 1
+  ret i32 %1
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/Hexagon/livephysregs-regmask-clobber.mir b/llvm/test/CodeGen/Hexagon/livephysregs-regmask-clobber.mir
index 8f1cb42b96a6f..52213070f5356 100644
--- a/llvm/test/CodeGen/Hexagon/livephysregs-regmask-clobber.mir
+++ b/llvm/test/CodeGen/Hexagon/livephysregs-regmask-clobber.mir
@@ -17,6 +17,8 @@
 
 name: f0
 tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, offset: 0, size: 128, alignment: 128 }
   - { id: 1, offset: 128, size: 128, alignment: 128 }
diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid4.mir b/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid4.mir
deleted file mode 100644
index d66dd10448692..0000000000000
--- a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid4.mir
+++ /dev/null
@@ -1,10 +0,0 @@
-# RUN: not llc -mtriple=aarch64-- -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
-# When a low-level type is 0 bits
----
-name: test_scalar_size_0
-body: |
-  bb.0:
-    liveins: $x0
-    ; CHECK: [[@LINE+1]]:10: invalid size for scalar type
-    %0:_(s0) = G_IMPLICIT_DEF
-...
diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir b/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir
index 698568701fb1a..632e5fa81db11 100644
--- a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir
+++ b/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir
@@ -5,6 +5,6 @@ name: test_vector_element_size_0
 body: |
   bb.0:
     liveins: $x0
-    ; CHECK: [[@LINE+1]]:15: invalid size for scalar type
+    ; CHECK: [[@LINE+1]]:15: invalid size for scalar element in vector
     %0:_(<2 x s0>) = G_IMPLICIT_DEF
 ...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
index e40d1879399ce..9831f786b847d 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
@@ -29,6 +29,8 @@ liveins:
   - { reg: '$vgpr0', virtual-reg: '' }
   - { reg: '$vgpr1', virtual-reg: '' }
   - { reg: '$vgpr2', virtual-reg: '' }
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, name: '', type: spill-slot, offset: 0, size: 8, alignment: 4,
       stack-id: sgpr-spill, callee-saved-register: '', callee-saved-restored: true,
diff --git a/llvm/test/CodeGen/Mips/avoid-zero-copy.mir b/llvm/test/CodeGen/Mips/avoid-zero-copy.mir
index 5c7cffd109ea6..e3990bdf9bc3f 100644
--- a/llvm/test/CodeGen/Mips/avoid-zero-copy.mir
+++ b/llvm/test/CodeGen/Mips/avoid-zero-copy.mir
@@ -19,6 +19,8 @@
 ...
 ---
 name:            a
+frameInfo:
+  adjustsStack:    true
 body:             |
   bb.0 (%ir-block.0):
     liveins: $a0_64, $t9_64, $ra_64, $fp_64, $gp_64
diff --git a/llvm/test/CodeGen/Mips/msa/emergency-spill.mir b/llvm/test/CodeGen/Mips/msa/emergency-spill.mir
index e1c7b2158d617..2089464528661 100644
--- a/llvm/test/CodeGen/Mips/msa/emergency-spill.mir
+++ b/llvm/test/CodeGen/Mips/msa/emergency-spill.mir
@@ -90,7 +90,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    16
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index 6a27c9f5dac9b..45c7ab980edd6 100644
--- a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -405,8 +405,9 @@ define void @uitofp(i32 %a) {
 ; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(uitofp)))
 ; MIPS64-N32-NEXT:    lui $2, 17200
 ; MIPS64-N32-NEXT:    sw $2, 12($sp)
-; MIPS64-N32-NEXT:    sll $2, $4, 0
-; MIPS64-N32-NEXT:    sw $2, 8($sp)
+; MIPS64R5-N32-NEXT:    sll $2, $4, 0
+; MIPS64R5-N32-NEXT:    sw $2, 8($sp)
+; MIPSR6-N32-NEXT:    sw $4, 8($sp)
 ; MIPS64-N32-NEXT:    lw $2, %got_page(.LCPI5_0)($1)
 ; MIPS64-N32-NEXT:    ldc1 $f0, %got_ofst(.LCPI5_0)($2)
 ; MIPS64-N32-NEXT:    ldc1 $f1, 8($sp)
@@ -430,8 +431,9 @@ define void @uitofp(i32 %a) {
 ; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(uitofp)))
 ; MIPS64-N64-NEXT:    lui $2, 17200
 ; MIPS64-N64-NEXT:    sw $2, 12($sp)
-; MIPS64-N64-NEXT:    sll $2, $4, 0
-; MIPS64-N64-NEXT:    sw $2, 8($sp)
+; MIPS64R5-N64-NEXT:    sll $2, $4, 0
+; MIPS64R5-N64-NEXT:    sw $2, 8($sp)
+; MIPSR6-N64-NEXT:    sw $4, 8($sp)
 ; MIPS64-N64-NEXT:    ld $2, %got_page(.LCPI5_0)($1)
 ; MIPS64-N64-NEXT:    ldc1 $f0, %got_ofst(.LCPI5_0)($2)
 ; MIPS64-N64-NEXT:    ldc1 $f1, 8($sp)
diff --git a/llvm/test/CodeGen/Mips/no-unaligned-access-r6.ll b/llvm/test/CodeGen/Mips/no-unaligned-access-r6.ll
new file mode 100644
index 0000000000000..0695868f9c803
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/no-unaligned-access-r6.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+;; Test the strict-align feature which is similar to AArch64/arm64-strict-align.ll.
+
+; RUN: llc --mtriple=mipsisa32r6 < %s | FileCheck %s --check-prefix=MIPS32R6-UNALIGNED
+; RUN: llc --mtriple=mipsisa32r6 --mattr=-strict-align < %s | FileCheck %s --check-prefix=MIPS32R6-UNALIGNED
+; RUN: llc --mtriple=mipsisa32r6 --mattr=+strict-align < %s | FileCheck %s --check-prefix=MIPS32R6-ALIGNED
+
+; RUN: llc --mtriple=mipsisa64r6 < %s | FileCheck %s --check-prefix=MIPS64R6-UNALIGNED
+; RUN: llc --mtriple=mipsisa64r6 --mattr=-strict-align < %s | FileCheck %s --check-prefix=MIPS64R6-UNALIGNED
+; RUN: llc --mtriple=mipsisa64r6 --mattr=+strict-align < %s | FileCheck %s --check-prefix=MIPS64R6-ALIGNED
+
+define i32 @f0(ptr %p) nounwind {
+; MIPS32R6-UNALIGNED-LABEL: f0:
+; MIPS32R6-UNALIGNED:       # %bb.0:
+; MIPS32R6-UNALIGNED-NEXT:    lw $2, 0($4)
+; MIPS32R6-UNALIGNED-NEXT:    jrc $ra
+;
+; MIPS32R6-ALIGNED-LABEL: f0:
+; MIPS32R6-ALIGNED:       # %bb.0:
+; MIPS32R6-ALIGNED-NEXT:    lhu $1, 2($4)
+; MIPS32R6-ALIGNED-NEXT:    lhu $2, 0($4)
+; MIPS32R6-ALIGNED-NEXT:    sll $2, $2, 16
+; MIPS32R6-ALIGNED-NEXT:    jr $ra
+; MIPS32R6-ALIGNED-NEXT:    or $2, $2, $1
+;
+; MIPS64R6-UNALIGNED-LABEL: f0:
+; MIPS64R6-UNALIGNED:       # %bb.0:
+; MIPS64R6-UNALIGNED-NEXT:    lw $2, 0($4)
+; MIPS64R6-UNALIGNED-NEXT:    jrc $ra
+;
+; MIPS64R6-ALIGNED-LABEL: f0:
+; MIPS64R6-ALIGNED:       # %bb.0:
+; MIPS64R6-ALIGNED-NEXT:    lhu $1, 2($4)
+; MIPS64R6-ALIGNED-NEXT:    lhu $2, 0($4)
+; MIPS64R6-ALIGNED-NEXT:    sll $2, $2, 16
+; MIPS64R6-ALIGNED-NEXT:    jr $ra
+; MIPS64R6-ALIGNED-NEXT:    or $2, $2, $1
+  %tmp = load i32, ptr %p, align 2
+  ret i32 %tmp
+}
+
+define i64 @f1(ptr %p) nounwind {
+; MIPS32R6-UNALIGNED-LABEL: f1:
+; MIPS32R6-UNALIGNED:       # %bb.0:
+; MIPS32R6-UNALIGNED-NEXT:    lw $2, 0($4)
+; MIPS32R6-UNALIGNED-NEXT:    lw $3, 4($4)
+; MIPS32R6-UNALIGNED-NEXT:    jrc $ra
+;
+; MIPS32R6-ALIGNED-LABEL: f1:
+; MIPS32R6-ALIGNED:       # %bb.0:
+; MIPS32R6-ALIGNED-NEXT:    lw $2, 0($4)
+; MIPS32R6-ALIGNED-NEXT:    lw $3, 4($4)
+; MIPS32R6-ALIGNED-NEXT:    jrc $ra
+;
+; MIPS64R6-UNALIGNED-LABEL: f1:
+; MIPS64R6-UNALIGNED:       # %bb.0:
+; MIPS64R6-UNALIGNED-NEXT:    ld $2, 0($4)
+; MIPS64R6-UNALIGNED-NEXT:    jrc $ra
+;
+; MIPS64R6-ALIGNED-LABEL: f1:
+; MIPS64R6-ALIGNED:       # %bb.0:
+; MIPS64R6-ALIGNED-NEXT:    lwu $1, 4($4)
+; MIPS64R6-ALIGNED-NEXT:    lwu $2, 0($4)
+; MIPS64R6-ALIGNED-NEXT:    dsll $2, $2, 32
+; MIPS64R6-ALIGNED-NEXT:    jr $ra
+; MIPS64R6-ALIGNED-NEXT:    or $2, $2, $1
+  %tmp = load i64, ptr %p, align 4
+  ret i64 %tmp
+}
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
new file mode 100644
index 0000000000000..9cc45fbe313b7
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK64
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx62 | FileCheck %s --check-prefixes=CHECKPTX62
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %val) {
+; CHECK-LABEL: test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK-NEXT:    atom.add.noftz.f16 %rs2, [%r1], %rs1;
+; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
+; CHECK-NEXT:    mov.b16 %rs3, 0x3C00;
+; CHECK-NEXT:    atom.add.noftz.f16 %rs4, [%r1], %rs3;
+; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
+; CHECK-NEXT:    atom.global.add.noftz.f16 %rs5, [%r2], %rs1;
+; CHECK-NEXT:    atom.shared.add.noftz.f16 %rs6, [%r3], %rs1;
+; CHECK-NEXT:    ret;
+;
+; CHECK64-LABEL: test(
+; CHECK64:       {
+; CHECK64-NEXT:    .reg .b16 %rs<7>;
+; CHECK64-NEXT:    .reg .b64 %rd<4>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT:  // %bb.0:
+; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
+; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK64-NEXT:    atom.add.noftz.f16 %rs2, [%rd1], %rs1;
+; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
+; CHECK64-NEXT:    mov.b16 %rs3, 0x3C00;
+; CHECK64-NEXT:    atom.add.noftz.f16 %rs4, [%rd1], %rs3;
+; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
+; CHECK64-NEXT:    atom.global.add.noftz.f16 %rs5, [%rd2], %rs1;
+; CHECK64-NEXT:    atom.shared.add.noftz.f16 %rs6, [%rd3], %rs1;
+; CHECK64-NEXT:    ret;
+;
+; CHECKPTX62-LABEL: test(
+; CHECKPTX62:       {
+; CHECKPTX62-NEXT:    .reg .pred %p<5>;
+; CHECKPTX62-NEXT:    .reg .b16 %rs<19>;
+; CHECKPTX62-NEXT:    .reg .b32 %r<58>;
+; CHECKPTX62-EMPTY:
+; CHECKPTX62-NEXT:  // %bb.0:
+; CHECKPTX62-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECKPTX62-NEXT:    ld.param.u32 %r23, [test_param_2];
+; CHECKPTX62-NEXT:    ld.param.u32 %r22, [test_param_1];
+; CHECKPTX62-NEXT:    ld.param.u32 %r24, [test_param_0];
+; CHECKPTX62-NEXT:    and.b32 %r1, %r24, -4;
+; CHECKPTX62-NEXT:    and.b32 %r25, %r24, 3;
+; CHECKPTX62-NEXT:    shl.b32 %r2, %r25, 3;
+; CHECKPTX62-NEXT:    mov.b32 %r26, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX62-NEXT:    not.b32 %r3, %r27;
+; CHECKPTX62-NEXT:    ld.u32 %r54, [%r1];
+; CHECKPTX62-NEXT:  $L__BB0_1: // %atomicrmw.start
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r28, %r54, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs2, %r28;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs4, %rs2, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r29, %rs4;
+; CHECKPTX62-NEXT:    shl.b32 %r30, %r29, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r31, %r54, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r32, %r31, %r30;
+; CHECKPTX62-NEXT:    atom.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p1, %r6, %r54;
+; CHECKPTX62-NEXT:    mov.u32 %r54, %r6;
+; CHECKPTX62-NEXT:    @%p1 bra $L__BB0_1;
+; CHECKPTX62-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECKPTX62-NEXT:    ld.u32 %r55, [%r1];
+; CHECKPTX62-NEXT:  $L__BB0_3: // %atomicrmw.start9
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r33, %r55, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs6, %r33;
+; CHECKPTX62-NEXT:    mov.b16 %rs8, 0x3C00;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs9, %rs6, %rs8;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r34, %rs9;
+; CHECKPTX62-NEXT:    shl.b32 %r35, %r34, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r36, %r55, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX62-NEXT:    atom.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p2, %r9, %r55;
+; CHECKPTX62-NEXT:    mov.u32 %r55, %r9;
+; CHECKPTX62-NEXT:    @%p2 bra $L__BB0_3;
+; CHECKPTX62-NEXT:  // %bb.4: // %atomicrmw.end8
+; CHECKPTX62-NEXT:    and.b32 %r10, %r22, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r38, %r22, 3;
+; CHECKPTX62-NEXT:    and.b32 %r11, %r38, 24;
+; CHECKPTX62-NEXT:    shl.b32 %r40, %r26, %r11;
+; CHECKPTX62-NEXT:    not.b32 %r12, %r40;
+; CHECKPTX62-NEXT:    ld.global.u32 %r56, [%r10];
+; CHECKPTX62-NEXT:  $L__BB0_5: // %atomicrmw.start27
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r41, %r56, %r11;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs11, %r41;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs13, %rs11, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r42, %rs13;
+; CHECKPTX62-NEXT:    shl.b32 %r43, %r42, %r11;
+; CHECKPTX62-NEXT:    and.b32 %r44, %r56, %r12;
+; CHECKPTX62-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX62-NEXT:    atom.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p3, %r15, %r56;
+; CHECKPTX62-NEXT:    mov.u32 %r56, %r15;
+; CHECKPTX62-NEXT:    @%p3 bra $L__BB0_5;
+; CHECKPTX62-NEXT:  // %bb.6: // %atomicrmw.end26
+; CHECKPTX62-NEXT:    and.b32 %r16, %r23, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r46, %r23, 3;
+; CHECKPTX62-NEXT:    and.b32 %r17, %r46, 24;
+; CHECKPTX62-NEXT:    shl.b32 %r48, %r26, %r17;
+; CHECKPTX62-NEXT:    not.b32 %r18, %r48;
+; CHECKPTX62-NEXT:    ld.shared.u32 %r57, [%r16];
+; CHECKPTX62-NEXT:  $L__BB0_7: // %atomicrmw.start45
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r49, %r57, %r17;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs15, %r49;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs17, %rs15, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r50, %rs17;
+; CHECKPTX62-NEXT:    shl.b32 %r51, %r50, %r17;
+; CHECKPTX62-NEXT:    and.b32 %r52, %r57, %r18;
+; CHECKPTX62-NEXT:    or.b32 %r53, %r52, %r51;
+; CHECKPTX62-NEXT:    atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p4, %r21, %r57;
+; CHECKPTX62-NEXT:    mov.u32 %r57, %r21;
+; CHECKPTX62-NEXT:    @%p4 bra $L__BB0_7;
+; CHECKPTX62-NEXT:  // %bb.8: // %atomicrmw.end44
+; CHECKPTX62-NEXT:    ret;
+  %r1 = atomicrmw fadd ptr %dp0, half %val seq_cst
+  %r2 = atomicrmw fadd ptr %dp0, half 1.0 seq_cst
+  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val seq_cst
+  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val seq_cst
+  ret void
+}
+
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index e99d0fd05e346..6f2b5dcf47f13 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -175,6 +175,13 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
   ret float %ret
 }
 
+; CHECK-LABEL: atomicrmw_add_f16_generic
+define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
+; CHECK: atom.cas
+  %ret = atomicrmw fadd ptr %addr, half %val seq_cst
+  ret half %ret
+}
+
 ; CHECK-LABEL: atomicrmw_add_f32_addrspace1
 define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
 ; CHECK: atom.global.add.f32
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
new file mode 100644
index 0000000000000..3f929ec6a75d0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i16 @bswap16(i16 %a) {
+; CHECK-LABEL: bswap16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [bswap16_param_0];
+; CHECK-NEXT:    shr.u16 %rs2, %rs1, 8;
+; CHECK-NEXT:    shl.b16 %rs3, %rs1, 8;
+; CHECK-NEXT:    or.b16 %rs4, %rs3, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+  %b = tail call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %b
+}
+
+
+define i32 @bswap32(i32 %a) {
+; CHECK-LABEL: bswap32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bswap32_param_0];
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT:    ret;
+  %b = tail call i32 @llvm.bswap.i32(i32 %a)
+  ret i32 %b
+}
+
+
+define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
+; CHECK-LABEL: bswapv2i16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bswapv2i16_param_0];
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 8961;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT:    ret;
+  %b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
+  ret <2 x i16> %b
+}
+
+define i64 @bswap64(i64 %a) {
+; CHECK-LABEL: bswap64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [bswap64_param_0];
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
+; CHECK-NEXT:    prmt.b32 %r4, %r3, 0, 291;
+; CHECK-NEXT:    mov.b64 %rd2, {%r4, %r2};
+; CHECK-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; CHECK-NEXT:    ret;
+  %b = tail call i64 @llvm.bswap.i64(i64 %a)
+  ret i64 %b
+}
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
+declare i64 @llvm.bswap.i64(i64)
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 3ef55ca5309f8..09297fb819ce5 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -1,10 +1,44 @@
-; RUN: not llc -march=nvptx < %s 2>&1 | FileCheck %s
-; RUN: not llc -march=nvptx64 < %s 2>&1 | FileCheck %s
+; RUN: not llc < %s -march=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
+; RUN: not llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: dynamic alloca unsupported by NVPTX backend
+; RUN: llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32
+; RUN: llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
 
-define void @test_dynamic_stackalloc(i64 %n) {
-  %alloca = alloca i32, i64 %n
-  store volatile i32 0, ptr %alloca
-  ret void
+; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.
+
+; CHECK-LABEL: .visible .func  (.param .b32 func_retval0) test_dynamic_stackalloc(
+; CHECK-NOT: __local_depot
+
+; CHECK-32:       ld.param.u32  %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-32-NEXT:  mad.lo.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 1, 7;
+; CHECK-32-NEXT:  and.b32         %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
+; CHECK-32-NEXT:  alloca.u32  %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
+; CHECK-32-NEXT:  cvta.local.u32  %r[[ALLOCA]], %r[[ALLOCA]];
+; CHECK-32-NEXT:  { // callseq 0, 0
+; CHECK-32-NEXT:  .reg .b32 temp_param_reg;
+; CHECK-32-NEXT:  .param .b32 param0;
+; CHECK-32-NEXT:  st.param.b32  [param0+0], %r[[ALLOCA]];
+
+; CHECK-64:       ld.param.u64  %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-64-NEXT:  add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
+; CHECK-64-NEXT:  and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
+; CHECK-64-NEXT:  alloca.u64  %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
+; CHECK-64-NEXT:  cvta.local.u64  %rd[[ALLOCA]], %rd[[ALLOCA]];
+; CHECK-64-NEXT:  { // callseq 0, 0
+; CHECK-64-NEXT:  .reg .b32 temp_param_reg;
+; CHECK-64-NEXT:  .param .b64 param0;
+; CHECK-64-NEXT:  st.param.b64  [param0+0], %rd[[ALLOCA]];
+
+; CHECK-NEXT:     .param .b32 retval0;
+; CHECK-NEXT:     call.uni (retval0),
+; CHECK-NEXT:     bar,
+
+define i32 @test_dynamic_stackalloc(i64 %n) {
+  %alloca = alloca i8, i64 %n, align 16
+  %call = call i32 @bar(ptr %alloca)
+  ret i32 %call
 }
+
+declare i32 @bar(ptr)
diff --git a/llvm/test/CodeGen/PowerPC/aix-codemodel-attr.ll b/llvm/test/CodeGen/PowerPC/aix-codemodel-attr.ll
new file mode 100644
index 0000000000000..ef1156e338a89
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-codemodel-attr.ll
@@ -0,0 +1,166 @@
+; RUN: llc --verify-machineinstrs -mtriple powerpc-ibm-aix --code-model=small < \
+; RUN: %s | FileCheck --check-prefixes=CHECK,CHECK32,CHECK-SMALL,CHECK-SMALL32 %s
+
+; RUN: llc --verify-machineinstrs -mtriple powerpc-ibm-aix --code-model=large < \
+; RUN: %s | FileCheck --check-prefixes=CHECK,CHECK32,CHECK-LARGE,CHECK-LARGE32 %s
+
+; RUN: llc --verify-machineinstrs -mtriple powerpc64-ibm-aix --code-model=small < \
+; RUN: %s | FileCheck --check-prefixes=CHECK,CHECK64,CHECK-SMALL,CHECK-SMALL64 %s
+
+; RUN: llc --verify-machineinstrs -mtriple powerpc64-ibm-aix --code-model=large < \
+; RUN: %s | FileCheck --check-prefixes=CHECK,CHECK64,CHECK-LARGE,CHECK-LARGE64 %s
+
+@a = external dso_local global i32, code_model "small", align 4
+@b = external dso_local global i32, code_model "large", align 4
+@c = dso_local global i32 55, code_model "small", align 4
+@d = dso_local global i32 41, code_model "large", align 4
+@e = external dso_local global i32, align 4
+@f = dso_local global i32 2748, align 4
+
+@large_aliasee = global i32 10, code_model "large", align 4
+@small_aliasee = global i32 171, code_model "small", align 4
+@normal_aliasee = global i32 2748, align 4
+
+@al = alias i32, ptr @large_aliasee
+@as = alias i32, ptr @small_aliasee
+@an = alias i32, ptr @normal_aliasee
+
+define i32 @A() local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @a, align 4
+  ret i32 %0
+}
+; CHECK32:  lwz [[SCRATCH:[0-9]+]], L..C[[TL_A:[0-9]+]](2)                         # @a
+; CHECK64:  ld [[SCRATCH:[0-9]+]], L..C[[TL_A:[0-9]+]](2)                         # @a
+; CHECK:    lwz 3, 0([[SCRATCH]])
+; CHECK:    blr
+
+define i32 @B() local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @b, align 4
+  ret i32 %0
+}
+; CHECK:   addis [[HI:[0-9]+]], L..C[[TL_B:[0-9]+]]@u(2)
+; CHECK32: lwz [[ADDR:[0-9]+]], L..C[[TL_B]]@l([[HI]])
+; CHECK64: ld [[ADDR:[0-9]+]], L..C[[TL_B]]@l([[HI]])
+; CHECK:   lwz 3, 0([[ADDR]])
+; CHECK:   blr
+
+define i32 @C() local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @c, align 4
+  ret i32 %0
+}
+; CHECK32:  lwz [[SCRATCH:[0-9]+]], L..C[[TL_C:[0-9]+]](2)                         # @c
+; CHECK64:  ld [[SCRATCH:[0-9]+]], L..C[[TL_C:[0-9]+]](2)                         # @c
+; CHECK:    lwz 3, 0([[SCRATCH]])
+; CHECK:    blr
+
+define i32 @D() local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @d, align 4
+  ret i32 %0
+}
+; CHECK: addis [[HI:[0-9]+]], L..C[[TL_D:[0-9]+]]@u(2)
+; CHECK32: lwz [[ADDR:[0-9]+]], L..C[[TL_D]]@l([[HI]])
+; CHECK64: ld [[ADDR:[0-9]+]], L..C[[TL_D]]@l([[HI]])
+; CHECK: lwz 3, 0([[ADDR]])
+; CHECK: blr
+
+define i32 @E() {
+entry:
+  %0 = load i32, ptr @e, align 4
+  ret i32 %0
+}
+; CHECK-LARGE: addis [[HI:[0-9]+]], L..C[[TL_E:[0-9]+]]@u(2)
+; CHECK-LARGE32: lwz [[SCRATCH:[0-9]+]], L..C[[TL_E]]@l([[HI]])
+; CHECK-SMALL32: lwz [[SCRATCH:[0-9]+]], L..C[[TL_E:[0-9]+]](2)
+; CHECK-LARGE64: ld  [[SCRATCH:[0-9]+]], L..C[[TL_E]]@l([[HI]])
+; CHECK-SMALL64: ld  [[SCRATCH:[0-9]+]], L..C[[TL_E:[0-9]+]](2)
+; CHECK: lwz 3, 0([[SCRATCH]])
+; CHECK: blr
+
+define i32 @F() {
+entry:
+  %0 = load i32, ptr @f, align 4
+  ret i32 %0
+}
+; CHECK-LARGE: addis [[HI:[0-9]+]], L..C[[TL_F:[0-9]+]]@u(2)
+; CHECK-LARGE32: lwz [[SCRATCH:[0-9]+]], L..C[[TL_F]]@l([[HI]])
+; CHECK-SMALL32: lwz [[SCRATCH:[0-9]+]], L..C[[TL_F:[0-9]+]](2)
+; CHECK-LARGE64: ld [[SCRATCH:[0-9]+]], L..C[[TL_F]]@l([[HI]])
+; CHECK-SMALL64: ld  [[SCRATCH:[0-9]+]], L..C[[TL_F:[0-9]+]](2)
+; CHECK: lwz 3, 0([[SCRATCH]])
+; CHECK: blr
+
+define noundef nonnull ptr @addr_a() local_unnamed_addr {
+entry:
+  ret ptr @a
+}
+; CHECK32:  lwz 3, L..C[[TL_A]](2)                         # @a
+; CHECK64:  ld 3, L..C[[TL_A]](2)                         # @a
+; CHECK:    blr
+
+define noundef nonnull ptr @addr_b() local_unnamed_addr {
+entry:
+  ret ptr @b
+}
+; CHECK:    addis [[HI:[0-9]+]], L..C[[TL_B]]@u(2)
+; CHECK32:  lwz 3, L..C[[TL_B]]@l([[HI]])
+; CHECK64:  ld 3, L..C[[TL_B]]@l([[HI]])
+; CHECK:    blr
+
+
+define noundef nonnull ptr @addr_c() local_unnamed_addr {
+entry:
+  ret ptr @c
+}
+; CHECK32:  lwz 3, L..C[[TL_C]](2)                         # @c
+; CHECK64:  ld 3, L..C[[TL_C]](2)                         # @c
+; CHECK:    blr
+
+define noundef nonnull ptr @addr_d() local_unnamed_addr {
+entry:
+  ret ptr @d
+}
+; CHECK:   addis [[HI:[0-9]+]], L..C[[TL_D]]@u(2)
+; CHECK32: lwz 3, L..C[[TL_D]]@l([[HI]])
+; CHECK64: ld 3, L..C[[TL_D]]@l([[HI]])
+; CHECK:   blr
+
+define i32 @G() {
+   %tmp = load i32, ptr @al
+   ret i32 %tmp
+}
+; CHECK:   addis [[HI:[0-9]+]], L..C[[TL_AL:[0-9]+]]@u(2)
+; CHECK32: lwz [[ADDR:[0-9]+]], L..C[[TL_AL]]@l([[HI]])
+; CHECK64: ld  [[ADDR:[0-9]+]], L..C[[TL_AL]]@l([[HI]])
+; CHECK:   lwz 3, 0([[ADDR]])
+
+define i32 @H() {
+   %tmp = load i32, ptr @as
+   ret i32 %tmp
+}
+; CHECK32: lwz [[ADDR:[0-9]+]], L..C[[TL_AS:[0-9]+]](2)
+; CHECK64: ld [[ADDR:[0-9]+]], L..C[[TL_AS:[0-9]+]](2)
+; CHECK:   lwz 3, 0([[ADDR]])
+
+;; Check TOC entires have correct storage mapping class
+; CHECK:         L..C[[TL_A]]:
+; CHECK:           .tc a[TC],a[UA]
+; CHECK:         L..C[[TL_B]]:
+; CHECK:           .tc b[TE],b[UA]
+; CHECK:         L..C[[TL_C]]:
+; CHECK:           .tc c[TC],c[RW]
+; CHECK:         L..C[[TL_D]]:
+; CHECK:           .tc d[TE],d[RW]
+; CHECK:         L..C[[TL_E]]:
+; CHECK-SMALL:     .tc e[TC],e[UA]
+; CHECK-LARGE:     .tc e[TE],e[UA]
+; CHECK:         L..C[[TL_F]]:
+; CHECK-SMALL:     .tc f[TC],f[RW]
+; CHECK-LARGE:     .tc f[TE],f[RW]
+; CHECK:         L..C[[TL_AL]]:
+; CHECK:           .tc al[TE],al
+; CHECK:         L..C[[TL_AS]]:
+; CHECK:           .tc as[TC],as
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
index c1d1461664418..50ebe0471dcea 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple powerpc64le < %s | FileCheck %s
 
 ; Check constrained ops converted to call
-define void @test(ptr %cast) {
+define void @test(ptr %cast) strictfp {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %root
 ; CHECK-NEXT:    mflr 0
@@ -51,7 +51,7 @@ for.body:
 }
 
 ; Check constrained ops converted to native instruction
-define void @test2(ptr %cast) {
+define void @test2(ptr %cast) strictfp {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    li 4, 255
diff --git a/llvm/test/CodeGen/PowerPC/fp-classify.ll b/llvm/test/CodeGen/PowerPC/fp-classify.ll
index 7de35b880a5d9..f527b3c48040e 100644
--- a/llvm/test/CodeGen/PowerPC/fp-classify.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-classify.ll
@@ -57,30 +57,18 @@ entry:
 define zeroext i1 @abs_isinfq(fp128 %x) {
 ; P8-LABEL: abs_isinfq:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    mflr 0
-; P8-NEXT:    stdu 1, -48(1)
-; P8-NEXT:    std 0, 64(1)
-; P8-NEXT:    .cfi_def_cfa_offset 48
-; P8-NEXT:    .cfi_offset lr, 16
 ; P8-NEXT:    xxswapd 0, 34
-; P8-NEXT:    addi 3, 1, 32
+; P8-NEXT:    addi 3, 1, -16
+; P8-NEXT:    li 5, 32767
 ; P8-NEXT:    stxvd2x 0, 0, 3
-; P8-NEXT:    lbz 4, 47(1)
-; P8-NEXT:    clrlwi 4, 4, 25
-; P8-NEXT:    stb 4, 47(1)
-; P8-NEXT:    lxvd2x 0, 0, 3
-; P8-NEXT:    addis 3, 2, .LCPI2_0@toc@ha
-; P8-NEXT:    addi 3, 3, .LCPI2_0@toc@l
-; P8-NEXT:    xxswapd 34, 0
-; P8-NEXT:    lxvd2x 0, 0, 3
-; P8-NEXT:    xxswapd 35, 0
-; P8-NEXT:    bl __eqkf2
-; P8-NEXT:    nop
-; P8-NEXT:    cntlzw 3, 3
-; P8-NEXT:    srwi 3, 3, 5
-; P8-NEXT:    addi 1, 1, 48
-; P8-NEXT:    ld 0, 16(1)
-; P8-NEXT:    mtlr 0
+; P8-NEXT:    rldic 5, 5, 48, 1
+; P8-NEXT:    ld 4, -8(1)
+; P8-NEXT:    ld 3, -16(1)
+; P8-NEXT:    clrldi 4, 4, 1
+; P8-NEXT:    xor 4, 4, 5
+; P8-NEXT:    or 3, 3, 4
+; P8-NEXT:    cntlzd 3, 3
+; P8-NEXT:    rldicl 3, 3, 58, 63
 ; P8-NEXT:    blr
 ;
 ; P9-LABEL: abs_isinfq:
@@ -99,12 +87,13 @@ entry:
 define zeroext i1 @abs_isinfornanf(float %x) {
 ; P8-LABEL: abs_isinfornanf:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    addis 3, 2, .LCPI3_0@toc@ha
-; P8-NEXT:    xsabsdp 0, 1
-; P8-NEXT:    lfs 1, .LCPI3_0@toc@l(3)
-; P8-NEXT:    li 3, 1
-; P8-NEXT:    fcmpu 0, 0, 1
-; P8-NEXT:    isellt 3, 0, 3
+; P8-NEXT:    xscvdpspn 0, 1
+; P8-NEXT:    lis 4, 32639
+; P8-NEXT:    ori 4, 4, 65535
+; P8-NEXT:    mffprwz 3, 0
+; P8-NEXT:    clrlwi 3, 3, 1
+; P8-NEXT:    sub 3, 4, 3
+; P8-NEXT:    rldicl 3, 3, 1, 63
 ; P8-NEXT:    blr
 ;
 ; P9-LABEL: abs_isinfornanf:
@@ -123,12 +112,15 @@ entry:
 define zeroext i1 @abs_isinfornan(double %x) {
 ; P8-LABEL: abs_isinfornan:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    addis 3, 2, .LCPI4_0@toc@ha
-; P8-NEXT:    xsabsdp 0, 1
-; P8-NEXT:    lfs 1, .LCPI4_0@toc@l(3)
-; P8-NEXT:    li 3, 1
-; P8-NEXT:    fcmpu 0, 0, 1
-; P8-NEXT:    isellt 3, 0, 3
+; P8-NEXT:    mffprd 3, 1
+; P8-NEXT:    li 4, -33
+; P8-NEXT:    rldicl 4, 4, 47, 1
+; P8-NEXT:    sradi 5, 4, 63
+; P8-NEXT:    clrldi 3, 3, 1
+; P8-NEXT:    rldicl 6, 3, 1, 63
+; P8-NEXT:    subc 3, 4, 3
+; P8-NEXT:    adde 3, 6, 5
+; P8-NEXT:    xori 3, 3, 1
 ; P8-NEXT:    blr
 ;
 ; P9-LABEL: abs_isinfornan:
@@ -147,53 +139,18 @@ entry:
 define zeroext i1 @abs_isinfornanq(fp128 %x) {
 ; P8-LABEL: abs_isinfornanq:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    mflr 0
-; P8-NEXT:    stdu 1, -112(1)
-; P8-NEXT:    std 0, 128(1)
-; P8-NEXT:    .cfi_def_cfa_offset 112
-; P8-NEXT:    .cfi_offset lr, 16
-; P8-NEXT:    .cfi_offset r30, -16
-; P8-NEXT:    .cfi_offset v30, -48
-; P8-NEXT:    .cfi_offset v31, -32
-; P8-NEXT:    li 3, 64
 ; P8-NEXT:    xxswapd 0, 34
-; P8-NEXT:    std 30, 96(1) # 8-byte Folded Spill
-; P8-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
-; P8-NEXT:    li 3, 80
-; P8-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
-; P8-NEXT:    addi 3, 1, 48
+; P8-NEXT:    addi 3, 1, -16
+; P8-NEXT:    li 4, -3
 ; P8-NEXT:    stxvd2x 0, 0, 3
-; P8-NEXT:    lbz 4, 63(1)
-; P8-NEXT:    clrlwi 4, 4, 25
-; P8-NEXT:    stb 4, 63(1)
-; P8-NEXT:    lxvd2x 0, 0, 3
-; P8-NEXT:    addis 3, 2, .LCPI5_0@toc@ha
-; P8-NEXT:    addi 3, 3, .LCPI5_0@toc@l
-; P8-NEXT:    xxswapd 63, 0
-; P8-NEXT:    lxvd2x 0, 0, 3
-; P8-NEXT:    vmr 2, 31
-; P8-NEXT:    xxswapd 62, 0
-; P8-NEXT:    vmr 3, 30
-; P8-NEXT:    bl __eqkf2
-; P8-NEXT:    nop
-; P8-NEXT:    cntlzw 3, 3
-; P8-NEXT:    vmr 2, 31
-; P8-NEXT:    vmr 3, 30
-; P8-NEXT:    srwi 30, 3, 5
-; P8-NEXT:    bl __unordkf2
-; P8-NEXT:    nop
-; P8-NEXT:    cntlzw 3, 3
-; P8-NEXT:    li 4, 80
-; P8-NEXT:    lvx 31, 1, 4 # 16-byte Folded Reload
-; P8-NEXT:    li 4, 64
-; P8-NEXT:    srwi 3, 3, 5
-; P8-NEXT:    lvx 30, 1, 4 # 16-byte Folded Reload
+; P8-NEXT:    rldicl 4, 4, 47, 1
+; P8-NEXT:    ld 3, -8(1)
+; P8-NEXT:    sradi 5, 4, 63
+; P8-NEXT:    clrldi 3, 3, 1
+; P8-NEXT:    rldicl 6, 3, 1, 63
+; P8-NEXT:    subc 3, 4, 3
+; P8-NEXT:    adde 3, 6, 5
 ; P8-NEXT:    xori 3, 3, 1
-; P8-NEXT:    or 3, 3, 30
-; P8-NEXT:    ld 30, 96(1) # 8-byte Folded Reload
-; P8-NEXT:    addi 1, 1, 112
-; P8-NEXT:    ld 0, 16(1)
-; P8-NEXT:    mtlr 0
 ; P8-NEXT:    blr
 ;
 ; P9-LABEL: abs_isinfornanq:
diff --git a/llvm/test/CodeGen/PowerPC/rldimi.ll b/llvm/test/CodeGen/PowerPC/rldimi.ll
index 322975f547c99..78ea9aa862f2c 100644
--- a/llvm/test/CodeGen/PowerPC/rldimi.ll
+++ b/llvm/test/CodeGen/PowerPC/rldimi.ll
@@ -59,8 +59,8 @@ entry:
   ret i64 %8
 }
 
-define i64 @rldimi_intrinsic(i64 %a) {
-; CHECK-LABEL: rldimi_intrinsic:
+define i64 @rldimi4(i64 %a) {
+; CHECK-LABEL: rldimi4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    rldimi 3, 3, 8, 0
 ; CHECK-NEXT:    rldimi 3, 3, 16, 0
@@ -72,4 +72,71 @@ define i64 @rldimi_intrinsic(i64 %a) {
   ret i64 %r3
 }
 
+define i64 @rldimi5(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rldimi 4, 3, 8, 40
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 8, i64 16776960) ; 0xffff << 8
+  ret i64 %r
+}
+
+define i64 @rldimi6(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rotldi 3, 3, 1
+; CHECK-NEXT:    rldimi 4, 3, 7, 41
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 8, i64 8388480) ; 0xffff << 7
+  ret i64 %r
+}
+
+define i64 @rldimi7(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rotldi 3, 3, 63
+; CHECK-NEXT:    rldimi 4, 3, 9, 39
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 8, i64 33553920) ; 0xffff << 9
+  ret i64 %r
+}
+
+define i64 @rldimi8(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 0, i64 0)
+  ret i64 %r
+}
+
+define i64 @rldimi9(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 63, i64 0)
+  ret i64 %r
+}
+
+define i64 @rldimi10(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 0, i64 -1)
+  ret i64 %r
+}
+
+define i64 @rldimi11(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rotldi 3, 3, 8
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 8, i64 -1)
+  ret i64 %r
+}
+
 declare i64 @llvm.ppc.rldimi(i64, i64, i32 immarg, i64 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/rlwimi.ll b/llvm/test/CodeGen/PowerPC/rlwimi.ll
index 8b126cd3393c1..8da769508c9ae 100644
--- a/llvm/test/CodeGen/PowerPC/rlwimi.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwimi.ll
@@ -107,11 +107,51 @@ entry:
 define i32 @test9(i32 %a, i32 %b) {
 ; CHECK-LABEL: test9:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    rlwimi 3, 4, 8, 20, 26
+; CHECK-NEXT:    rlwimi 4, 3, 8, 20, 26
+; CHECK-NEXT:    mr 3, 4
 ; CHECK-NEXT:    blr
 entry:
   %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 8, i32 4064)
   ret i32 %r
 }
 
+define i32 @test10(i32 %a, i32 %b) {
+; CHECK-LABEL: test10:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 0, i32 -1)
+  ret i32 %r
+}
+
+define i32 @test11(i32 %a, i32 %b) {
+; CHECK-LABEL: test11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rotlwi 3, 3, 8
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 8, i32 -1)
+  ret i32 %r
+}
+
+define i32 @test12(i32 %a, i32 %b) {
+; CHECK-LABEL: test12:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 0, i32 0)
+  ret i32 %r
+}
+
+define i32 @test13(i32 %a, i32 %b) {
+; CHECK-LABEL: test13:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 0, 27, 19
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 0, i32 4064)
+  ret i32 %r
+}
+
 declare i32 @llvm.ppc.rlwimi(i32, i32, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/rlwinm.ll b/llvm/test/CodeGen/PowerPC/rlwinm.ll
index c6d4e5bb00004..363eb17127656 100644
--- a/llvm/test/CodeGen/PowerPC/rlwinm.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwinm.ll
@@ -97,4 +97,24 @@ entry:
   ret i32 %r
 }
 
+define i32 @test10(i32 %a, i32 %s) {
+; CHECK-LABEL: test10:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li 3, 0
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwnm(i32 %a, i32 %s, i32 0)
+  ret i32 %r
+}
+
+define i32 @test11(i32 %a, i32 %s) {
+; CHECK-LABEL: test11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rotlw 3, 3, 4
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwnm(i32 %a, i32 %s, i32 -1)
+  ret i32 %r
+}
+
 declare i32 @llvm.ppc.rlwnm(i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll b/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll
index 6f68679325c57..798637b6840f1 100644
--- a/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll
@@ -7281,3 +7281,61 @@ entry:
   store double %str, ptr inttoptr (i64 1000000000000 to ptr), align 4096
   ret void
 }
+
+define dso_local void @st_reversed_double_from_i8(ptr %ptr) {
+; CHECK-P10-LABEL: st_reversed_double_from_i8:
+; CHECK-P10:       # %bb.0: # %entry
+; CHECK-P10-NEXT:    li r4, 8
+; CHECK-P10-NEXT:    lxsibzx f0, 0, r3
+; CHECK-P10-NEXT:    xxspltidp vs2, -1023410176
+; CHECK-P10-NEXT:    lxsibzx f1, r3, r4
+; CHECK-P10-NEXT:    xscvuxddp f0, f0
+; CHECK-P10-NEXT:    xscvuxddp f1, f1
+; CHECK-P10-NEXT:    xsadddp f0, f0, f2
+; CHECK-P10-NEXT:    xsadddp f1, f1, f2
+; CHECK-P10-NEXT:    stfd f1, 0(r3)
+; CHECK-P10-NEXT:    stfd f0, 8(r3)
+; CHECK-P10-NEXT:    blr
+;
+; CHECK-P9-LABEL: st_reversed_double_from_i8:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    li r4, 8
+; CHECK-P9-NEXT:    lxsibzx f0, 0, r3
+; CHECK-P9-NEXT:    lxsibzx f1, r3, r4
+; CHECK-P9-NEXT:    addis r4, r2, .LCPI300_0@toc@ha
+; CHECK-P9-NEXT:    lfs f2, .LCPI300_0@toc@l(r4)
+; CHECK-P9-NEXT:    xscvuxddp f0, f0
+; CHECK-P9-NEXT:    xscvuxddp f1, f1
+; CHECK-P9-NEXT:    xsadddp f0, f0, f2
+; CHECK-P9-NEXT:    xsadddp f1, f1, f2
+; CHECK-P9-NEXT:    stfd f0, 8(r3)
+; CHECK-P9-NEXT:    stfd f1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-P8-LABEL: st_reversed_double_from_i8:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lbz r4, 0(r3)
+; CHECK-P8-NEXT:    lbz r5, 8(r3)
+; CHECK-P8-NEXT:    mtfprwz f0, r4
+; CHECK-P8-NEXT:    mtfprwz f1, r5
+; CHECK-P8-NEXT:    addis r4, r2, .LCPI300_0@toc@ha
+; CHECK-P8-NEXT:    lfs f2, .LCPI300_0@toc@l(r4)
+; CHECK-P8-NEXT:    xscvuxddp f0, f0
+; CHECK-P8-NEXT:    xscvuxddp f1, f1
+; CHECK-P8-NEXT:    xsadddp f0, f0, f2
+; CHECK-P8-NEXT:    xsadddp f1, f1, f2
+; CHECK-P8-NEXT:    stfd f1, 0(r3)
+; CHECK-P8-NEXT:    stfd f0, 8(r3)
+; CHECK-P8-NEXT:    blr
+entry:
+  %idx = getelementptr inbounds i8, ptr %ptr, i64 8
+  %i0 = load i8, ptr %ptr, align 1
+  %i1 = load i8, ptr %idx, align 1
+  %f0 = uitofp i8 %i0 to double
+  %f1 = uitofp i8 %i1 to double
+  %a0 = fadd double %f0, -1.280000e+02
+  %a1 = fadd double %f1, -1.280000e+02
+  store double %a1, ptr %ptr, align 8
+  store double %a0, ptr %idx, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll b/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll
index 824dd4c4db6cb..f396057342129 100644
--- a/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll
@@ -7271,3 +7271,61 @@ entry:
   store double %conv, ptr inttoptr (i64 1000000000000 to ptr), align 4096
   ret void
 }
+
+define dso_local void @st_reversed_float_from_i8(ptr %ptr) {
+; CHECK-P10-LABEL: st_reversed_float_from_i8:
+; CHECK-P10:       # %bb.0: # %entry
+; CHECK-P10-NEXT:    li r4, 8
+; CHECK-P10-NEXT:    lxsibzx f0, 0, r3
+; CHECK-P10-NEXT:    xxspltidp vs2, -1023410176
+; CHECK-P10-NEXT:    lxsibzx f1, r3, r4
+; CHECK-P10-NEXT:    xscvuxdsp f0, f0
+; CHECK-P10-NEXT:    xscvuxdsp f1, f1
+; CHECK-P10-NEXT:    xsaddsp f0, f0, f2
+; CHECK-P10-NEXT:    xsaddsp f1, f1, f2
+; CHECK-P10-NEXT:    stfs f0, 8(r3)
+; CHECK-P10-NEXT:    stfs f1, 0(r3)
+; CHECK-P10-NEXT:    blr
+;
+; CHECK-P9-LABEL: st_reversed_float_from_i8:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    li r4, 8
+; CHECK-P9-NEXT:    lxsibzx f0, 0, r3
+; CHECK-P9-NEXT:    lxsibzx f1, r3, r4
+; CHECK-P9-NEXT:    addis r4, r2, .LCPI300_0@toc@ha
+; CHECK-P9-NEXT:    lfs f2, .LCPI300_0@toc@l(r4)
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xsaddsp f0, f0, f2
+; CHECK-P9-NEXT:    xsaddsp f1, f1, f2
+; CHECK-P9-NEXT:    stfs f0, 8(r3)
+; CHECK-P9-NEXT:    stfs f1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-P8-LABEL: st_reversed_float_from_i8:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lbz r4, 0(r3)
+; CHECK-P8-NEXT:    lbz r5, 8(r3)
+; CHECK-P8-NEXT:    mtfprwz f0, r4
+; CHECK-P8-NEXT:    mtfprwz f1, r5
+; CHECK-P8-NEXT:    addis r4, r2, .LCPI300_0@toc@ha
+; CHECK-P8-NEXT:    lfs f2, .LCPI300_0@toc@l(r4)
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xsaddsp f0, f0, f2
+; CHECK-P8-NEXT:    xsaddsp f1, f1, f2
+; CHECK-P8-NEXT:    stfs f1, 0(r3)
+; CHECK-P8-NEXT:    stfs f0, 8(r3)
+; CHECK-P8-NEXT:    blr
+entry:
+  %idx = getelementptr inbounds i8, ptr %ptr, i64 8
+  %i0 = load i8, ptr %ptr, align 1
+  %i1 = load i8, ptr %idx, align 1
+  %f0 = uitofp i8 %i0 to float
+  %f1 = uitofp i8 %i1 to float
+  %a0 = fadd float %f0, -1.280000e+02
+  %a1 = fadd float %f1, -1.280000e+02
+  store float %a1, ptr %ptr, align 8
+  store float %a0, ptr %idx, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ld.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ld.ll
new file mode 100644
index 0000000000000..31b3c3fe3c5be
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ld.ll
@@ -0,0 +1,948 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -global-isel -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck  -check-prefixes=RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -global-isel -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck  -check-prefixes=RV64 %s
+
+define <vscale x 1 x i8>  @vload_nx1i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x i8>, ptr %pa
+  ret <vscale x 1 x i8> %va
+}
+
+define <vscale x 2 x i8>  @vload_nx2i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i8>, ptr %pa
+  ret <vscale x 2 x i8> %va
+}
+
+define <vscale x 4 x i8>  @vload_nx4i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i8>, ptr %pa
+  ret <vscale x 4 x i8> %va
+}
+
+define <vscale x 8 x i8>  @vload_nx8i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx8i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 8 x i8>, ptr %pa
+  ret <vscale x 8 x i8> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 32 x i8>  @vload_nx32i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx32i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx32i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m4
+  %va = load <vscale x 32 x i8>, ptr %pa
+  ret <vscale x 32 x i8> %va
+}
+
+define <vscale x 64 x i8>  @vload_nx64i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx64i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m8
+  ;
+  ; RV64-LABEL: name: vload_nx64i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 64 x i8>, ptr %pa
+  ret <vscale x 64 x i8> %va
+}
+
+define <vscale x 1 x i16>  @vload_nx1i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x i16>, ptr %pa
+  ret <vscale x 1 x i16> %va
+}
+
+define <vscale x 2 x i16>  @vload_nx2i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i16>, ptr %pa
+  ret <vscale x 2 x i16> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 8 x i16>  @vload_nx8i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx8i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 8 x i16>, ptr %pa
+  ret <vscale x 8 x i16> %va
+}
+
+define <vscale x 16 x i16>  @vload_nx16i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx16i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m4
+  %va = load <vscale x 16 x i16>, ptr %pa
+  ret <vscale x 16 x i16> %va
+}
+
+define <vscale x 32 x i16>  @vload_nx32i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx32i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m8
+  ;
+  ; RV64-LABEL: name: vload_nx32i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 32 x i16>, ptr %pa
+  ret <vscale x 32 x i16> %va
+}
+
+define <vscale x 1 x i32>  @vload_nx1i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x i32>, ptr %pa
+  ret <vscale x 1 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 4 x i32>  @vload_nx4i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx4i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 4 x i32>, ptr %pa
+  ret <vscale x 4 x i32> %va
+}
+
+define <vscale x 8 x i32>  @vload_nx8i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx8i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m4
+  %va = load <vscale x 8 x i32>, ptr %pa
+  ret <vscale x 8 x i32> %va
+}
+
+define <vscale x 16 x i32>  @vload_nx16i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m8
+  ;
+  ; RV64-LABEL: name: vload_nx16i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 16 x i32>, ptr %pa
+  ret <vscale x 16 x i32> %va
+}
+
+define <vscale x 1 x i64>  @vload_nx1i64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1i64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1i64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x i64>, ptr %pa
+  ret <vscale x 1 x i64> %va
+}
+
+define <vscale x 2 x i64>  @vload_nx2i64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 4 x i64>  @vload_nx4i64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx4i64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m4
+  %va = load <vscale x 4 x i64>, ptr %pa
+  ret <vscale x 4 x i64> %va
+}
+
+define <vscale x 8 x i64>  @vload_nx8i64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8i64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m8
+  ;
+  ; RV64-LABEL: name: vload_nx8i64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 8 x i64>, ptr %pa
+  ret <vscale x 8 x i64> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8_align1(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8_align1
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8_align1
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa, align 1
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8_align2(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8_align2
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8_align2
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa, align 2
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8_align16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8_align16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8_align16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa, align 16
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8_align64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8_align64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8_align64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa, align 64
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16_align1(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align1
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 1)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align1
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 1)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 1
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16_align2(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align2
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align2
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 2
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16_align4(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align4
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align4
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 4
+  ret <vscale x 4 x i16> %va
+}
+define <vscale x 4 x i16>  @vload_nx4i16_align8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 8
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16_align16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 16
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align2(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align2
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 2)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align2
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 2)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 2
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align4(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align4
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align4
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 4
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 8
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 16
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align256(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align256
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align256
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 256
+  ret <vscale x 2 x i32> %va
+}
+define <vscale x 2 x i64>  @vload_nx2i64_align4(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64_align4
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 4)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64_align4
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 4)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa, align 4
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 2 x i64>  @vload_nx2i64_align8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64_align8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64_align8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa, align 8
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 2 x i64>  @vload_nx2i64_align16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64_align16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64_align16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa, align 16
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 2 x i64>  @vload_nx2i64_align32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64_align32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64_align32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa, align 32
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 1 x ptr>  @vload_nx1ptr(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1ptr
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1ptr
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x ptr>, ptr %pa
+  ret <vscale x 1 x ptr> %va
+}
+
+define <vscale x 2 x ptr>  @vload_nx2ptr(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2ptr
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x p0>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2ptr
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x p0>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2x ptr>, ptr %pa
+  ret <vscale x 2 x ptr> %va
+}
+
+define <vscale x 8 x ptr>  @vload_nx8ptr(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8ptr
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 8 x p0>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx8ptr
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 8 x p0>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 8 x ptr>, ptr %pa
+  ret <vscale x 8 x ptr> %va
+}
+
diff --git a/llvm/test/CodeGen/RISCV/live-sp.mir b/llvm/test/CodeGen/RISCV/live-sp.mir
index 8dd307f521f5b..fa6297a3913a9 100644
--- a/llvm/test/CodeGen/RISCV/live-sp.mir
+++ b/llvm/test/CodeGen/RISCV/live-sp.mir
@@ -44,7 +44,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    4
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll
index cfdefec04600c..ebf232cc458ba 100644
--- a/llvm/test/CodeGen/RISCV/machine-combiner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll
@@ -740,9 +740,9 @@ define i8 @test_reassoc_minu_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 ; CHECK-LABEL: test_reassoc_minu_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a3, a3, 255
-; CHECK-NEXT:    andi a2, a2, 255
 ; CHECK-NEXT:    andi a1, a1, 255
 ; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    andi a2, a2, 255
 ; CHECK-NEXT:    minu a0, a0, a1
 ; CHECK-NEXT:    minu a1, a2, a3
 ; CHECK-NEXT:    minu a0, a0, a1
@@ -757,9 +757,9 @@ define i16 @test_reassoc_minu_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 ; CHECK-LABEL: test_reassoc_minu_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    zext.h a3, a3
-; CHECK-NEXT:    zext.h a2, a2
 ; CHECK-NEXT:    zext.h a1, a1
 ; CHECK-NEXT:    zext.h a0, a0
+; CHECK-NEXT:    zext.h a2, a2
 ; CHECK-NEXT:    minu a0, a0, a1
 ; CHECK-NEXT:    minu a1, a2, a3
 ; CHECK-NEXT:    minu a0, a0, a1
@@ -774,9 +774,9 @@ define i32 @test_reassoc_minu_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: test_reassoc_minu_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.w a3, a3
-; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    sext.w a1, a1
 ; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    minu a0, a0, a1
 ; CHECK-NEXT:    minu a1, a2, a3
 ; CHECK-NEXT:    minu a0, a0, a1
@@ -804,9 +804,9 @@ define i8 @test_reassoc_min_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 ; CHECK-LABEL: test_reassoc_min_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.b a3, a3
-; CHECK-NEXT:    sext.b a2, a2
 ; CHECK-NEXT:    sext.b a1, a1
 ; CHECK-NEXT:    sext.b a0, a0
+; CHECK-NEXT:    sext.b a2, a2
 ; CHECK-NEXT:    min a0, a0, a1
 ; CHECK-NEXT:    min a1, a2, a3
 ; CHECK-NEXT:    min a0, a0, a1
@@ -821,9 +821,9 @@ define i16 @test_reassoc_min_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 ; CHECK-LABEL: test_reassoc_min_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.h a3, a3
-; CHECK-NEXT:    sext.h a2, a2
 ; CHECK-NEXT:    sext.h a1, a1
 ; CHECK-NEXT:    sext.h a0, a0
+; CHECK-NEXT:    sext.h a2, a2
 ; CHECK-NEXT:    min a0, a0, a1
 ; CHECK-NEXT:    min a1, a2, a3
 ; CHECK-NEXT:    min a0, a0, a1
@@ -838,9 +838,9 @@ define i32 @test_reassoc_min_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: test_reassoc_min_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.w a3, a3
-; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    sext.w a1, a1
 ; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    min a0, a0, a1
 ; CHECK-NEXT:    min a1, a2, a3
 ; CHECK-NEXT:    min a0, a0, a1
@@ -868,9 +868,9 @@ define i8 @test_reassoc_maxu_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 ; CHECK-LABEL: test_reassoc_maxu_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a3, a3, 255
-; CHECK-NEXT:    andi a2, a2, 255
 ; CHECK-NEXT:    andi a1, a1, 255
 ; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    andi a2, a2, 255
 ; CHECK-NEXT:    maxu a0, a0, a1
 ; CHECK-NEXT:    maxu a1, a2, a3
 ; CHECK-NEXT:    maxu a0, a0, a1
@@ -885,9 +885,9 @@ define i16 @test_reassoc_maxu_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 ; CHECK-LABEL: test_reassoc_maxu_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    zext.h a3, a3
-; CHECK-NEXT:    zext.h a2, a2
 ; CHECK-NEXT:    zext.h a1, a1
 ; CHECK-NEXT:    zext.h a0, a0
+; CHECK-NEXT:    zext.h a2, a2
 ; CHECK-NEXT:    maxu a0, a0, a1
 ; CHECK-NEXT:    maxu a1, a2, a3
 ; CHECK-NEXT:    maxu a0, a0, a1
@@ -902,9 +902,9 @@ define i32 @test_reassoc_maxu_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: test_reassoc_maxu_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.w a3, a3
-; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    sext.w a1, a1
 ; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    maxu a0, a0, a1
 ; CHECK-NEXT:    maxu a1, a2, a3
 ; CHECK-NEXT:    maxu a0, a0, a1
@@ -932,9 +932,9 @@ define i8 @test_reassoc_max_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 ; CHECK-LABEL: test_reassoc_max_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.b a3, a3
-; CHECK-NEXT:    sext.b a2, a2
 ; CHECK-NEXT:    sext.b a1, a1
 ; CHECK-NEXT:    sext.b a0, a0
+; CHECK-NEXT:    sext.b a2, a2
 ; CHECK-NEXT:    max a0, a0, a1
 ; CHECK-NEXT:    max a1, a2, a3
 ; CHECK-NEXT:    max a0, a0, a1
@@ -949,9 +949,9 @@ define i16 @test_reassoc_max_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 ; CHECK-LABEL: test_reassoc_max_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.h a3, a3
-; CHECK-NEXT:    sext.h a2, a2
 ; CHECK-NEXT:    sext.h a1, a1
 ; CHECK-NEXT:    sext.h a0, a0
+; CHECK-NEXT:    sext.h a2, a2
 ; CHECK-NEXT:    max a0, a0, a1
 ; CHECK-NEXT:    max a1, a2, a3
 ; CHECK-NEXT:    max a0, a0, a1
@@ -966,9 +966,9 @@ define i32 @test_reassoc_max_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: test_reassoc_max_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.w a3, a3
-; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    sext.w a1, a1
 ; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    max a0, a0, a1
 ; CHECK-NEXT:    max a1, a2, a3
 ; CHECK-NEXT:    max a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 139579b3d2a36..87d95d7596d4f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -161,71 +161,72 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv128:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v10, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 6
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 6
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 2
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 10
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 4
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 12
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 6
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v8, 8
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 14
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir b/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir
index 5255728821039..080a89e41f0d5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir
@@ -22,7 +22,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    16
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   functionContext: ''
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
index ed434deea1a83..c64216180c2af 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
@@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV32-NEXT:    th.swia a0, (a1), 4, 0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vle8.v v10, (a3)
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
 ; RV32-NEXT:    vslideup.vi v10, v9, 4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vzext.vf4 v12, v10
@@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV64-NEXT:    th.swia a0, (a1), 4, 0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vle8.v v10, (a3)
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
 ; RV64-NEXT:    vslideup.vi v10, v9, 4
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vzext.vf4 v12, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
new file mode 100644
index 0000000000000..673008d9c0b3d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
@@ -0,0 +1,871 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs -mtriple=riscv64 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV64
+; RUN: llc -verify-machineinstrs -mtriple=riscv32 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV32
+
+; Compress + store for i8 type
+
+define void @test_compresstore_v1i8(ptr %p, <1 x i1> %mask, <1 x i8> %data) {
+; RV64-LABEL: test_compresstore_v1i8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT:    vse8.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v1i8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT:    vse8.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v1i8(<1 x i8> %data, ptr align 1 %p, <1 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v2i8(ptr %p, <2 x i1> %mask, <2 x i8> %data) {
+; RV64-LABEL: test_compresstore_v2i8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT:    vse8.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v2i8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT:    vse8.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v2i8(<2 x i8> %data, ptr align 1 %p, <2 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v4i8(ptr %p, <4 x i1> %mask, <4 x i8> %data) {
+; RV64-LABEL: test_compresstore_v4i8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT:    vse8.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v4i8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT:    vse8.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v4i8(<4 x i8> %data, ptr align 1 %p, <4 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v8i8(ptr %p, <8 x i1> %mask, <8 x i8> %data) {
+; RV64-LABEL: test_compresstore_v8i8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT:    vse8.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v8i8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT:    vse8.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v8i8(<8 x i8> %data, ptr align 1 %p, <8 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v16i8(ptr %p, <16 x i1> %mask, <16 x i8> %data) {
+; RV64-LABEL: test_compresstore_v16i8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT:    vse8.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v16i8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT:    vse8.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %data, ptr align 1 %p, <16 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v32i8(ptr %p, <32 x i1> %mask, <32 x i8> %data) {
+; RV64-LABEL: test_compresstore_v32i8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; RV64-NEXT:    vcompress.vm v10, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; RV64-NEXT:    vse8.v v10, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v32i8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; RV32-NEXT:    vcompress.vm v10, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; RV32-NEXT:    vse8.v v10, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v32i8(<32 x i8> %data, ptr align 1 %p, <32 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v64i8(ptr %p, <64 x i1> %mask, <64 x i8> %data) {
+; RV64-LABEL: test_compresstore_v64i8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a1, 64
+; RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT:    vcompress.vm v12, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT:    vse8.v v12, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v64i8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a1, 64
+; RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT:    vcompress.vm v12, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT:    vse8.v v12, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v64i8(<64 x i8> %data, ptr align 1 %p, <64 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v128i8(ptr %p, <128 x i1> %mask, <128 x i8> %data) {
+; RV64-LABEL: test_compresstore_v128i8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a1, 128
+; RV64-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v16, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT:    vse8.v v16, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v128i8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a1, 128
+; RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v16, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT:    vse8.v v16, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v128i8(<128 x i8> %data, ptr align 1 %p, <128 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data) {
+; RV64-LABEL: test_compresstore_v256i8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vmv1r.v v7, v8
+; RV64-NEXT:    li a2, 128
+; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT:    vle8.v v24, (a1)
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v9, v0, 1
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vmv.x.s a3, v0
+; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v8, v16, v0
+; RV64-NEXT:    vcpop.m a4, v0
+; RV64-NEXT:    vsetvli zero, a4, e8, m8, ta, ma
+; RV64-NEXT:    vse8.v v8, (a0)
+; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v8, v24, v7
+; RV64-NEXT:    vcpop.m a2, v7
+; RV64-NEXT:    cpop a3, a3
+; RV64-NEXT:    cpop a1, a1
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT:    vse8.v v8, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v256i8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vmv1r.v v7, v8
+; RV32-NEXT:    li a2, 128
+; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT:    vle8.v v24, (a1)
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v9, v0, 1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsrl.vx v10, v9, a1
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vsrl.vx v10, v0, a1
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vmv.x.s a4, v9
+; RV32-NEXT:    vmv.x.s a5, v0
+; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v8, v16, v0
+; RV32-NEXT:    vcpop.m a6, v0
+; RV32-NEXT:    vsetvli zero, a6, e8, m8, ta, ma
+; RV32-NEXT:    vse8.v v8, (a0)
+; RV32-NEXT:    cpop a1, a1
+; RV32-NEXT:    cpop a5, a5
+; RV32-NEXT:    add a1, a5, a1
+; RV32-NEXT:    cpop a3, a3
+; RV32-NEXT:    cpop a4, a4
+; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v8, v24, v7
+; RV32-NEXT:    vcpop.m a1, v7
+; RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT:    vse8.v v8, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v256i8(<256 x i8> %data, ptr align 1 %p, <256 x i1> %mask)
+  ret void
+}
+
+; Compress + store for i16 type
+
+define void @test_compresstore_v1i16(ptr %p, <1 x i1> %mask, <1 x i16> %data) {
+; RV64-LABEL: test_compresstore_v1i16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT:    vse16.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v1i16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT:    vse16.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v1i16(<1 x i16> %data, ptr align 2 %p, <1 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v2i16(ptr %p, <2 x i1> %mask, <2 x i16> %data) {
+; RV64-LABEL: test_compresstore_v2i16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT:    vse16.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v2i16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT:    vse16.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v2i16(<2 x i16> %data, ptr align 2 %p, <2 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v4i16(ptr %p, <4 x i1> %mask, <4 x i16> %data) {
+; RV64-LABEL: test_compresstore_v4i16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; RV64-NEXT:    vse16.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v4i16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; RV32-NEXT:    vse16.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v4i16(<4 x i16> %data, ptr align 2 %p, <4 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v8i16(ptr %p, <8 x i1> %mask, <8 x i16> %data) {
+; RV64-LABEL: test_compresstore_v8i16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; RV64-NEXT:    vse16.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v8i16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; RV32-NEXT:    vse16.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v8i16(<8 x i16> %data, ptr align 2 %p, <8 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v16i16(ptr %p, <16 x i1> %mask, <16 x i16> %data) {
+; RV64-LABEL: test_compresstore_v16i16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV64-NEXT:    vcompress.vm v10, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT:    vse16.v v10, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v16i16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV32-NEXT:    vcompress.vm v10, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT:    vse16.v v10, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v16i16(<16 x i16> %data, ptr align 2 %p, <16 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v32i16(ptr %p, <32 x i1> %mask, <32 x i16> %data) {
+; RV64-LABEL: test_compresstore_v32i16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; RV64-NEXT:    vcompress.vm v12, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; RV64-NEXT:    vse16.v v12, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v32i16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; RV32-NEXT:    vcompress.vm v12, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; RV32-NEXT:    vse16.v v12, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v32i16(<32 x i16> %data, ptr align 2 %p, <32 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v64i16(ptr %p, <64 x i1> %mask, <64 x i16> %data) {
+; RV64-LABEL: test_compresstore_v64i16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a1, 64
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v16, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV64-NEXT:    vse16.v v16, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v64i16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a1, 64
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v16, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT:    vse16.v v16, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v64i16(<64 x i16> %data, ptr align 2 %p, <64 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v128i16(ptr %p, <128 x i1> %mask, <128 x i16> %data) {
+; RV64-LABEL: test_compresstore_v128i16:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a1, 64
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v24, v8, v0
+; RV64-NEXT:    vcpop.m a2, v0
+; RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; RV64-NEXT:    vse16.v v24, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v8, v0, 8
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v24, v16, v8
+; RV64-NEXT:    vcpop.m a2, v8
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a1, v0
+; RV64-NEXT:    cpop a1, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; RV64-NEXT:    vse16.v v24, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v128i16:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a1, 64
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v24, v8, v0
+; RV32-NEXT:    vcpop.m a2, v0
+; RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; RV32-NEXT:    vse16.v v24, (a0)
+; RV32-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v0, 8
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v8, v16, v24
+; RV32-NEXT:    vcpop.m a1, v24
+; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v16, v0, a2
+; RV32-NEXT:    vmv.x.s a2, v16
+; RV32-NEXT:    cpop a2, a2
+; RV32-NEXT:    vmv.x.s a3, v0
+; RV32-NEXT:    cpop a3, a3
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT:    vse16.v v8, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v128i16(<128 x i16> %data, ptr align 2 %p, <128 x i1> %mask)
+  ret void
+}
+
+; Compress + store for i32 type
+
+define void @test_compresstore_v1i32(ptr %p, <1 x i1> %mask, <1 x i32> %data) {
+; RV64-LABEL: test_compresstore_v1i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT:    vse32.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v1i32:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT:    vse32.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v1i32(<1 x i32> %data, ptr align 4 %p, <1 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v2i32(ptr %p, <2 x i1> %mask, <2 x i32> %data) {
+; RV64-LABEL: test_compresstore_v2i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT:    vse32.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v2i32:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT:    vse32.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v2i32(<2 x i32> %data, ptr align 4 %p, <2 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v4i32(ptr %p, <4 x i1> %mask, <4 x i32> %data) {
+; RV64-LABEL: test_compresstore_v4i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT:    vse32.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v4i32:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT:    vse32.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %data, ptr align 4 %p, <4 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v8i32(ptr %p, <8 x i1> %mask, <8 x i32> %data) {
+; RV64-LABEL: test_compresstore_v8i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vcompress.vm v10, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; RV64-NEXT:    vse32.v v10, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v8i32:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vcompress.vm v10, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; RV32-NEXT:    vse32.v v10, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %data, ptr align 4 %p, <8 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v16i32(ptr %p, <16 x i1> %mask, <16 x i32> %data) {
+; RV64-LABEL: test_compresstore_v16i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT:    vcompress.vm v12, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT:    vse32.v v12, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v16i32:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vcompress.vm v12, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT:    vse32.v v12, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %data, ptr align 4 %p, <16 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v32i32(ptr %p, <32 x i1> %mask, <32 x i32> %data) {
+; RV64-LABEL: test_compresstore_v32i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v16, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT:    vse32.v v16, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v32i32:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v16, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vse32.v v16, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v32i32(<32 x i32> %data, ptr align 4 %p, <32 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data) {
+; RV64-LABEL: test_compresstore_v64i32:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v24, v8, v0
+; RV64-NEXT:    vcpop.m a2, v0
+; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV64-NEXT:    vse32.v v24, (a0)
+; RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; RV64-NEXT:    vslidedown.vi v8, v0, 4
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v24, v16, v8
+; RV64-NEXT:    vcpop.m a1, v8
+; RV64-NEXT:    vmv.x.s a2, v0
+; RV64-NEXT:    cpopw a2, a2
+; RV64-NEXT:    slli a2, a2, 2
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT:    vse32.v v24, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v64i32:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v24, v8, v0
+; RV32-NEXT:    vcpop.m a2, v0
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vse32.v v24, (a0)
+; RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v0, 4
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v24, v16, v8
+; RV32-NEXT:    vcpop.m a1, v8
+; RV32-NEXT:    vmv.x.s a2, v0
+; RV32-NEXT:    cpop a2, a2
+; RV32-NEXT:    slli a2, a2, 2
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vse32.v v24, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v64i32(<64 x i32> %data, ptr align 4 %p, <64 x i1> %mask)
+  ret void
+}
+
+; Compress + store for i64 type
+
+define void @test_compresstore_v1i64(ptr %p, <1 x i1> %mask, <1 x i64> %data) {
+; RV64-LABEL: test_compresstore_v1i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v1i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT:    vse64.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v1i64(<1 x i64> %data, ptr align 8 %p, <1 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) {
+; RV64-LABEL: test_compresstore_v2i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v9, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v2i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT:    vse64.v v9, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %data, ptr align 8 %p, <2 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v4i64(ptr %p, <4 x i1> %mask, <4 x i64> %data) {
+; RV64-LABEL: test_compresstore_v4i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vcompress.vm v10, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT:    vse64.v v10, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v4i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vcompress.vm v10, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; RV32-NEXT:    vse64.v v10, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %data, ptr align 8 %p, <4 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v8i64(ptr %p, <8 x i1> %mask, <8 x i64> %data) {
+; RV64-LABEL: test_compresstore_v8i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vcompress.vm v12, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT:    vse64.v v12, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v8i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vcompress.vm v12, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; RV32-NEXT:    vse64.v v12, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %data, ptr align 8 %p, <8 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v16i64(ptr %p, <16 x i1> %mask, <16 x i64> %data) {
+; RV64-LABEL: test_compresstore_v16i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v16, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vse64.v v16, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v16i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v16, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vse64.v v16, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v16i64(<16 x i64> %data, ptr align 8 %p, <16 x i1> %mask)
+  ret void
+}
+
+define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data) {
+; RV64-LABEL: test_compresstore_v32i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v24, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vse64.v v24, (a0)
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v8, v16, v24
+; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT:    vmv.x.s a1, v0
+; RV64-NEXT:    zext.h a1, a1
+; RV64-NEXT:    cpopw a1, a1
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    vcpop.m a1, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vse64.v v8, (a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_compresstore_v32i64:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v24, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vse64.v v24, (a0)
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v8, v16, v24
+; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV32-NEXT:    vmv.x.s a1, v0
+; RV32-NEXT:    zext.h a1, a1
+; RV32-NEXT:    cpop a1, a1
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    vcpop.m a1, v24
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vse64.v v8, (a0)
+; RV32-NEXT:    ret
+entry:
+  tail call void @llvm.masked.compressstore.v32i64(<32 x i64> %data, ptr align 8 %p, <32 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.masked.compressstore.v1i8(<1 x i8>, ptr, <1 x i1>)
+declare void @llvm.masked.compressstore.v2i8(<2 x i8>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i8(<4 x i8>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v128i8(<128 x i8>, ptr, <128 x i1>)
+declare void @llvm.masked.compressstore.v256i8(<256 x i8>, ptr, <256 x i1>)
+
+declare void @llvm.masked.compressstore.v1i16(<1 x i16>, ptr, <1 x i1>)
+declare void @llvm.masked.compressstore.v2i16(<2 x i16>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v64i16(<64 x i16>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v128i16(<128 x i16>, ptr, <128 x i1>)
+
+declare void @llvm.masked.compressstore.v1i32(<1 x i32>, ptr, <1 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v32i32(<32 x i32>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v64i32(<64 x i32>, ptr, <64 x i1>)
+
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i64(<16 x i64>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v32i64(<32 x i64>, ptr, <32 x i1>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index e15e6452163b1..76aa2b913c652 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -469,8 +469,9 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v13, v10, a0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v12, v9, a0
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
index 52c52921e7e1d..36fbdd8e0664f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
@@ -6,24 +6,20 @@ declare void @llvm.masked.compressstore.v1f16(<1 x half>, ptr, <1 x i1>)
 define void @compressstore_v1f16(ptr %base, <1 x half> %v, <1 x i1> %mask) {
 ; RV32-LABEL: compressstore_v1f16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT:    vfirst.m a1, v0
-; RV32-NEXT:    bnez a1, .LBB0_2
-; RV32-NEXT:  # %bb.1: # %cond.store
 ; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vse16.v v8, (a0)
-; RV32-NEXT:  .LBB0_2: # %else
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT:    vse16.v v9, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v1f16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT:    vfirst.m a1, v0
-; RV64-NEXT:    bnez a1, .LBB0_2
-; RV64-NEXT:  # %bb.1: # %cond.store
 ; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vse16.v v8, (a0)
-; RV64-NEXT:  .LBB0_2: # %else
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT:    vse16.v v9, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v1f16(<1 x half> %v, ptr align 2 %base, <1 x i1> %mask)
   ret void
@@ -33,48 +29,20 @@ declare void @llvm.masked.compressstore.v2f16(<2 x half>, ptr, <2 x i1>)
 define void @compressstore_v2f16(ptr %base, <2 x half> %v, <2 x i1> %mask) {
 ; RV32-LABEL: compressstore_v2f16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB1_3
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a1, a1, 2
-; RV32-NEXT:    bnez a1, .LBB1_4
-; RV32-NEXT:  .LBB1_2: # %else2
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB1_3: # %cond.store
-; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vse16.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a1, a1, 2
-; RV32-NEXT:    beqz a1, .LBB1_2
-; RV32-NEXT:  .LBB1_4: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-NEXT:    vse16.v v8, (a0)
+; RV32-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV32-NEXT:    vse16.v v9, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v2f16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB1_3
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a1, a1, 2
-; RV64-NEXT:    bnez a1, .LBB1_4
-; RV64-NEXT:  .LBB1_2: # %else2
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB1_3: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vse16.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a1, a1, 2
-; RV64-NEXT:    beqz a1, .LBB1_2
-; RV64-NEXT:  .LBB1_4: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vse16.v v8, (a0)
+; RV64-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; RV64-NEXT:    vse16.v v9, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v2f16(<2 x half> %v, ptr align 2 %base, <2 x i1> %mask)
   ret void
@@ -84,88 +52,20 @@ declare void @llvm.masked.compressstore.v4f16(<4 x half>, ptr, <4 x i1>)
 define void @compressstore_v4f16(ptr %base, <4 x half> %v, <4 x i1> %mask) {
 ; RV32-LABEL: compressstore_v4f16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB2_5
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    bnez a2, .LBB2_6
-; RV32-NEXT:  .LBB2_2: # %else2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    bnez a2, .LBB2_7
-; RV32-NEXT:  .LBB2_3: # %else5
-; RV32-NEXT:    andi a1, a1, 8
-; RV32-NEXT:    bnez a1, .LBB2_8
-; RV32-NEXT:  .LBB2_4: # %else8
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB2_5: # %cond.store
-; RV32-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT:    vse16.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    beqz a2, .LBB2_2
-; RV32-NEXT:  .LBB2_6: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vse16.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    beqz a2, .LBB2_3
-; RV32-NEXT:  .LBB2_7: # %cond.store4
-; RV32-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; RV32-NEXT:    vse16.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a1, a1, 8
-; RV32-NEXT:    beqz a1, .LBB2_4
-; RV32-NEXT:  .LBB2_8: # %cond.store7
-; RV32-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vse16.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v4f16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB2_5
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    bnez a2, .LBB2_6
-; RV64-NEXT:  .LBB2_2: # %else2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    bnez a2, .LBB2_7
-; RV64-NEXT:  .LBB2_3: # %else5
-; RV64-NEXT:    andi a1, a1, 8
-; RV64-NEXT:    bnez a1, .LBB2_8
-; RV64-NEXT:  .LBB2_4: # %else8
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB2_5: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT:    vse16.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    beqz a2, .LBB2_2
-; RV64-NEXT:  .LBB2_6: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vse16.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    beqz a2, .LBB2_3
-; RV64-NEXT:  .LBB2_7: # %cond.store4
-; RV64-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; RV64-NEXT:    vse16.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a1, a1, 8
-; RV64-NEXT:    beqz a1, .LBB2_4
-; RV64-NEXT:  .LBB2_8: # %cond.store7
-; RV64-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vse16.v v8, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v4f16(<4 x half> %v, ptr align 2 %base, <4 x i1> %mask)
   ret void
@@ -175,168 +75,20 @@ declare void @llvm.masked.compressstore.v8f16(<8 x half>, ptr, <8 x i1>)
 define void @compressstore_v8f16(ptr %base, <8 x half> %v, <8 x i1> %mask) {
 ; RV32-LABEL: compressstore_v8f16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB3_9
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    bnez a2, .LBB3_10
-; RV32-NEXT:  .LBB3_2: # %else2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    bnez a2, .LBB3_11
-; RV32-NEXT:  .LBB3_3: # %else5
-; RV32-NEXT:    andi a2, a1, 8
-; RV32-NEXT:    bnez a2, .LBB3_12
-; RV32-NEXT:  .LBB3_4: # %else8
-; RV32-NEXT:    andi a2, a1, 16
-; RV32-NEXT:    bnez a2, .LBB3_13
-; RV32-NEXT:  .LBB3_5: # %else11
-; RV32-NEXT:    andi a2, a1, 32
-; RV32-NEXT:    bnez a2, .LBB3_14
-; RV32-NEXT:  .LBB3_6: # %else14
-; RV32-NEXT:    andi a2, a1, 64
-; RV32-NEXT:    bnez a2, .LBB3_15
-; RV32-NEXT:  .LBB3_7: # %else17
-; RV32-NEXT:    andi a1, a1, -128
-; RV32-NEXT:    bnez a1, .LBB3_16
-; RV32-NEXT:  .LBB3_8: # %else20
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB3_9: # %cond.store
-; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT:    vse16.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    beqz a2, .LBB3_2
-; RV32-NEXT:  .LBB3_10: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vse16.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    beqz a2, .LBB3_3
-; RV32-NEXT:  .LBB3_11: # %cond.store4
-; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
-; RV32-NEXT:    vse16.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a2, a1, 8
-; RV32-NEXT:    beqz a2, .LBB3_4
-; RV32-NEXT:  .LBB3_12: # %cond.store7
-; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 3
-; RV32-NEXT:    vse16.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a2, a1, 16
-; RV32-NEXT:    beqz a2, .LBB3_5
-; RV32-NEXT:  .LBB3_13: # %cond.store10
-; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 4
+; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; RV32-NEXT:    vse16.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a2, a1, 32
-; RV32-NEXT:    beqz a2, .LBB3_6
-; RV32-NEXT:  .LBB3_14: # %cond.store13
-; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 5
-; RV32-NEXT:    vse16.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a2, a1, 64
-; RV32-NEXT:    beqz a2, .LBB3_7
-; RV32-NEXT:  .LBB3_15: # %cond.store16
-; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 6
-; RV32-NEXT:    vse16.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 2
-; RV32-NEXT:    andi a1, a1, -128
-; RV32-NEXT:    beqz a1, .LBB3_8
-; RV32-NEXT:  .LBB3_16: # %cond.store19
-; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 7
-; RV32-NEXT:    vse16.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v8f16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB3_9
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    bnez a2, .LBB3_10
-; RV64-NEXT:  .LBB3_2: # %else2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    bnez a2, .LBB3_11
-; RV64-NEXT:  .LBB3_3: # %else5
-; RV64-NEXT:    andi a2, a1, 8
-; RV64-NEXT:    bnez a2, .LBB3_12
-; RV64-NEXT:  .LBB3_4: # %else8
-; RV64-NEXT:    andi a2, a1, 16
-; RV64-NEXT:    bnez a2, .LBB3_13
-; RV64-NEXT:  .LBB3_5: # %else11
-; RV64-NEXT:    andi a2, a1, 32
-; RV64-NEXT:    bnez a2, .LBB3_14
-; RV64-NEXT:  .LBB3_6: # %else14
-; RV64-NEXT:    andi a2, a1, 64
-; RV64-NEXT:    bnez a2, .LBB3_15
-; RV64-NEXT:  .LBB3_7: # %else17
-; RV64-NEXT:    andi a1, a1, -128
-; RV64-NEXT:    bnez a1, .LBB3_16
-; RV64-NEXT:  .LBB3_8: # %else20
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB3_9: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT:    vse16.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    beqz a2, .LBB3_2
-; RV64-NEXT:  .LBB3_10: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vse16.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    beqz a2, .LBB3_3
-; RV64-NEXT:  .LBB3_11: # %cond.store4
-; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vse16.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a2, a1, 8
-; RV64-NEXT:    beqz a2, .LBB3_4
-; RV64-NEXT:  .LBB3_12: # %cond.store7
-; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 3
+; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; RV64-NEXT:    vse16.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a2, a1, 16
-; RV64-NEXT:    beqz a2, .LBB3_5
-; RV64-NEXT:  .LBB3_13: # %cond.store10
-; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 4
-; RV64-NEXT:    vse16.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a2, a1, 32
-; RV64-NEXT:    beqz a2, .LBB3_6
-; RV64-NEXT:  .LBB3_14: # %cond.store13
-; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 5
-; RV64-NEXT:    vse16.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a2, a1, 64
-; RV64-NEXT:    beqz a2, .LBB3_7
-; RV64-NEXT:  .LBB3_15: # %cond.store16
-; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 6
-; RV64-NEXT:    vse16.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 2
-; RV64-NEXT:    andi a1, a1, -128
-; RV64-NEXT:    beqz a1, .LBB3_8
-; RV64-NEXT:  .LBB3_16: # %cond.store19
-; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 7
-; RV64-NEXT:    vse16.v v8, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v8f16(<8 x half> %v, ptr align 2 %base, <8 x i1> %mask)
   ret void
@@ -346,24 +98,20 @@ declare void @llvm.masked.compressstore.v1f32(<1 x float>, ptr, <1 x i1>)
 define void @compressstore_v1f32(ptr %base, <1 x float> %v, <1 x i1> %mask) {
 ; RV32-LABEL: compressstore_v1f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT:    vfirst.m a1, v0
-; RV32-NEXT:    bnez a1, .LBB4_2
-; RV32-NEXT:  # %bb.1: # %cond.store
 ; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vse32.v v8, (a0)
-; RV32-NEXT:  .LBB4_2: # %else
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT:    vse32.v v9, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v1f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT:    vfirst.m a1, v0
-; RV64-NEXT:    bnez a1, .LBB4_2
-; RV64-NEXT:  # %bb.1: # %cond.store
 ; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT:    vse32.v v8, (a0)
-; RV64-NEXT:  .LBB4_2: # %else
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT:    vse32.v v9, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v1f32(<1 x float> %v, ptr align 4 %base, <1 x i1> %mask)
   ret void
@@ -373,48 +121,20 @@ declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
 define void @compressstore_v2f32(ptr %base, <2 x float> %v, <2 x i1> %mask) {
 ; RV32-LABEL: compressstore_v2f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB5_3
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a1, a1, 2
-; RV32-NEXT:    bnez a1, .LBB5_4
-; RV32-NEXT:  .LBB5_2: # %else2
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB5_3: # %cond.store
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vse32.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a1, a1, 2
-; RV32-NEXT:    beqz a1, .LBB5_2
-; RV32-NEXT:  .LBB5_4: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-NEXT:    vse32.v v8, (a0)
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT:    vse32.v v9, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v2f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB5_3
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a1, a1, 2
-; RV64-NEXT:    bnez a1, .LBB5_4
-; RV64-NEXT:  .LBB5_2: # %else2
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB5_3: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT:    vse32.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a1, a1, 2
-; RV64-NEXT:    beqz a1, .LBB5_2
-; RV64-NEXT:  .LBB5_4: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vse32.v v8, (a0)
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT:    vse32.v v9, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v2f32(<2 x float> %v, ptr align 4 %base, <2 x i1> %mask)
   ret void
@@ -424,88 +144,20 @@ declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
 define void @compressstore_v4f32(ptr %base, <4 x float> %v, <4 x i1> %mask) {
 ; RV32-LABEL: compressstore_v4f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB6_5
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    bnez a2, .LBB6_6
-; RV32-NEXT:  .LBB6_2: # %else2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    bnez a2, .LBB6_7
-; RV32-NEXT:  .LBB6_3: # %else5
-; RV32-NEXT:    andi a1, a1, 8
-; RV32-NEXT:    bnez a1, .LBB6_8
-; RV32-NEXT:  .LBB6_4: # %else8
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB6_5: # %cond.store
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vse32.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    beqz a2, .LBB6_2
-; RV32-NEXT:  .LBB6_6: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vse32.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    beqz a2, .LBB6_3
-; RV32-NEXT:  .LBB6_7: # %cond.store4
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; RV32-NEXT:    vse32.v v9, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a1, a1, 8
-; RV32-NEXT:    beqz a1, .LBB6_4
-; RV32-NEXT:  .LBB6_8: # %cond.store7
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v4f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB6_5
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    bnez a2, .LBB6_6
-; RV64-NEXT:  .LBB6_2: # %else2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    bnez a2, .LBB6_7
-; RV64-NEXT:  .LBB6_3: # %else5
-; RV64-NEXT:    andi a1, a1, 8
-; RV64-NEXT:    bnez a1, .LBB6_8
-; RV64-NEXT:  .LBB6_4: # %else8
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB6_5: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vse32.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    beqz a2, .LBB6_2
-; RV64-NEXT:  .LBB6_6: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vse32.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    beqz a2, .LBB6_3
-; RV64-NEXT:  .LBB6_7: # %cond.store4
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; RV64-NEXT:    vse32.v v9, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a1, a1, 8
-; RV64-NEXT:    beqz a1, .LBB6_4
-; RV64-NEXT:  .LBB6_8: # %cond.store7
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vse32.v v8, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v4f32(<4 x float> %v, ptr align 4 %base, <4 x i1> %mask)
   ret void
@@ -515,176 +167,20 @@ declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
 define void @compressstore_v8f32(ptr %base, <8 x float> %v, <8 x i1> %mask) {
 ; RV32-LABEL: compressstore_v8f32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB7_9
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    bnez a2, .LBB7_10
-; RV32-NEXT:  .LBB7_2: # %else2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    bnez a2, .LBB7_11
-; RV32-NEXT:  .LBB7_3: # %else5
-; RV32-NEXT:    andi a2, a1, 8
-; RV32-NEXT:    bnez a2, .LBB7_12
-; RV32-NEXT:  .LBB7_4: # %else8
-; RV32-NEXT:    andi a2, a1, 16
-; RV32-NEXT:    bnez a2, .LBB7_13
-; RV32-NEXT:  .LBB7_5: # %else11
-; RV32-NEXT:    andi a2, a1, 32
-; RV32-NEXT:    bnez a2, .LBB7_14
-; RV32-NEXT:  .LBB7_6: # %else14
-; RV32-NEXT:    andi a2, a1, 64
-; RV32-NEXT:    bnez a2, .LBB7_15
-; RV32-NEXT:  .LBB7_7: # %else17
-; RV32-NEXT:    andi a1, a1, -128
-; RV32-NEXT:    bnez a1, .LBB7_16
-; RV32-NEXT:  .LBB7_8: # %else20
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB7_9: # %cond.store
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vse32.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    beqz a2, .LBB7_2
-; RV32-NEXT:  .LBB7_10: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vse32.v v10, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    beqz a2, .LBB7_3
-; RV32-NEXT:  .LBB7_11: # %cond.store4
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vse32.v v10, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a2, a1, 8
-; RV32-NEXT:    beqz a2, .LBB7_4
-; RV32-NEXT:  .LBB7_12: # %cond.store7
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vse32.v v10, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a2, a1, 16
-; RV32-NEXT:    beqz a2, .LBB7_5
-; RV32-NEXT:  .LBB7_13: # %cond.store10
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vse32.v v10, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a2, a1, 32
-; RV32-NEXT:    beqz a2, .LBB7_6
-; RV32-NEXT:  .LBB7_14: # %cond.store13
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vse32.v v10, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a2, a1, 64
-; RV32-NEXT:    beqz a2, .LBB7_7
-; RV32-NEXT:  .LBB7_15: # %cond.store16
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vcompress.vm v10, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; RV32-NEXT:    vse32.v v10, (a0)
-; RV32-NEXT:    addi a0, a0, 4
-; RV32-NEXT:    andi a1, a1, -128
-; RV32-NEXT:    beqz a1, .LBB7_8
-; RV32-NEXT:  .LBB7_16: # %cond.store19
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 7
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v8f32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB7_9
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    bnez a2, .LBB7_10
-; RV64-NEXT:  .LBB7_2: # %else2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    bnez a2, .LBB7_11
-; RV64-NEXT:  .LBB7_3: # %else5
-; RV64-NEXT:    andi a2, a1, 8
-; RV64-NEXT:    bnez a2, .LBB7_12
-; RV64-NEXT:  .LBB7_4: # %else8
-; RV64-NEXT:    andi a2, a1, 16
-; RV64-NEXT:    bnez a2, .LBB7_13
-; RV64-NEXT:  .LBB7_5: # %else11
-; RV64-NEXT:    andi a2, a1, 32
-; RV64-NEXT:    bnez a2, .LBB7_14
-; RV64-NEXT:  .LBB7_6: # %else14
-; RV64-NEXT:    andi a2, a1, 64
-; RV64-NEXT:    bnez a2, .LBB7_15
-; RV64-NEXT:  .LBB7_7: # %else17
-; RV64-NEXT:    andi a1, a1, -128
-; RV64-NEXT:    bnez a1, .LBB7_16
-; RV64-NEXT:  .LBB7_8: # %else20
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB7_9: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vse32.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    beqz a2, .LBB7_2
-; RV64-NEXT:  .LBB7_10: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vse32.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    beqz a2, .LBB7_3
-; RV64-NEXT:  .LBB7_11: # %cond.store4
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vse32.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a2, a1, 8
-; RV64-NEXT:    beqz a2, .LBB7_4
-; RV64-NEXT:  .LBB7_12: # %cond.store7
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vse32.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a2, a1, 16
-; RV64-NEXT:    beqz a2, .LBB7_5
-; RV64-NEXT:  .LBB7_13: # %cond.store10
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vse32.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a2, a1, 32
-; RV64-NEXT:    beqz a2, .LBB7_6
-; RV64-NEXT:  .LBB7_14: # %cond.store13
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vse32.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a2, a1, 64
-; RV64-NEXT:    beqz a2, .LBB7_7
-; RV64-NEXT:  .LBB7_15: # %cond.store16
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vcompress.vm v10, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; RV64-NEXT:    vse32.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 4
-; RV64-NEXT:    andi a1, a1, -128
-; RV64-NEXT:    beqz a1, .LBB7_8
-; RV64-NEXT:  .LBB7_16: # %cond.store19
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 7
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vse32.v v8, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v8f32(<8 x float> %v, ptr align 4 %base, <8 x i1> %mask)
   ret void
@@ -694,24 +190,20 @@ declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
 define void @compressstore_v1f64(ptr %base, <1 x double> %v, <1 x i1> %mask) {
 ; RV32-LABEL: compressstore_v1f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT:    vfirst.m a1, v0
-; RV32-NEXT:    bnez a1, .LBB8_2
-; RV32-NEXT:  # %bb.1: # %cond.store
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vse64.v v8, (a0)
-; RV32-NEXT:  .LBB8_2: # %else
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT:    vse64.v v9, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v1f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT:    vfirst.m a1, v0
-; RV64-NEXT:    bnez a1, .LBB8_2
-; RV64-NEXT:  # %bb.1: # %cond.store
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:  .LBB8_2: # %else
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v9, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v1f64(<1 x double> %v, ptr align 8 %base, <1 x i1> %mask)
   ret void
@@ -721,48 +213,20 @@ declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
 define void @compressstore_v2f64(ptr %base, <2 x double> %v, <2 x i1> %mask) {
 ; RV32-LABEL: compressstore_v2f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB9_3
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a1, a1, 2
-; RV32-NEXT:    bnez a1, .LBB9_4
-; RV32-NEXT:  .LBB9_2: # %else2
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB9_3: # %cond.store
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vse64.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a1, a1, 2
-; RV32-NEXT:    beqz a1, .LBB9_2
-; RV32-NEXT:  .LBB9_4: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-NEXT:    vse64.v v8, (a0)
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vcompress.vm v9, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT:    vse64.v v9, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v2f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB9_3
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a1, a1, 2
-; RV64-NEXT:    bnez a1, .LBB9_4
-; RV64-NEXT:  .LBB9_2: # %else2
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB9_3: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a1, a1, 2
-; RV64-NEXT:    beqz a1, .LBB9_2
-; RV64-NEXT:  .LBB9_4: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vse64.v v8, (a0)
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vcompress.vm v9, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT:    vse64.v v9, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v2f64(<2 x double> %v, ptr align 8 %base, <2 x i1> %mask)
   ret void
@@ -772,92 +236,20 @@ declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
 define void @compressstore_v4f64(ptr %base, <4 x double> %v, <4 x i1> %mask) {
 ; RV32-LABEL: compressstore_v4f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB10_5
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    bnez a2, .LBB10_6
-; RV32-NEXT:  .LBB10_2: # %else2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    bnez a2, .LBB10_7
-; RV32-NEXT:  .LBB10_3: # %else5
-; RV32-NEXT:    andi a1, a1, 8
-; RV32-NEXT:    bnez a1, .LBB10_8
-; RV32-NEXT:  .LBB10_4: # %else8
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB10_5: # %cond.store
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vse64.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    beqz a2, .LBB10_2
-; RV32-NEXT:  .LBB10_6: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vcompress.vm v10, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV32-NEXT:    vse64.v v10, (a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    beqz a2, .LBB10_3
-; RV32-NEXT:  .LBB10_7: # %cond.store4
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vse64.v v10, (a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a1, a1, 8
-; RV32-NEXT:    beqz a1, .LBB10_4
-; RV32-NEXT:  .LBB10_8: # %cond.store7
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v4f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB10_5
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    bnez a2, .LBB10_6
-; RV64-NEXT:  .LBB10_2: # %else2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    bnez a2, .LBB10_7
-; RV64-NEXT:  .LBB10_3: # %else5
-; RV64-NEXT:    andi a1, a1, 8
-; RV64-NEXT:    bnez a1, .LBB10_8
-; RV64-NEXT:  .LBB10_4: # %else8
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB10_5: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    beqz a2, .LBB10_2
-; RV64-NEXT:  .LBB10_6: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vse64.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    beqz a2, .LBB10_3
-; RV64-NEXT:  .LBB10_7: # %cond.store4
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vcompress.vm v10, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
 ; RV64-NEXT:    vse64.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a1, a1, 8
-; RV64-NEXT:    beqz a1, .LBB10_4
-; RV64-NEXT:  .LBB10_8: # %cond.store7
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v4f64(<4 x double> %v, ptr align 8 %base, <4 x i1> %mask)
   ret void
@@ -867,213 +259,21 @@ declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
 define void @compressstore_v8f64(ptr %base, <8 x double> %v, <8 x i1> %mask) {
 ; RV32-LABEL: compressstore_v8f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB11_11
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    bnez a2, .LBB11_12
-; RV32-NEXT:  .LBB11_2: # %else2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    bnez a2, .LBB11_13
-; RV32-NEXT:  .LBB11_3: # %else5
-; RV32-NEXT:    andi a2, a1, 8
-; RV32-NEXT:    beqz a2, .LBB11_5
-; RV32-NEXT:  .LBB11_4: # %cond.store7
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vse64.v v12, (a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:  .LBB11_5: # %else8
-; RV32-NEXT:    addi sp, sp, -320
-; RV32-NEXT:    .cfi_def_cfa_offset 320
-; RV32-NEXT:    sw ra, 316(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 312(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 320
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    andi a2, a1, 16
-; RV32-NEXT:    bnez a2, .LBB11_14
-; RV32-NEXT:  # %bb.6: # %else11
-; RV32-NEXT:    andi a2, a1, 32
-; RV32-NEXT:    bnez a2, .LBB11_15
-; RV32-NEXT:  .LBB11_7: # %else14
-; RV32-NEXT:    andi a2, a1, 64
-; RV32-NEXT:    bnez a2, .LBB11_16
-; RV32-NEXT:  .LBB11_8: # %else17
-; RV32-NEXT:    andi a1, a1, -128
-; RV32-NEXT:    beqz a1, .LBB11_10
-; RV32-NEXT:  .LBB11_9: # %cond.store19
-; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vse64.v v8, (a1)
-; RV32-NEXT:    fld fa5, 56(sp)
-; RV32-NEXT:    fsd fa5, 0(a0)
-; RV32-NEXT:  .LBB11_10: # %else20
-; RV32-NEXT:    addi sp, s0, -320
-; RV32-NEXT:    lw ra, 316(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 312(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 320
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB11_11: # %cond.store
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vse64.v v8, (a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    beqz a2, .LBB11_2
-; RV32-NEXT:  .LBB11_12: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
+; RV32-NEXT:    vcompress.vm v12, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV32-NEXT:    vse64.v v12, (a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    beqz a2, .LBB11_3
-; RV32-NEXT:  .LBB11_13: # %cond.store4
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vse64.v v12, (a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 8
-; RV32-NEXT:    bnez a2, .LBB11_4
-; RV32-NEXT:    j .LBB11_5
-; RV32-NEXT:  .LBB11_14: # %cond.store10
-; RV32-NEXT:    addi a2, sp, 192
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vse64.v v8, (a2)
-; RV32-NEXT:    fld fa5, 224(sp)
-; RV32-NEXT:    fsd fa5, 0(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 32
-; RV32-NEXT:    beqz a2, .LBB11_7
-; RV32-NEXT:  .LBB11_15: # %cond.store13
-; RV32-NEXT:    addi a2, sp, 128
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vse64.v v8, (a2)
-; RV32-NEXT:    fld fa5, 168(sp)
-; RV32-NEXT:    fsd fa5, 0(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 64
-; RV32-NEXT:    beqz a2, .LBB11_8
-; RV32-NEXT:  .LBB11_16: # %cond.store16
-; RV32-NEXT:    addi a2, sp, 64
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vse64.v v8, (a2)
-; RV32-NEXT:    fld fa5, 112(sp)
-; RV32-NEXT:    fsd fa5, 0(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a1, a1, -128
-; RV32-NEXT:    bnez a1, .LBB11_9
-; RV32-NEXT:    j .LBB11_10
+; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: compressstore_v8f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB11_11
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    bnez a2, .LBB11_12
-; RV64-NEXT:  .LBB11_2: # %else2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    bnez a2, .LBB11_13
-; RV64-NEXT:  .LBB11_3: # %else5
-; RV64-NEXT:    andi a2, a1, 8
-; RV64-NEXT:    beqz a2, .LBB11_5
-; RV64-NEXT:  .LBB11_4: # %cond.store7
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v12, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:  .LBB11_5: # %else8
-; RV64-NEXT:    addi sp, sp, -320
-; RV64-NEXT:    .cfi_def_cfa_offset 320
-; RV64-NEXT:    sd ra, 312(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 304(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 320
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    andi a2, a1, 16
-; RV64-NEXT:    bnez a2, .LBB11_14
-; RV64-NEXT:  # %bb.6: # %else11
-; RV64-NEXT:    andi a2, a1, 32
-; RV64-NEXT:    bnez a2, .LBB11_15
-; RV64-NEXT:  .LBB11_7: # %else14
-; RV64-NEXT:    andi a2, a1, 64
-; RV64-NEXT:    bnez a2, .LBB11_16
-; RV64-NEXT:  .LBB11_8: # %else17
-; RV64-NEXT:    andi a1, a1, -128
-; RV64-NEXT:    beqz a1, .LBB11_10
-; RV64-NEXT:  .LBB11_9: # %cond.store19
-; RV64-NEXT:    mv a1, sp
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a1)
-; RV64-NEXT:    fld fa5, 56(sp)
-; RV64-NEXT:    fsd fa5, 0(a0)
-; RV64-NEXT:  .LBB11_10: # %else20
-; RV64-NEXT:    addi sp, s0, -320
-; RV64-NEXT:    ld ra, 312(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 304(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 320
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB11_11: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    beqz a2, .LBB11_2
-; RV64-NEXT:  .LBB11_12: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    vcompress.vm v12, v8, v0
+; RV64-NEXT:    vcpop.m a1, v0
+; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
 ; RV64-NEXT:    vse64.v v12, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    beqz a2, .LBB11_3
-; RV64-NEXT:  .LBB11_13: # %cond.store4
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v12, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 8
-; RV64-NEXT:    bnez a2, .LBB11_4
-; RV64-NEXT:    j .LBB11_5
-; RV64-NEXT:  .LBB11_14: # %cond.store10
-; RV64-NEXT:    addi a2, sp, 192
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    fld fa5, 224(sp)
-; RV64-NEXT:    fsd fa5, 0(a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 32
-; RV64-NEXT:    beqz a2, .LBB11_7
-; RV64-NEXT:  .LBB11_15: # %cond.store13
-; RV64-NEXT:    addi a2, sp, 128
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    fld fa5, 168(sp)
-; RV64-NEXT:    fsd fa5, 0(a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 64
-; RV64-NEXT:    beqz a2, .LBB11_8
-; RV64-NEXT:  .LBB11_16: # %cond.store16
-; RV64-NEXT:    addi a2, sp, 64
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    fld fa5, 112(sp)
-; RV64-NEXT:    fsd fa5, 0(a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a1, a1, -128
-; RV64-NEXT:    bnez a1, .LBB11_9
-; RV64-NEXT:    j .LBB11_10
+; RV64-NEXT:    ret
   call void @llvm.masked.compressstore.v8f64(<8 x double> %v, ptr align 8 %base, <8 x i1> %mask)
   ret void
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll
index eb0096dbfba6d..a388ba92f302b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll
@@ -6,13 +6,11 @@ declare void @llvm.masked.compressstore.v1i8(<1 x i8>, ptr, <1 x i1>)
 define void @compressstore_v1i8(ptr %base, <1 x i8> %v, <1 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vfirst.m a1, v0
-; CHECK-NEXT:    bnez a1, .LBB0_2
-; CHECK-NEXT:  # %bb.1: # %cond.store
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vse8.v v8, (a0)
-; CHECK-NEXT:  .LBB0_2: # %else
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vse8.v v9, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v1i8(<1 x i8> %v, ptr %base, <1 x i1> %mask)
   ret void
@@ -22,25 +20,11 @@ declare void @llvm.masked.compressstore.v2i8(<2 x i8>, ptr, <2 x i1>)
 define void @compressstore_v2i8(ptr %base, <2 x i8> %v, <2 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v0
-; CHECK-NEXT:    andi a2, a1, 1
-; CHECK-NEXT:    bnez a2, .LBB1_3
-; CHECK-NEXT:  # %bb.1: # %else
-; CHECK-NEXT:    andi a1, a1, 2
-; CHECK-NEXT:    bnez a1, .LBB1_4
-; CHECK-NEXT:  .LBB1_2: # %else2
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB1_3: # %cond.store
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vse8.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a1, a1, 2
-; CHECK-NEXT:    beqz a1, .LBB1_2
-; CHECK-NEXT:  .LBB1_4: # %cond.store1
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vse8.v v9, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v2i8(<2 x i8> %v, ptr %base, <2 x i1> %mask)
   ret void
@@ -50,45 +34,11 @@ declare void @llvm.masked.compressstore.v4i8(<4 x i8>, ptr, <4 x i1>)
 define void @compressstore_v4i8(ptr %base, <4 x i8> %v, <4 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v0
-; CHECK-NEXT:    andi a2, a1, 1
-; CHECK-NEXT:    bnez a2, .LBB2_5
-; CHECK-NEXT:  # %bb.1: # %else
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    bnez a2, .LBB2_6
-; CHECK-NEXT:  .LBB2_2: # %else2
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    bnez a2, .LBB2_7
-; CHECK-NEXT:  .LBB2_3: # %else5
-; CHECK-NEXT:    andi a1, a1, 8
-; CHECK-NEXT:    bnez a1, .LBB2_8
-; CHECK-NEXT:  .LBB2_4: # %else8
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB2_5: # %cond.store
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT:    vse8.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    beqz a2, .LBB2_2
-; CHECK-NEXT:  .LBB2_6: # %cond.store1
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vse8.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    beqz a2, .LBB2_3
-; CHECK-NEXT:  .LBB2_7: # %cond.store4
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vse8.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a1, a1, 8
-; CHECK-NEXT:    beqz a1, .LBB2_4
-; CHECK-NEXT:  .LBB2_8: # %cond.store7
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 3
-; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v4i8(<4 x i8> %v, ptr %base, <4 x i1> %mask)
   ret void
@@ -98,85 +48,11 @@ declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
 define void @compressstore_v8i8(ptr %base, <8 x i8> %v, <8 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v0
-; CHECK-NEXT:    andi a2, a1, 1
-; CHECK-NEXT:    bnez a2, .LBB3_9
-; CHECK-NEXT:  # %bb.1: # %else
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    bnez a2, .LBB3_10
-; CHECK-NEXT:  .LBB3_2: # %else2
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    bnez a2, .LBB3_11
-; CHECK-NEXT:  .LBB3_3: # %else5
-; CHECK-NEXT:    andi a2, a1, 8
-; CHECK-NEXT:    bnez a2, .LBB3_12
-; CHECK-NEXT:  .LBB3_4: # %else8
-; CHECK-NEXT:    andi a2, a1, 16
-; CHECK-NEXT:    bnez a2, .LBB3_13
-; CHECK-NEXT:  .LBB3_5: # %else11
-; CHECK-NEXT:    andi a2, a1, 32
-; CHECK-NEXT:    bnez a2, .LBB3_14
-; CHECK-NEXT:  .LBB3_6: # %else14
-; CHECK-NEXT:    andi a2, a1, 64
-; CHECK-NEXT:    bnez a2, .LBB3_15
-; CHECK-NEXT:  .LBB3_7: # %else17
-; CHECK-NEXT:    andi a1, a1, -128
-; CHECK-NEXT:    bnez a1, .LBB3_16
-; CHECK-NEXT:  .LBB3_8: # %else20
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB3_9: # %cond.store
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vse8.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    beqz a2, .LBB3_2
-; CHECK-NEXT:  .LBB3_10: # %cond.store1
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vse8.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    beqz a2, .LBB3_3
-; CHECK-NEXT:  .LBB3_11: # %cond.store4
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vse8.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a2, a1, 8
-; CHECK-NEXT:    beqz a2, .LBB3_4
-; CHECK-NEXT:  .LBB3_12: # %cond.store7
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
-; CHECK-NEXT:    vse8.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a2, a1, 16
-; CHECK-NEXT:    beqz a2, .LBB3_5
-; CHECK-NEXT:  .LBB3_13: # %cond.store10
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 4
-; CHECK-NEXT:    vse8.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a2, a1, 32
-; CHECK-NEXT:    beqz a2, .LBB3_6
-; CHECK-NEXT:  .LBB3_14: # %cond.store13
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 5
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vse8.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a2, a1, 64
-; CHECK-NEXT:    beqz a2, .LBB3_7
-; CHECK-NEXT:  .LBB3_15: # %cond.store16
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 6
-; CHECK-NEXT:    vse8.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    andi a1, a1, -128
-; CHECK-NEXT:    beqz a1, .LBB3_8
-; CHECK-NEXT:  .LBB3_16: # %cond.store19
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 7
-; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v8i8(<8 x i8> %v, ptr %base, <8 x i1> %mask)
   ret void
@@ -186,13 +62,11 @@ declare void @llvm.masked.compressstore.v1i16(<1 x i16>, ptr, <1 x i1>)
 define void @compressstore_v1i16(ptr %base, <1 x i16> %v, <1 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vfirst.m a1, v0
-; CHECK-NEXT:    bnez a1, .LBB4_2
-; CHECK-NEXT:  # %bb.1: # %cond.store
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vse16.v v8, (a0)
-; CHECK-NEXT:  .LBB4_2: # %else
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vse16.v v9, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v1i16(<1 x i16> %v, ptr align 2 %base, <1 x i1> %mask)
   ret void
@@ -202,25 +76,11 @@ declare void @llvm.masked.compressstore.v2i16(<2 x i16>, ptr, <2 x i1>)
 define void @compressstore_v2i16(ptr %base, <2 x i16> %v, <2 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v0
-; CHECK-NEXT:    andi a2, a1, 1
-; CHECK-NEXT:    bnez a2, .LBB5_3
-; CHECK-NEXT:  # %bb.1: # %else
-; CHECK-NEXT:    andi a1, a1, 2
-; CHECK-NEXT:    bnez a1, .LBB5_4
-; CHECK-NEXT:  .LBB5_2: # %else2
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB5_3: # %cond.store
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vse16.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a1, a1, 2
-; CHECK-NEXT:    beqz a1, .LBB5_2
-; CHECK-NEXT:  .LBB5_4: # %cond.store1
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vse16.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vse16.v v9, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v2i16(<2 x i16> %v, ptr align 2 %base, <2 x i1> %mask)
   ret void
@@ -230,45 +90,11 @@ declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
 define void @compressstore_v4i16(ptr %base, <4 x i16> %v, <4 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v0
-; CHECK-NEXT:    andi a2, a1, 1
-; CHECK-NEXT:    bnez a2, .LBB6_5
-; CHECK-NEXT:  # %bb.1: # %else
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    bnez a2, .LBB6_6
-; CHECK-NEXT:  .LBB6_2: # %else2
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    bnez a2, .LBB6_7
-; CHECK-NEXT:  .LBB6_3: # %else5
-; CHECK-NEXT:    andi a1, a1, 8
-; CHECK-NEXT:    bnez a1, .LBB6_8
-; CHECK-NEXT:  .LBB6_4: # %else8
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB6_5: # %cond.store
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT:    vse16.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    beqz a2, .LBB6_2
-; CHECK-NEXT:  .LBB6_6: # %cond.store1
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vse16.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    beqz a2, .LBB6_3
-; CHECK-NEXT:  .LBB6_7: # %cond.store4
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
 ; CHECK-NEXT:    vse16.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a1, a1, 8
-; CHECK-NEXT:    beqz a1, .LBB6_4
-; CHECK-NEXT:  .LBB6_8: # %cond.store7
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 3
-; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v4i16(<4 x i16> %v, ptr align 2 %base, <4 x i1> %mask)
   ret void
@@ -278,85 +104,11 @@ declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
 define void @compressstore_v8i16(ptr %base, <8 x i16> %v, <8 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v0
-; CHECK-NEXT:    andi a2, a1, 1
-; CHECK-NEXT:    bnez a2, .LBB7_9
-; CHECK-NEXT:  # %bb.1: # %else
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    bnez a2, .LBB7_10
-; CHECK-NEXT:  .LBB7_2: # %else2
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    bnez a2, .LBB7_11
-; CHECK-NEXT:  .LBB7_3: # %else5
-; CHECK-NEXT:    andi a2, a1, 8
-; CHECK-NEXT:    bnez a2, .LBB7_12
-; CHECK-NEXT:  .LBB7_4: # %else8
-; CHECK-NEXT:    andi a2, a1, 16
-; CHECK-NEXT:    bnez a2, .LBB7_13
-; CHECK-NEXT:  .LBB7_5: # %else11
-; CHECK-NEXT:    andi a2, a1, 32
-; CHECK-NEXT:    bnez a2, .LBB7_14
-; CHECK-NEXT:  .LBB7_6: # %else14
-; CHECK-NEXT:    andi a2, a1, 64
-; CHECK-NEXT:    bnez a2, .LBB7_15
-; CHECK-NEXT:  .LBB7_7: # %else17
-; CHECK-NEXT:    andi a1, a1, -128
-; CHECK-NEXT:    bnez a1, .LBB7_16
-; CHECK-NEXT:  .LBB7_8: # %else20
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB7_9: # %cond.store
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vse16.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    beqz a2, .LBB7_2
-; CHECK-NEXT:  .LBB7_10: # %cond.store1
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vse16.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    beqz a2, .LBB7_3
-; CHECK-NEXT:  .LBB7_11: # %cond.store4
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vse16.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a2, a1, 8
-; CHECK-NEXT:    beqz a2, .LBB7_4
-; CHECK-NEXT:  .LBB7_12: # %cond.store7
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
-; CHECK-NEXT:    vse16.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a2, a1, 16
-; CHECK-NEXT:    beqz a2, .LBB7_5
-; CHECK-NEXT:  .LBB7_13: # %cond.store10
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 4
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vse16.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a2, a1, 32
-; CHECK-NEXT:    beqz a2, .LBB7_6
-; CHECK-NEXT:  .LBB7_14: # %cond.store13
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 5
-; CHECK-NEXT:    vse16.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a2, a1, 64
-; CHECK-NEXT:    beqz a2, .LBB7_7
-; CHECK-NEXT:  .LBB7_15: # %cond.store16
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 6
-; CHECK-NEXT:    vse16.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 2
-; CHECK-NEXT:    andi a1, a1, -128
-; CHECK-NEXT:    beqz a1, .LBB7_8
-; CHECK-NEXT:  .LBB7_16: # %cond.store19
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 7
-; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v8i16(<8 x i16> %v, ptr align 2 %base, <8 x i1> %mask)
   ret void
@@ -366,13 +118,11 @@ declare void @llvm.masked.compressstore.v1i32(<1 x i32>, ptr, <1 x i1>)
 define void @compressstore_v1i32(ptr %base, <1 x i32> %v, <1 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vfirst.m a1, v0
-; CHECK-NEXT:    bnez a1, .LBB8_2
-; CHECK-NEXT:  # %bb.1: # %cond.store
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:  .LBB8_2: # %else
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vse32.v v9, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v1i32(<1 x i32> %v, ptr align 4 %base, <1 x i1> %mask)
   ret void
@@ -382,25 +132,11 @@ declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
 define void @compressstore_v2i32(ptr %base, <2 x i32> %v, <2 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v0
-; CHECK-NEXT:    andi a2, a1, 1
-; CHECK-NEXT:    bnez a2, .LBB9_3
-; CHECK-NEXT:  # %bb.1: # %else
-; CHECK-NEXT:    andi a1, a1, 2
-; CHECK-NEXT:    bnez a1, .LBB9_4
-; CHECK-NEXT:  .LBB9_2: # %else2
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB9_3: # %cond.store
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a1, a1, 2
-; CHECK-NEXT:    beqz a1, .LBB9_2
-; CHECK-NEXT:  .LBB9_4: # %cond.store1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vse32.v v9, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v2i32(<2 x i32> %v, ptr align 4 %base, <2 x i1> %mask)
   ret void
@@ -410,45 +146,11 @@ declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
 define void @compressstore_v4i32(ptr %base, <4 x i32> %v, <4 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v0
-; CHECK-NEXT:    andi a2, a1, 1
-; CHECK-NEXT:    bnez a2, .LBB10_5
-; CHECK-NEXT:  # %bb.1: # %else
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    bnez a2, .LBB10_6
-; CHECK-NEXT:  .LBB10_2: # %else2
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    bnez a2, .LBB10_7
-; CHECK-NEXT:  .LBB10_3: # %else5
-; CHECK-NEXT:    andi a1, a1, 8
-; CHECK-NEXT:    bnez a1, .LBB10_8
-; CHECK-NEXT:  .LBB10_4: # %else8
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB10_5: # %cond.store
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    beqz a2, .LBB10_2
-; CHECK-NEXT:  .LBB10_6: # %cond.store1
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vse32.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    beqz a2, .LBB10_3
-; CHECK-NEXT:  .LBB10_7: # %cond.store4
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vse32.v v9, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a1, a1, 8
-; CHECK-NEXT:    beqz a1, .LBB10_4
-; CHECK-NEXT:  .LBB10_8: # %cond.store7
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 3
-; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v4i32(<4 x i32> %v, ptr align 4 %base, <4 x i1> %mask)
   ret void
@@ -458,89 +160,11 @@ declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
 define void @compressstore_v8i32(ptr %base, <8 x i32> %v, <8 x i1> %mask) {
 ; CHECK-LABEL: compressstore_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.x.s a1, v0
-; CHECK-NEXT:    andi a2, a1, 1
-; CHECK-NEXT:    bnez a2, .LBB11_9
-; CHECK-NEXT:  # %bb.1: # %else
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    bnez a2, .LBB11_10
-; CHECK-NEXT:  .LBB11_2: # %else2
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    bnez a2, .LBB11_11
-; CHECK-NEXT:  .LBB11_3: # %else5
-; CHECK-NEXT:    andi a2, a1, 8
-; CHECK-NEXT:    bnez a2, .LBB11_12
-; CHECK-NEXT:  .LBB11_4: # %else8
-; CHECK-NEXT:    andi a2, a1, 16
-; CHECK-NEXT:    bnez a2, .LBB11_13
-; CHECK-NEXT:  .LBB11_5: # %else11
-; CHECK-NEXT:    andi a2, a1, 32
-; CHECK-NEXT:    bnez a2, .LBB11_14
-; CHECK-NEXT:  .LBB11_6: # %else14
-; CHECK-NEXT:    andi a2, a1, 64
-; CHECK-NEXT:    bnez a2, .LBB11_15
-; CHECK-NEXT:  .LBB11_7: # %else17
-; CHECK-NEXT:    andi a1, a1, -128
-; CHECK-NEXT:    bnez a1, .LBB11_16
-; CHECK-NEXT:  .LBB11_8: # %else20
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB11_9: # %cond.store
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a2, a1, 2
-; CHECK-NEXT:    beqz a2, .LBB11_2
-; CHECK-NEXT:  .LBB11_10: # %cond.store1
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 1
-; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a2, a1, 4
-; CHECK-NEXT:    beqz a2, .LBB11_3
-; CHECK-NEXT:  .LBB11_11: # %cond.store4
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
-; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a2, a1, 8
-; CHECK-NEXT:    beqz a2, .LBB11_4
-; CHECK-NEXT:  .LBB11_12: # %cond.store7
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vcompress.vm v10, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a2, a1, 16
-; CHECK-NEXT:    beqz a2, .LBB11_5
-; CHECK-NEXT:  .LBB11_13: # %cond.store10
-; CHECK-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a2, a1, 32
-; CHECK-NEXT:    beqz a2, .LBB11_6
-; CHECK-NEXT:  .LBB11_14: # %cond.store13
-; CHECK-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 5
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a2, a1, 64
-; CHECK-NEXT:    beqz a2, .LBB11_7
-; CHECK-NEXT:  .LBB11_15: # %cond.store16
-; CHECK-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    addi a0, a0, 4
-; CHECK-NEXT:    andi a1, a1, -128
-; CHECK-NEXT:    beqz a1, .LBB11_8
-; CHECK-NEXT:  .LBB11_16: # %cond.store19
-; CHECK-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v8, v8, 7
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v8i32(<8 x i32> %v, ptr align 4 %base, <8 x i1> %mask)
   ret void
@@ -548,439 +172,59 @@ define void @compressstore_v8i32(ptr %base, <8 x i32> %v, <8 x i1> %mask) {
 
 declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
 define void @compressstore_v1i64(ptr %base, <1 x i64> %v, <1 x i1> %mask) {
-; RV32-LABEL: compressstore_v1i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT:    vfirst.m a1, v0
-; RV32-NEXT:    bnez a1, .LBB12_2
-; RV32-NEXT:  # %bb.1: # %cond.store
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    vmv.x.s a1, v9
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    sw a2, 0(a0)
-; RV32-NEXT:    sw a1, 4(a0)
-; RV32-NEXT:  .LBB12_2: # %else
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: compressstore_v1i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT:    vfirst.m a1, v0
-; RV64-NEXT:    bnez a1, .LBB12_2
-; RV64-NEXT:  # %bb.1: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:  .LBB12_2: # %else
-; RV64-NEXT:    ret
+; CHECK-LABEL: compressstore_v1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vse64.v v9, (a0)
+; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v1i64(<1 x i64> %v, ptr align 8 %base, <1 x i1> %mask)
   ret void
 }
 
 declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
 define void @compressstore_v2i64(ptr %base, <2 x i64> %v, <2 x i1> %mask) {
-; RV32-LABEL: compressstore_v2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB13_3
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a1, a1, 2
-; RV32-NEXT:    bnez a1, .LBB13_4
-; RV32-NEXT:  .LBB13_2: # %else2
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB13_3: # %cond.store
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a2
-; RV32-NEXT:    vmv.x.s a2, v9
-; RV32-NEXT:    vmv.x.s a3, v8
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a1, a1, 2
-; RV32-NEXT:    beqz a1, .LBB13_2
-; RV32-NEXT:  .LBB13_4: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    vmv.x.s a1, v9
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    sw a2, 0(a0)
-; RV32-NEXT:    sw a1, 4(a0)
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: compressstore_v2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB13_3
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a1, a1, 2
-; RV64-NEXT:    bnez a1, .LBB13_4
-; RV64-NEXT:  .LBB13_2: # %else2
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB13_3: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a1, a1, 2
-; RV64-NEXT:    beqz a1, .LBB13_2
-; RV64-NEXT:  .LBB13_4: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    ret
+; CHECK-LABEL: compressstore_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vcompress.vm v9, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vse64.v v9, (a0)
+; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v2i64(<2 x i64> %v, ptr align 8 %base, <2 x i1> %mask)
   ret void
 }
 
 declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
 define void @compressstore_v4i64(ptr %base, <4 x i64> %v, <4 x i1> %mask) {
-; RV32-LABEL: compressstore_v4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB14_5
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    bnez a2, .LBB14_6
-; RV32-NEXT:  .LBB14_2: # %else2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    bnez a2, .LBB14_7
-; RV32-NEXT:  .LBB14_3: # %else5
-; RV32-NEXT:    andi a1, a1, 8
-; RV32-NEXT:    bnez a1, .LBB14_8
-; RV32-NEXT:  .LBB14_4: # %else8
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB14_5: # %cond.store
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vmv.x.s a3, v8
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    beqz a2, .LBB14_2
-; RV32-NEXT:  .LBB14_6: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v12, v10, a2
-; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    beqz a2, .LBB14_3
-; RV32-NEXT:  .LBB14_7: # %cond.store4
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v12, v10, a2
-; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a1, a1, 8
-; RV32-NEXT:    beqz a1, .LBB14_4
-; RV32-NEXT:  .LBB14_8: # %cond.store7
-; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    sw a2, 0(a0)
-; RV32-NEXT:    sw a1, 4(a0)
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: compressstore_v4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB14_5
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    bnez a2, .LBB14_6
-; RV64-NEXT:  .LBB14_2: # %else2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    bnez a2, .LBB14_7
-; RV64-NEXT:  .LBB14_3: # %else5
-; RV64-NEXT:    andi a1, a1, 8
-; RV64-NEXT:    bnez a1, .LBB14_8
-; RV64-NEXT:  .LBB14_4: # %else8
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB14_5: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    beqz a2, .LBB14_2
-; RV64-NEXT:  .LBB14_6: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vse64.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    beqz a2, .LBB14_3
-; RV64-NEXT:  .LBB14_7: # %cond.store4
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v10, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a1, a1, 8
-; RV64-NEXT:    beqz a1, .LBB14_4
-; RV64-NEXT:  .LBB14_8: # %cond.store7
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    ret
+; CHECK-LABEL: compressstore_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vcompress.vm v10, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vse64.v v10, (a0)
+; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v4i64(<4 x i64> %v, ptr align 8 %base, <4 x i1> %mask)
   ret void
 }
 
 declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
 define void @compressstore_v8i64(ptr %base, <8 x i64> %v, <8 x i1> %mask) {
-; RV32-LABEL: compressstore_v8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
-; RV32-NEXT:    andi a2, a1, 1
-; RV32-NEXT:    bnez a2, .LBB15_9
-; RV32-NEXT:  # %bb.1: # %else
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    bnez a2, .LBB15_10
-; RV32-NEXT:  .LBB15_2: # %else2
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    bnez a2, .LBB15_11
-; RV32-NEXT:  .LBB15_3: # %else5
-; RV32-NEXT:    andi a2, a1, 8
-; RV32-NEXT:    bnez a2, .LBB15_12
-; RV32-NEXT:  .LBB15_4: # %else8
-; RV32-NEXT:    andi a2, a1, 16
-; RV32-NEXT:    bnez a2, .LBB15_13
-; RV32-NEXT:  .LBB15_5: # %else11
-; RV32-NEXT:    andi a2, a1, 32
-; RV32-NEXT:    bnez a2, .LBB15_14
-; RV32-NEXT:  .LBB15_6: # %else14
-; RV32-NEXT:    andi a2, a1, 64
-; RV32-NEXT:    bnez a2, .LBB15_15
-; RV32-NEXT:  .LBB15_7: # %else17
-; RV32-NEXT:    andi a1, a1, -128
-; RV32-NEXT:    bnez a1, .LBB15_16
-; RV32-NEXT:  .LBB15_8: # %else20
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB15_9: # %cond.store
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a2
-; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vmv.x.s a3, v8
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 2
-; RV32-NEXT:    beqz a2, .LBB15_2
-; RV32-NEXT:  .LBB15_10: # %cond.store1
-; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 1
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v16, v12, a2
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 4
-; RV32-NEXT:    beqz a2, .LBB15_3
-; RV32-NEXT:  .LBB15_11: # %cond.store4
-; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v16, v12, a2
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 8
-; RV32-NEXT:    beqz a2, .LBB15_4
-; RV32-NEXT:  .LBB15_12: # %cond.store7
-; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v16, v12, a2
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 16
-; RV32-NEXT:    beqz a2, .LBB15_5
-; RV32-NEXT:  .LBB15_13: # %cond.store10
-; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v16, v12, a2
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 32
-; RV32-NEXT:    beqz a2, .LBB15_6
-; RV32-NEXT:  .LBB15_14: # %cond.store13
-; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v16, v12, a2
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a2, a1, 64
-; RV32-NEXT:    beqz a2, .LBB15_7
-; RV32-NEXT:  .LBB15_15: # %cond.store16
-; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v16, v12, a2
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a2, 4(a0)
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    andi a1, a1, -128
-; RV32-NEXT:    beqz a1, .LBB15_8
-; RV32-NEXT:  .LBB15_16: # %cond.store19
-; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 7
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v12, v8, a1
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    sw a2, 0(a0)
-; RV32-NEXT:    sw a1, 4(a0)
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: compressstore_v8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    andi a2, a1, 1
-; RV64-NEXT:    bnez a2, .LBB15_11
-; RV64-NEXT:  # %bb.1: # %else
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    bnez a2, .LBB15_12
-; RV64-NEXT:  .LBB15_2: # %else2
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    bnez a2, .LBB15_13
-; RV64-NEXT:  .LBB15_3: # %else5
-; RV64-NEXT:    andi a2, a1, 8
-; RV64-NEXT:    beqz a2, .LBB15_5
-; RV64-NEXT:  .LBB15_4: # %cond.store7
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v12, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:  .LBB15_5: # %else8
-; RV64-NEXT:    addi sp, sp, -320
-; RV64-NEXT:    .cfi_def_cfa_offset 320
-; RV64-NEXT:    sd ra, 312(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 304(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 320
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    andi a2, a1, 16
-; RV64-NEXT:    bnez a2, .LBB15_14
-; RV64-NEXT:  # %bb.6: # %else11
-; RV64-NEXT:    andi a2, a1, 32
-; RV64-NEXT:    bnez a2, .LBB15_15
-; RV64-NEXT:  .LBB15_7: # %else14
-; RV64-NEXT:    andi a2, a1, 64
-; RV64-NEXT:    bnez a2, .LBB15_16
-; RV64-NEXT:  .LBB15_8: # %else17
-; RV64-NEXT:    andi a1, a1, -128
-; RV64-NEXT:    beqz a1, .LBB15_10
-; RV64-NEXT:  .LBB15_9: # %cond.store19
-; RV64-NEXT:    mv a1, sp
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a1)
-; RV64-NEXT:    ld a1, 56(sp)
-; RV64-NEXT:    sd a1, 0(a0)
-; RV64-NEXT:  .LBB15_10: # %else20
-; RV64-NEXT:    addi sp, s0, -320
-; RV64-NEXT:    ld ra, 312(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 304(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 320
-; RV64-NEXT:    ret
-; RV64-NEXT:  .LBB15_11: # %cond.store
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v8, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 2
-; RV64-NEXT:    beqz a2, .LBB15_2
-; RV64-NEXT:  .LBB15_12: # %cond.store1
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 1
-; RV64-NEXT:    vse64.v v12, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 4
-; RV64-NEXT:    beqz a2, .LBB15_3
-; RV64-NEXT:  .LBB15_13: # %cond.store4
-; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT:    vse64.v v12, (a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 8
-; RV64-NEXT:    bnez a2, .LBB15_4
-; RV64-NEXT:    j .LBB15_5
-; RV64-NEXT:  .LBB15_14: # %cond.store10
-; RV64-NEXT:    addi a2, sp, 192
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    ld a2, 224(sp)
-; RV64-NEXT:    sd a2, 0(a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 32
-; RV64-NEXT:    beqz a2, .LBB15_7
-; RV64-NEXT:  .LBB15_15: # %cond.store13
-; RV64-NEXT:    addi a2, sp, 128
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    ld a2, 168(sp)
-; RV64-NEXT:    sd a2, 0(a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a2, a1, 64
-; RV64-NEXT:    beqz a2, .LBB15_8
-; RV64-NEXT:  .LBB15_16: # %cond.store16
-; RV64-NEXT:    addi a2, sp, 64
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    ld a2, 112(sp)
-; RV64-NEXT:    sd a2, 0(a0)
-; RV64-NEXT:    addi a0, a0, 8
-; RV64-NEXT:    andi a1, a1, -128
-; RV64-NEXT:    bnez a1, .LBB15_9
-; RV64-NEXT:    j .LBB15_10
+; CHECK-LABEL: compressstore_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vcompress.vm v12, v8, v0
+; CHECK-NEXT:    vcpop.m a1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vse64.v v12, (a0)
+; CHECK-NEXT:    ret
   call void @llvm.masked.compressstore.v8i64(<8 x i64> %v, ptr align 8 %base, <8 x i1> %mask)
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 68740eec56e4c..7dcfb247d37cb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1599,15 +1599,16 @@ define float @vreduce_fminimum_v2f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB99_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB99_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x float>, ptr %x
@@ -1619,15 +1620,8 @@ define float @vreduce_fminimum_v2f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v2f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x float>, ptr %x
@@ -1641,24 +1635,16 @@ define float @vreduce_fminimum_v4f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB101_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB101_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %x
@@ -1670,24 +1656,8 @@ define float @vreduce_fminimum_v4f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v4f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %x
@@ -1701,33 +1671,16 @@ define float @vreduce_fminimum_v8f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v10, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v10
+; CHECK-NEXT:    beqz a0, .LBB103_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB103_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %x
@@ -1739,33 +1692,8 @@ define float @vreduce_fminimum_v8f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v8f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %x
@@ -1779,42 +1707,16 @@ define float @vreduce_fminimum_v16f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v12, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v12
+; CHECK-NEXT:    beqz a0, .LBB105_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB105_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %x
@@ -1826,42 +1728,8 @@ define float @vreduce_fminimum_v16f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v16f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %x
@@ -1876,51 +1744,16 @@ define float @vreduce_fminimum_v32f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB107_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB107_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x float>, ptr %x
@@ -1933,51 +1766,8 @@ define float @vreduce_fminimum_v32f32_nonans(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x float>, ptr %x
@@ -2009,52 +1799,18 @@ define float @vreduce_fminimum_v64f32(ptr %x) {
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB109_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    j .LBB109_3
+; CHECK-NEXT:  .LBB109_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB109_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -2073,51 +1829,8 @@ define float @vreduce_fminimum_v64f32_nonans(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <64 x float>, ptr %x
@@ -2208,52 +1921,18 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB111_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    j .LBB111_3
+; CHECK-NEXT:  .LBB111_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB111_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
@@ -2281,51 +1960,8 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) {
 ; CHECK-NEXT:    vle32.v v0, (a1)
 ; CHECK-NEXT:    vfmin.vv v16, v24, v16
 ; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <128 x float>, ptr %x
@@ -2339,15 +1975,16 @@ define double @vreduce_fminimum_v2f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB113_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI113_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI113_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB113_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x double>, ptr %x
@@ -2359,15 +1996,8 @@ define double @vreduce_fminimum_v2f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v2f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x double>, ptr %x
@@ -2381,24 +2011,16 @@ define double @vreduce_fminimum_v4f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v10, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v10
+; CHECK-NEXT:    beqz a0, .LBB115_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI115_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI115_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB115_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %x
@@ -2410,24 +2032,8 @@ define double @vreduce_fminimum_v4f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v4f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %x
@@ -2441,33 +2047,16 @@ define double @vreduce_fminimum_v8f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vle64.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v12, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v12
+; CHECK-NEXT:    beqz a0, .LBB117_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI117_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI117_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB117_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x double>, ptr %x
@@ -2479,33 +2068,8 @@ define double @vreduce_fminimum_v8f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v8f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vle64.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x double>, ptr %x
@@ -2519,42 +2083,16 @@ define double @vreduce_fminimum_v16f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v16f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB119_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI119_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI119_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB119_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x double>, ptr %x
@@ -2566,42 +2104,8 @@ define double @vreduce_fminimum_v16f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v16f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x double>, ptr %x
@@ -2632,43 +2136,18 @@ define double @vreduce_fminimum_v32f64(ptr %x) {
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB121_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI121_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI121_0)(a0)
+; CHECK-NEXT:    j .LBB121_3
+; CHECK-NEXT:  .LBB121_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB121_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -2686,42 +2165,8 @@ define double @vreduce_fminimum_v32f64_nonans(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
@@ -2811,43 +2256,18 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB123_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI123_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI123_0)(a0)
+; CHECK-NEXT:    j .LBB123_3
+; CHECK-NEXT:  .LBB123_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB123_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
@@ -2874,42 +2294,8 @@ define double @vreduce_fminimum_v64f64_nonans(ptr %x) {
 ; CHECK-NEXT:    vle64.v v0, (a1)
 ; CHECK-NEXT:    vfmin.vv v16, v24, v16
 ; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <64 x double>, ptr %x
@@ -2923,15 +2309,16 @@ define float @vreduce_fmaximum_v2f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB125_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB125_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x float>, ptr %x
@@ -2943,15 +2330,8 @@ define float @vreduce_fmaximum_v2f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v2f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x float>, ptr %x
@@ -2965,24 +2345,16 @@ define float @vreduce_fmaximum_v4f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB127_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB127_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %x
@@ -2994,24 +2366,8 @@ define float @vreduce_fmaximum_v4f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v4f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %x
@@ -3025,33 +2381,16 @@ define float @vreduce_fmaximum_v8f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v10, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v10
+; CHECK-NEXT:    beqz a0, .LBB129_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB129_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %x
@@ -3063,33 +2402,8 @@ define float @vreduce_fmaximum_v8f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v8f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %x
@@ -3103,42 +2417,16 @@ define float @vreduce_fmaximum_v16f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v12, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v12
+; CHECK-NEXT:    beqz a0, .LBB131_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB131_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %x
@@ -3150,42 +2438,8 @@ define float @vreduce_fmaximum_v16f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v16f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %x
@@ -3200,51 +2454,16 @@ define float @vreduce_fmaximum_v32f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB133_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB133_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x float>, ptr %x
@@ -3257,51 +2476,8 @@ define float @vreduce_fmaximum_v32f32_nonans(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x float>, ptr %x
@@ -3333,52 +2509,18 @@ define float @vreduce_fmaximum_v64f32(ptr %x) {
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB135_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    j .LBB135_3
+; CHECK-NEXT:  .LBB135_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB135_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -3397,51 +2539,8 @@ define float @vreduce_fmaximum_v64f32_nonans(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <64 x float>, ptr %x
@@ -3532,52 +2631,18 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB137_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    j .LBB137_3
+; CHECK-NEXT:  .LBB137_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB137_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
@@ -3605,51 +2670,8 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) {
 ; CHECK-NEXT:    vle32.v v0, (a1)
 ; CHECK-NEXT:    vfmax.vv v16, v24, v16
 ; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <128 x float>, ptr %x
@@ -3663,15 +2685,16 @@ define double @vreduce_fmaximum_v2f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB139_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI139_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI139_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB139_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x double>, ptr %x
@@ -3683,15 +2706,8 @@ define double @vreduce_fmaximum_v2f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v2f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x double>, ptr %x
@@ -3705,24 +2721,16 @@ define double @vreduce_fmaximum_v4f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v10, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v10
+; CHECK-NEXT:    beqz a0, .LBB141_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI141_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI141_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB141_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %x
@@ -3734,24 +2742,8 @@ define double @vreduce_fmaximum_v4f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v4f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %x
@@ -3765,33 +2757,16 @@ define double @vreduce_fmaximum_v8f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vle64.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v12, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v12
+; CHECK-NEXT:    beqz a0, .LBB143_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI143_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI143_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB143_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x double>, ptr %x
@@ -3803,33 +2778,8 @@ define double @vreduce_fmaximum_v8f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v8f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vle64.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x double>, ptr %x
@@ -3843,42 +2793,16 @@ define double @vreduce_fmaximum_v16f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v16f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB145_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI145_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI145_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB145_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x double>, ptr %x
@@ -3890,42 +2814,8 @@ define double @vreduce_fmaximum_v16f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v16f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x double>, ptr %x
@@ -3956,43 +2846,18 @@ define double @vreduce_fmaximum_v32f64(ptr %x) {
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB147_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI147_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI147_0)(a0)
+; CHECK-NEXT:    j .LBB147_3
+; CHECK-NEXT:  .LBB147_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB147_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -4010,42 +2875,8 @@ define double @vreduce_fmaximum_v32f64_nonans(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
@@ -4135,43 +2966,18 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB149_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI149_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI149_0)(a0)
+; CHECK-NEXT:    j .LBB149_3
+; CHECK-NEXT:  .LBB149_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB149_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
@@ -4198,42 +3004,8 @@ define double @vreduce_fmaximum_v64f64_nonans(ptr %x) {
 ; CHECK-NEXT:    vle64.v v0, (a1)
 ; CHECK-NEXT:    vfmax.vv v16, v24, v16
 ; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <64 x double>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 8474f95edd813..e5bef20fd9e24 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -5,59 +5,6 @@
 ; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
 
-define <8 x i16> @concat_2xv4i16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK-LABEL: concat_2xv4i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    ret
-  %ab = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %ab
-}
-
-define <8 x i16> @concat_4xv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
-; CHECK-LABEL: concat_4xv2i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
-  %ab = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cd = shufflevector <2 x i16> %c, <2 x i16> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %abcd = shufflevector <4 x i16> %ab, <4 x i16> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %abcd
-}
-
-define <8 x i16> @concat_8xv1i16(<1 x i16> %a, <1 x i16> %b, <1 x i16> %c, <1 x i16> %d, <1 x i16> %e, <1 x i16> %f, <1 x i16> %g, <1 x i16> %h) {
-; CHECK-LABEL: concat_8xv1i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v12, v13, 1
-; CHECK-NEXT:    vsetivli zero, 3, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 2
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v15, 3
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-NEXT:    vsetivli zero, 3, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 3
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
-; CHECK-NEXT:    ret
-  %ab = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
-  %cd = shufflevector <1 x i16> %c, <1 x i16> %d, <2 x i32> <i32 0, i32 1>
-  %abcd = shufflevector <2 x i16> %ab, <2 x i16> %cd, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %ef = shufflevector <1 x i16> %e, <1 x i16> %f, <2 x i32> <i32 0, i32 1>
-  %gh = shufflevector <1 x i16> %g, <1 x i16> %h, <2 x i32> <i32 0, i32 1>
-  %efgh = shufflevector <2 x i16> %ef, <2 x i16> %gh, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %abcdefgh = shufflevector <4 x i16> %abcd, <4 x i16> %efgh, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %abcdefgh
-}
-
 define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: concat_2xv4i32:
 ; CHECK:       # %bb.0:
@@ -72,11 +19,14 @@ define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
 define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
 ; CHECK-LABEL: concat_4xv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv1r.v v12, v11
+; CHECK-NEXT:    vmv1r.v v14, v9
+; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 2
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 6
 ; CHECK-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -87,18 +37,24 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) {
 ; CHECK-LABEL: concat_8xv1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 1
-; CHECK-NEXT:    vslideup.vi v12, v13, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 1
-; CHECK-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v16, v15
+; CHECK-NEXT:    vmv1r.v v18, v13
+; CHECK-NEXT:    vmv1r.v v20, v11
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v22, 1
+; CHECK-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v20, 3
+; CHECK-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v18, 5
+; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 6
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v16, 7
 ; CHECK-NEXT:    ret
   %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> <i32 0, i32 1>
@@ -124,14 +80,15 @@ define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) {
 define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; CHECK-LABEL: concat_4xv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v14, v11
-; CHECK-NEXT:    vmv1r.v v12, v10
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vmv1r.v v12, v11
+; CHECK-NEXT:    vmv1r.v v16, v10
+; CHECK-NEXT:    vmv1r.v v20, v9
+; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v20, 4
+; CHECK-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v16, 8
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v12, 12
 ; CHECK-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -142,18 +99,26 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) {
 ; CHECK-LABEL: concat_8xv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 2
-; CHECK-NEXT:    vslideup.vi v12, v13, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vmv1r.v v16, v15
+; CHECK-NEXT:    vmv1r.v v20, v14
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vmv1r.v v28, v11
+; CHECK-NEXT:    vmv1r.v v4, v10
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v0, 2
+; CHECK-NEXT:    vsetivli zero, 6, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v4, 4
+; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v28, 6
+; CHECK-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v24, 10
+; CHECK-NEXT:    vsetivli zero, 14, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v20, 12
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v16, 14
 ; CHECK-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -187,27 +152,29 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) {
 define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
 ; VLA-LABEL: concat_4xv8i32:
 ; VLA:       # %bb.0:
-; VLA-NEXT:    vmv2r.v v20, v14
-; VLA-NEXT:    vmv2r.v v16, v12
-; VLA-NEXT:    vmv2r.v v12, v10
-; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLA-NEXT:    vslideup.vi v16, v20, 8
-; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    vmv2r.v v16, v14
+; VLA-NEXT:    vmv2r.v v24, v12
+; VLA-NEXT:    vmv2r.v v0, v10
+; VLA-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v0, 8
+; VLA-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v24, 16
 ; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; VLA-NEXT:    vslideup.vi v8, v16, 16
+; VLA-NEXT:    vslideup.vi v8, v16, 24
 ; VLA-NEXT:    ret
 ;
 ; VLS-LABEL: concat_4xv8i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv2r.v v20, v14
-; VLS-NEXT:    vmv2r.v v16, v12
-; VLS-NEXT:    vmv2r.v v12, v10
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vslideup.vi v8, v12, 8
+; VLS-NEXT:    vmv2r.v v16, v14
+; VLS-NEXT:    vmv2r.v v24, v12
+; VLS-NEXT:    vmv2r.v v0, v10
+; VLS-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v0, 8
+; VLS-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v24, 16
 ; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
+; VLS-NEXT:    vslideup.vi v8, v16, 24
 ; VLS-NEXT:    ret
   %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -218,49 +185,123 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) {
 ; VLA-LABEL: concat_8xv4i32:
 ; VLA:       # %bb.0:
-; VLA-NEXT:    vmv1r.v v18, v15
-; VLA-NEXT:    vmv1r.v v20, v14
-; VLA-NEXT:    vmv1r.v v22, v13
+; VLA-NEXT:    addi sp, sp, -16
+; VLA-NEXT:    .cfi_def_cfa_offset 16
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 5
+; VLA-NEXT:    sub sp, sp, a0
+; VLA-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; VLA-NEXT:    vmv1r.v v16, v15
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    mv a1, a0
+; VLA-NEXT:    slli a0, a0, 1
+; VLA-NEXT:    add a0, a0, a1
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v16, v14
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 4
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v16, v13
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; VLA-NEXT:    vmv1r.v v16, v12
-; VLA-NEXT:    vmv1r.v v14, v11
-; VLA-NEXT:    vmv1r.v v12, v10
-; VLA-NEXT:    vmv1r.v v10, v9
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v20, v18, 4
-; VLA-NEXT:    vslideup.vi v16, v22, 4
-; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLA-NEXT:    vslideup.vi v16, v20, 8
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v12, v14, 4
-; VLA-NEXT:    vslideup.vi v8, v10, 4
-; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    addi a0, sp, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v0, v11
+; VLA-NEXT:    vmv1r.v v24, v10
+; VLA-NEXT:    vmv1r.v v16, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v16, 4
+; VLA-NEXT:    vsetivli zero, 12, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v24, 8
+; VLA-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v0, 12
+; VLA-NEXT:    vsetivli zero, 20, e32, m8, tu, ma
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 16
+; VLA-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 20
+; VLA-NEXT:    vsetivli zero, 28, e32, m8, tu, ma
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 4
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 24
 ; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; VLA-NEXT:    vslideup.vi v8, v16, 16
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    mv a1, a0
+; VLA-NEXT:    slli a0, a0, 1
+; VLA-NEXT:    add a0, a0, a1
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 28
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 5
+; VLA-NEXT:    add sp, sp, a0
+; VLA-NEXT:    addi sp, sp, 16
 ; VLA-NEXT:    ret
 ;
 ; VLS-LABEL: concat_8xv4i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv1r.v v18, v15
-; VLS-NEXT:    vmv1r.v v20, v14
-; VLS-NEXT:    vmv1r.v v22, v13
+; VLS-NEXT:    addi sp, sp, -16
+; VLS-NEXT:    .cfi_def_cfa_offset 16
+; VLS-NEXT:    addi sp, sp, -512
+; VLS-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; VLS-NEXT:    vmv1r.v v16, v15
+; VLS-NEXT:    addi a0, sp, 400
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v16, v14
+; VLS-NEXT:    addi a0, sp, 272
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v16, v13
+; VLS-NEXT:    addi a0, sp, 144
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; VLS-NEXT:    vmv1r.v v16, v12
-; VLS-NEXT:    vmv1r.v v14, v11
-; VLS-NEXT:    vmv1r.v v12, v10
-; VLS-NEXT:    vmv1r.v v10, v9
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v20, v18, 4
-; VLS-NEXT:    vslideup.vi v16, v22, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v12, v14, 4
-; VLS-NEXT:    vslideup.vi v8, v10, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT:    addi a0, sp, 16
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v0, v11
+; VLS-NEXT:    vmv1r.v v24, v10
+; VLS-NEXT:    vmv1r.v v16, v9
+; VLS-NEXT:    vsetivli zero, 8, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v16, 4
+; VLS-NEXT:    vsetivli zero, 12, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v24, 8
+; VLS-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v0, 12
+; VLS-NEXT:    vsetivli zero, 20, e32, m8, tu, ma
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; VLS-NEXT:    vslideup.vi v8, v16, 16
+; VLS-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLS-NEXT:    addi a0, sp, 144
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 20
+; VLS-NEXT:    vsetivli zero, 28, e32, m8, tu, ma
+; VLS-NEXT:    addi a0, sp, 272
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 24
+; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT:    addi a0, sp, 400
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 28
+; VLS-NEXT:    addi sp, sp, 512
+; VLS-NEXT:    addi sp, sp, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
index 37902aa187321..4ec2e59672ad6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -24,15 +24,17 @@ define void @widen_2xv4i16(ptr %x, ptr %z) {
 define void @widen_3xv4i16(ptr %x, ptr %z) {
 ; CHECK-LABEL: widen_3xv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a2, a0, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 8
-; CHECK-NEXT:    vsetivli zero, 12, e16, m2, ta, ma
-; CHECK-NEXT:    vse16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a2, a0, 8
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 8
@@ -70,18 +72,20 @@ define void @widen_4xv4i16(ptr %x, ptr %z) {
 define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
 ; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
 ; CHECK-NO-MISALIGN:       # %bb.0:
-; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
-; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v11, (a2)
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v10, v9, 4
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v11, 4
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v14, (a0)
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 8
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v14, 12
 ; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
 ; CHECK-NO-MISALIGN-NEXT:    ret
 ;
@@ -181,14 +185,21 @@ define void @strided_constant_0(ptr %x, ptr %z) {
 define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
 ; CHECK-LABEL: strided_constant_mismatch_4xv4i16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a2, a0, 2
+; CHECK-NEXT:    vle16.v v10, (a2)
 ; CHECK-NEXT:    addi a2, a0, 6
-; CHECK-NEXT:    li a3, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v8, (a0), a3
-; CHECK-NEXT:    vlse64.v v10, (a2), a3
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vse64.v v8, (a1)
+; CHECK-NEXT:    vle16.v v12, (a2)
+; CHECK-NEXT:    addi a0, a0, 8
+; CHECK-NEXT:    vle16.v v14, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 2
@@ -244,38 +255,59 @@ define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
 define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV32-LABEL: strided_runtime_mismatch_4xv4i16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a0), a2
-; RV32-NEXT:    vlse64.v v10, (a3), a2
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vslideup.vi v8, v10, 2
-; RV32-NEXT:    vse64.v v8, (a1)
+; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vle16.v v10, (a0)
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    vle16.v v12, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vle16.v v14, (a0)
+; RV32-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v10, 4
+; RV32-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v12, 8
+; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV32-NEXT:    vslideup.vi v8, v14, 12
+; RV32-NEXT:    vse16.v v8, (a1)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: strided_runtime_mismatch_4xv4i16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    add a4, a0, a2
-; RV64-NEXT:    add a3, a4, a3
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vlse64.v v8, (a0), a2
-; RV64-NEXT:    vlse64.v v10, (a3), a2
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vslideup.vi v8, v10, 2
-; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT:    vle16.v v8, (a0)
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vle16.v v10, (a0)
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    vle16.v v12, (a0)
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vle16.v v14, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v10, 4
+; RV64-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v12, 8
+; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV64-NEXT:    vslideup.vi v8, v14, 12
+; RV64-NEXT:    vse16.v v8, (a1)
 ; RV64-NEXT:    ret
 ;
 ; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
 ; ZVE64F:       # %bb.0:
-; ZVE64F-NEXT:    add a4, a0, a2
-; ZVE64F-NEXT:    add a3, a4, a3
-; ZVE64F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZVE64F-NEXT:    vlse64.v v8, (a0), a2
-; ZVE64F-NEXT:    vlse64.v v10, (a3), a2
-; ZVE64F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v10, 2
-; ZVE64F-NEXT:    vse64.v v8, (a1)
+; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE64F-NEXT:    vle16.v v8, (a0)
+; ZVE64F-NEXT:    add a0, a0, a2
+; ZVE64F-NEXT:    vle16.v v10, (a0)
+; ZVE64F-NEXT:    add a0, a0, a3
+; ZVE64F-NEXT:    vle16.v v12, (a0)
+; ZVE64F-NEXT:    add a0, a0, a2
+; ZVE64F-NEXT:    vle16.v v14, (a0)
+; ZVE64F-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v10, 4
+; ZVE64F-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v12, 8
+; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v14, 12
+; ZVE64F-NEXT:    vse16.v v8, (a1)
 ; ZVE64F-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 %s
@@ -534,3 +566,28 @@ define void @reverse_strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
   store <8 x float> %e.2, ptr %z
   ret void
 }
+
+; The middle end sometimes produces this pattern of shuffles, where the
+; intermediate shuffles are the full result vector size padded with poison
+; elements.
+define <16 x i8> @widen_4xv4i8_immediate_expand(ptr %p, i64 %s) {
+; CHECK-LABEL: widen_4xv4i8_immediate_expand:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %p
+  %b.ptr = getelementptr i8, ptr %p, i64 %s
+  %b = load <4 x i8>, ptr %b.ptr
+  %c.ptr = getelementptr i8, ptr %b.ptr, i64 %s
+  %c = load <4 x i8>, ptr %c.ptr
+  %d.ptr = getelementptr i8, ptr %c.ptr, i64 %s
+  %d = load <4 x i8>, ptr %d.ptr
+
+  %ab = shufflevector <4 x i8> %a, <4 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %cx = shufflevector <4 x i8> %c, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %dx = shufflevector <4 x i8> %d, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %abcx = shufflevector <16 x i8> %ab, <16 x i8> %cx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+  %abcd = shufflevector <16 x i8> %abcx, <16 x i8> %dx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+  ret <16 x i8> %abcd
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index b3bda5973eb8c..eb7894ede0464 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -441,50 +441,57 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -602,50 +609,57 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -773,53 +787,60 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    li a0, -1
 ; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
+; CHECK-V-NEXT:    vmin.vx v8, v10, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
@@ -1383,125 +1404,90 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -1696,125 +1682,90 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
+; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
-; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -2031,129 +1982,94 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    lui a0, 16
 ; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
+; CHECK-V-NEXT:    vmin.vx v8, v10, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -3807,50 +3723,57 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -3966,50 +3889,57 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -4136,53 +4066,60 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    li a0, -1
 ; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
+; CHECK-V-NEXT:    vmin.vx v8, v10, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
@@ -4734,125 +4671,90 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -5045,125 +4947,90 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -5379,129 +5246,94 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    lui a0, 16
 ; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
+; CHECK-V-NEXT:    vmin.vx v8, v10, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 0b236f6d3ff38..f3ae03af7c786 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -2136,18 +2136,17 @@ define <vscale x 32 x i8> @mgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8>
 ; RV64-NEXT:    vluxei64.v v13, (a0), v24, v0.t
 ; RV64-NEXT:    srli a1, a1, 2
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v8, v16, a1
+; RV64-NEXT:    vslidedown.vx v0, v16, a1
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsext.vf8 v16, v10
+; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
+; RV64-NEXT:    vluxei64.v v14, (a0), v16, v0.t
 ; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v8, a2
+; RV64-NEXT:    vslidedown.vx v0, v0, a2
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v11
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64-NEXT:    vluxei64.v v15, (a0), v16, v0.t
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v16, v10
-; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    vluxei64.v v14, (a0), v16, v0.t
 ; RV64-NEXT:    vmv4r.v v8, v12
 ; RV64-NEXT:    ret
   %ptrs = getelementptr inbounds i8, ptr %base, <vscale x 32 x i8> %idxs
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
index d13d67fd0a882..c27488b18a017 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
@@ -9,38 +9,39 @@ define <4 x float> @foo(ptr %0) nounwind {
 ; CHECK-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    lhu s0, 0(a0)
-; CHECK-NEXT:    lhu s1, 2(a0)
-; CHECK-NEXT:    lhu s2, 4(a0)
-; CHECK-NEXT:    lhu a0, 6(a0)
+; CHECK-NEXT:    lhu s0, 6(a0)
+; CHECK-NEXT:    lhu s1, 4(a0)
+; CHECK-NEXT:    lhu s2, 0(a0)
+; CHECK-NEXT:    lhu a0, 2(a0)
 ; CHECK-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fsw fa0, 4(sp)
+; CHECK-NEXT:    fsw fa0, 8(sp)
 ; CHECK-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fsw fa0, 12(sp)
+; CHECK-NEXT:    fsw fa0, 0(sp)
 ; CHECK-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fsw fa0, 8(sp)
+; CHECK-NEXT:    fsw fa0, 12(sp)
 ; CHECK-NEXT:    fmv.w.x fa0, s0
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fsw fa0, 0(sp)
-; CHECK-NEXT:    addi a0, sp, 4
+; CHECK-NEXT:    fsw fa0, 4(sp)
+; CHECK-NEXT:    addi a0, sp, 8
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    addi a0, sp, 12
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    addi a0, sp, 8
+; CHECK-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    addi a0, sp, 4
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    mv a0, sp
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vslideup.vi v8, v9, 3
 ; CHECK-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-stack-align.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-stack-align.mir
index 6ea6fb183a7fd..749bd4c13879b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-stack-align.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-stack-align.mir
@@ -159,7 +159,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
@@ -204,7 +204,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    16
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
@@ -249,7 +249,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    32
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir b/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir
index 06ed46f291a83..8248c26636793 100644
--- a/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir
@@ -83,7 +83,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/RISCV/stack-inst-compress.mir b/llvm/test/CodeGen/RISCV/stack-inst-compress.mir
index 6721ff11d99b7..5cc4615bb64a1 100644
--- a/llvm/test/CodeGen/RISCV/stack-inst-compress.mir
+++ b/llvm/test/CodeGen/RISCV/stack-inst-compress.mir
@@ -32,6 +32,7 @@ alignment:       2
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
+  adjustsStack:    true
   hasCalls:        true
   localFrameSize:  2048
 stack:
@@ -117,6 +118,7 @@ alignment:       2
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
+  adjustsStack:    true
   hasCalls:        true
   localFrameSize:  4096
 stack:
@@ -210,6 +212,7 @@ alignment:       2
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
+  adjustsStack:    true
   hasCalls:        true
   localFrameSize:  8192
 stack:
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 8c0d97afe6c21..f1ae320017563 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -89,8 +89,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    snez a3, a3
 ; RISCV32-NEXT:    and a3, a3, a7
 ; RISCV32-NEXT:    or a2, a3, a2
-; RISCV32-NEXT:    or a3, t2, t3
-; RISCV32-NEXT:    or a2, a2, a3
+; RISCV32-NEXT:    or a2, a2, t2
+; RISCV32-NEXT:    or a2, a2, t3
 ; RISCV32-NEXT:    mul a3, a5, a4
 ; RISCV32-NEXT:    andi a2, a2, 1
 ; RISCV32-NEXT:    sw a3, 0(a0)
diff --git a/llvm/test/CodeGen/SPIRV/ComparePointers.ll b/llvm/test/CodeGen/SPIRV/ComparePointers.ll
index 9be05944789b6..6777fc38024b3 100644
--- a/llvm/test/CodeGen/SPIRV/ComparePointers.ll
+++ b/llvm/test/CodeGen/SPIRV/ComparePointers.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --mattr=+spirv1.3  %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; kernel void test(int global *in, int global *in2) {
 ;;   if (!in)
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll
new file mode 100644
index 0000000000000..710a1581f760c
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll
@@ -0,0 +1,25 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+
+; CHECK: OpFunction
+; CHECK: %[[FooArg:.*]] = OpVariable
+; CHECK: OpLifetimeStart %[[FooArg]], 0
+; CHECK: OpCopyMemorySized
+; CHECK: OpBitcast
+; CHECK: OpInBoundsPtrAccessChain
+; CHECK: OpLifetimeStop %[[FooArg]], 0
+
+%tprange = type { %tparray }
+%tparray = type { [2 x i64] }
+
+define spir_func void @foo(ptr noundef byval(%tprange) align 8 %_arg_UserRange) {
+  %RoundedRangeKernel = alloca %tprange, align 8
+  call void @llvm.lifetime.start.p0(i64 72, ptr nonnull %RoundedRangeKernel) #7
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %RoundedRangeKernel, ptr align 8 %_arg_UserRange, i64 16, i1 false)
+  %KernelFunc = getelementptr inbounds i8, ptr %RoundedRangeKernel, i64 16
+  call void @llvm.lifetime.end.p0(i64 72, ptr nonnull %RoundedRangeKernel) #7
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
diff --git a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-load.ll b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-load.ll
index a30d0792e3998..18752fdf843d2 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-load.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-load.ll
@@ -9,7 +9,7 @@
 ; CHECK-DAG: %[[#TYLONGPTR:]] = OpTypePointer Function %[[#TYLONG]]
 ; CHECK: %[[#PTRTOSTRUCT:]] = OpFunctionParameter %[[#TYSTRUCTPTR]]
 ; CHECK: %[[#PTRTOLONG:]] = OpBitcast %[[#TYLONGPTR]] %[[#PTRTOSTRUCT]]
-; CHECK: OpLoad %[[#TYLONG]] %[[#PTRTOLONG]]
+; CHECK-NEXT: OpLoad %[[#TYLONG]] %[[#PTRTOLONG]]
 
 %struct.S = type { i32 }
 %struct.__wrapper_class = type { [7 x %struct.S] }
diff --git a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-store.ll b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-store.ll
index 4701f02ea33af..202bcfbf2599a 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-store.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-store.ll
@@ -13,7 +13,7 @@
 ; CHECK: %[[#OBJ:]] = OpFunctionParameter %[[#TYSTRUCT]]
 ; CHECK: %[[#ARGPTR2:]] = OpFunctionParameter %[[#TYLONGPTR]]
 ; CHECK: %[[#PTRTOSTRUCT:]] = OpBitcast %[[#TYSTRUCTPTR]] %[[#ARGPTR2]]
-; CHECK: OpStore %[[#PTRTOSTRUCT]] %[[#OBJ]]
+; CHECK-NEXT: OpStore %[[#PTRTOSTRUCT]] %[[#OBJ]]
 
 %struct.S = type { i32 }
 %struct.__wrapper_class = type { [7 x %struct.S] }
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args-rev.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args-rev.ll
new file mode 100644
index 0000000000000..ae7fb99907b13
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args-rev.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo"
+; CHECK-SPIRV-DAG: OpName %[[ArgToDeduce:.*]] "unknown_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Bar:.*]] "bar"
+; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[Void:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[LongPtr:.*]] = OpTypePointer CrossWorkgroup %[[Long]]
+; CHECK-SPIRV-DAG: %[[Fun:.*]] = OpTypeFunction %[[Void]] %[[LongPtr]]
+; CHECK-SPIRV: %[[Bar]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[ArgToDeduce]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo]] %[[ArgToDeduce]]
+; CHECK-SPIRV: %[[Foo]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[FooArg]] = OpFunctionParameter %[[LongPtr]]
+
+define spir_kernel void @bar(ptr addrspace(1) %unknown_type_ptr) {
+entry:
+  %elem = getelementptr inbounds i32, ptr addrspace(1) %unknown_type_ptr, i64 0
+  call void @foo(ptr addrspace(1) %unknown_type_ptr)
+  ret void
+}
+
+define void @foo(ptr addrspace(1) %known_type_ptr) {
+entry:
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args.ll
new file mode 100644
index 0000000000000..ee411f2646602
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args.ll
@@ -0,0 +1,97 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "unknown_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo"
+; CHECK-SPIRV-DAG: OpName %[[BarArg:.*]] "known_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Bar:.*]] "bar"
+; CHECK-SPIRV-DAG: OpName %[[UntypedArg:.*]] "arg"
+; CHECK-SPIRV-DAG: OpName %[[FunUntypedArg:.*]] "foo_untyped_arg"
+; CHECK-SPIRV-DAG: OpName %[[UnusedArg1:.*]] "unused_arg1"
+; CHECK-SPIRV-DAG: OpName %[[Foo2Arg:.*]] "unknown_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Foo2:.*]] "foo2"
+; CHECK-SPIRV-DAG: OpName %[[Bar2Arg:.*]] "known_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Bar2:.*]] "bar2"
+; CHECK-SPIRV-DAG: OpName %[[Foo5Arg1:.*]] "unknown_type_ptr1"
+; CHECK-SPIRV-DAG: OpName %[[Foo5Arg2:.*]] "unknown_type_ptr2"
+; CHECK-SPIRV-DAG: OpName %[[Foo5:.*]] "foo5"
+; CHECK-SPIRV-DAG: OpName %[[Bar5Arg:.*]] "known_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Bar5:.*]] "bar5"
+; CHECK-SPIRV-DAG: %[[Char:.*]] = OpTypeInt 8 0
+; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[Half:.*]] = OpTypeFloat 16
+; CHECK-SPIRV-DAG: %[[Void:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[HalfConst:.*]] = OpConstant %[[Half]] 15360
+; CHECK-SPIRV-DAG: %[[CharPtr:.*]] = OpTypePointer CrossWorkgroup %[[Char]]
+; CHECK-SPIRV-DAG: %[[LongPtr:.*]] = OpTypePointer CrossWorkgroup %[[Long]]
+; CHECK-SPIRV-DAG: %[[Fun:.*]] = OpTypeFunction %[[Void]] %[[LongPtr]]
+; CHECK-SPIRV-DAG: %[[Fun2:.*]] = OpTypeFunction %[[Void]] %[[Half]] %[[LongPtr]]
+; CHECK-SPIRV-DAG: %[[Fun5:.*]] = OpTypeFunction %[[Void]] %[[Half]] %[[LongPtr]] %[[Half]] %[[LongPtr]] %[[Half]]
+; CHECK-SPIRV-DAG: %[[FunUntyped:.*]] = OpTypeFunction %[[Void]] %[[CharPtr]]
+
+; CHECK-SPIRV: %[[Foo]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[FooArg]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: %[[Bar]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[BarArg]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo]] %[[BarArg]]
+
+; CHECK-SPIRV: %[[FunUntypedArg]] = OpFunction %[[Void]] None %[[FunUntyped]]
+; CHECK-SPIRV: %[[UntypedArg]] = OpFunctionParameter %[[CharPtr]]
+
+; CHECK-SPIRV: %[[Foo2]] = OpFunction %[[Void]] None %[[Fun2]]
+; CHECK-SPIRV: %[[UnusedArg1]] = OpFunctionParameter %[[Half]]
+; CHECK-SPIRV: %[[Foo2Arg]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: %[[Bar2]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[Bar2Arg]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo2]] %[[HalfConst]] %[[Bar2Arg]]
+
+; CHECK-SPIRV: %[[Foo5]] = OpFunction %[[Void]] None %[[Fun5]]
+; CHECK-SPIRV: OpFunctionParameter %[[Half]]
+; CHECK-SPIRV: %[[Foo5Arg1]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionParameter %[[Half]]
+; CHECK-SPIRV: %[[Foo5Arg2]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionParameter %[[Half]]
+; CHECK-SPIRV: %[[Bar5]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[Bar5Arg]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo5]] %[[HalfConst]] %[[Bar5Arg]] %[[HalfConst]] %[[Bar5Arg]] %[[HalfConst]]
+
+define void @foo(ptr addrspace(1) %unknown_type_ptr) {
+entry:
+  ret void
+}
+
+define spir_kernel void @bar(ptr addrspace(1) %known_type_ptr) {
+entry:
+  %elem = getelementptr inbounds i32, ptr addrspace(1) %known_type_ptr, i64 0
+  call void @foo(ptr addrspace(1) %known_type_ptr)
+  ret void
+}
+
+define void @foo_untyped_arg(ptr addrspace(1) %arg) {
+entry:
+  ret void
+}
+
+define void @foo2(half %unused_arg1, ptr addrspace(1) %unknown_type_ptr) {
+entry:
+  ret void
+}
+
+define spir_kernel void @bar2(ptr addrspace(1) %known_type_ptr) {
+entry:
+  %elem = getelementptr inbounds i32, ptr addrspace(1) %known_type_ptr, i64 0
+  call void @foo2(half 1.0, ptr addrspace(1) %known_type_ptr)
+  ret void
+}
+
+define void @foo5(half %unused_arg1, ptr addrspace(1) %unknown_type_ptr1, half %unused_arg2, ptr addrspace(1) %unknown_type_ptr2, half %unused_arg3) {
+entry:
+  ret void
+}
+
+define spir_kernel void @bar5(ptr addrspace(1) %known_type_ptr) {
+entry:
+  %elem = getelementptr inbounds i32, ptr addrspace(1) %known_type_ptr, i64 0
+  call void @foo5(half 1.0, ptr addrspace(1) %known_type_ptr, half 1.0, ptr addrspace(1) %known_type_ptr, half 1.0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
index 55cfcea999d84..e0c47798cc6d0 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --mattr=+spirv1.3 %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --mattr=+spirv1.3 %s -o - -filetype=obj | spirv-val %}
 
 ;; __kernel void testAtomicCompareExchangeExplicit_cl20(
 ;;     volatile global atomic_int* object,
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
index f18f27a6de51d..5074893163565 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId
 ; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
index d39ca3c39383c..d0c4dff43121c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; The IR was generated from the following source:
 ;; #include <CL/sycl.hpp>
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
index 03456aef6b6b2..3885f07023144 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; The IR was generated from the following source:
 ;; #include <CL/sycl.hpp>
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-06.ll b/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
index c9c5504520345..60ff780df87b0 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
@@ -4,9 +4,7 @@
 
 define float @f1(ptr %src) {
 ; CHECK-LABEL: f1:
-; CHECK: lgf [[R:%r[0-9]+]], 0(%r2)
-; CHECK: sllg [[R]], [[R]], 32
-; CHECK: ldgr %f0, [[R]]
+; CHECK: le %f0, 0(%r2)
 ; CHECK: br %r14
   %val = load atomic float, ptr %src seq_cst, align 4
   ret float %val
diff --git a/llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll b/llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll
new file mode 100644
index 0000000000000..8038329c0e09a
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+;
+; Test fpext of atomic loads to fp128 without VectorEnhancements1 (using FP register pairs).
+
+define fp128 @f1(ptr %src) {
+; CHECK-LABEL: f1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lxeb %f0, 0(%r3)
+; CHECK-NEXT:    std %f0, 0(%r2)
+; CHECK-NEXT:    std %f2, 8(%r2)
+; CHECK-NEXT:    br %r14
+  %V  = load atomic float, ptr %src seq_cst, align 4
+  %Res = fpext float %V to fp128
+  ret fp128 %Res
+}
+
+define fp128 @f2(ptr %src) {
+; CHECK-LABEL: f2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lxdb %f0, 0(%r3)
+; CHECK-NEXT:    std %f0, 0(%r2)
+; CHECK-NEXT:    std %f2, 8(%r2)
+; CHECK-NEXT:    br %r14
+  %V  = load atomic double, ptr %src seq_cst, align 8
+  %Res = fpext double %V to fp128
+  ret fp128 %Res
+}
+
+
+
diff --git a/llvm/test/CodeGen/SystemZ/atomic-memops.ll b/llvm/test/CodeGen/SystemZ/atomic-memops.ll
new file mode 100644
index 0000000000000..0bc647aa0e0f7
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-memops.ll
@@ -0,0 +1,739 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s
+
+; Sign-extending atomic loads.
+define void @f1(ptr %src, ptr %dst) {
+; CHECK-LABEL: f1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lb %r0, 0(%r2)
+; CHECK-NEXT:    sth %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i16
+  store volatile i16 %s, ptr %dst
+  ret void
+}
+
+define void @f2(ptr %src, ptr %dst) {
+; CHECK-LABEL: f2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lb %r0, 0(%r2)
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i32
+  store volatile i32 %s, ptr %dst
+  ret void
+}
+
+define void @f3(ptr %src, ptr %dst) {
+; CHECK-LABEL: f3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lgb %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i64
+  store volatile i64 %s, ptr %dst
+  ret void
+}
+
+define void @f4(ptr %src, ptr %dst) {
+; CHECK-LABEL: f4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lh %r0, 0(%r2)
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %s = sext i16 %b to i32
+  store volatile i32 %s, ptr %dst
+  ret void
+}
+
+define void @f5(ptr %src, ptr %dst) {
+; CHECK-LABEL: f5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lgh %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %s = sext i16 %b to i64
+  store volatile i64 %s, ptr %dst
+  ret void
+}
+
+define void @f6(ptr %src, ptr %dst) {
+; CHECK-LABEL: f6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lgf %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i32, ptr %src seq_cst, align 4
+  %s = sext i32 %b to i64
+  store volatile i64 %s, ptr %dst
+  ret void
+}
+
+; Zero-extending atomic loads.
+define void @f7(ptr %src, ptr %dst) {
+; CHECK-LABEL: f7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llc %r0, 0(%r2)
+; CHECK-NEXT:    sth %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %z = zext i8 %b to i16
+  store volatile i16 %z, ptr %dst
+  ret void
+}
+
+define void @f8(ptr %src, ptr %dst) {
+; CHECK-LABEL: f8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llc %r0, 0(%r2)
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %z = zext i8 %b to i32
+  store volatile i32 %z, ptr %dst
+  ret void
+}
+
+define void @f9(ptr %src, ptr %dst) {
+; CHECK-LABEL: f9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgc %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %z = zext i8 %b to i64
+  store volatile i64 %z, ptr %dst
+  ret void
+}
+
+define void @f10(ptr %src, ptr %dst) {
+; CHECK-LABEL: f10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llh %r0, 0(%r2)
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %z = zext i16 %b to i32
+  store volatile i32 %z, ptr %dst
+  ret void
+}
+
+define void @f11(ptr %src, ptr %dst) {
+; CHECK-LABEL: f11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgh %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %z = zext i16 %b to i64
+  store volatile i64 %z, ptr %dst
+  ret void
+}
+
+define void @f12(ptr %src, ptr %dst) {
+; CHECK-LABEL: f12:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgf %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i32, ptr %src seq_cst, align 4
+  %z = zext i32 %b to i64
+  store volatile i64 %z, ptr %dst
+  ret void
+}
+
+; reg/mem
+define i64 @f13(i64 %a, ptr %src) {
+; CHECK-LABEL: f13:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ag %r2, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i64, ptr %src seq_cst, align 8
+  %add = add i64 %a, %b
+  ret i64 %add
+}
+
+; reg/mem op with extension from memory.
+define i64 @f14(i64 %a, ptr %src) {
+; CHECK-LABEL: f14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slgf %r2, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i32, ptr %src seq_cst, align 4
+  %bext = zext i32 %b to i64
+  %sub = sub i64 %a, %bext
+  ret i64 %sub
+}
+
+define float @f15(float %f1, ptr %ptr, float %acc) {
+; CHECK-LABEL: f15:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    maeb %f2, %f0, 0(%r2)
+; CHECK-NEXT:    ldr %f0, %f2
+; CHECK-NEXT:    br %r14
+  %f2 = load atomic float, ptr %ptr seq_cst, align 4
+  %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc)
+  ret float %res
+}
+declare float @llvm.fma.f32(float %f1, float %f2, float %f3)
+
+define double @f15_b(ptr %src) {
+; CHECK-LABEL: f15_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ldeb %f0, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %V  = load atomic float, ptr %src seq_cst, align 4
+  %Res = fpext float %V to double
+  ret double %Res
+}
+
+define fp128 @f15_c(ptr %src) {
+; CHECK-LABEL: f15_c:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lde %f0, 0(%r3)
+; CHECK-NEXT:    ldebr %f0, %f0
+; CHECK-NEXT:    wflld %v0, %f0
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+  %V  = load atomic float, ptr %src seq_cst, align 4
+  %Res = fpext float %V to fp128
+  ret fp128 %Res
+}
+
+define fp128 @f15_d(ptr %src) {
+; CHECK-LABEL: f15_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld %f0, 0(%r3)
+; CHECK-NEXT:    wflld %v0, %f0
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+  %V  = load atomic double, ptr %src seq_cst, align 8
+  %Res = fpext double %V to fp128
+  ret fp128 %Res
+}
+
+; Do it twice for good measure given the involved DAG combines.
+define void @f16(ptr %src, ptr %dst) {
+; CHECK-LABEL: f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgc %r0, 0(%r2)
+; CHECK-NEXT:    lgbr %r1, %r0
+; CHECK-NEXT:    stg %r1, 0(%r3)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    llgc %r0, 0(%r2)
+; CHECK-NEXT:    lgbr %r1, %r0
+; CHECK-NEXT:    stg %r1, 0(%r3)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i64
+  %z = zext i8 %b to i64
+  store volatile i64 %s, ptr %dst
+  store volatile i64 %z, ptr %dst
+
+  %b2 = load atomic i8, ptr %src seq_cst, align 1
+  %s2 = sext i8 %b2 to i64
+  %z2 = zext i8 %b2 to i64
+  store volatile i64 %s2, ptr %dst
+  store volatile i64 %z2, ptr %dst
+
+  ret void
+}
+
+define void @f16_b(ptr %src, ptr %dst) {
+; CHECK-LABEL: f16_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lgb %r0, 0(%r2)
+; CHECK-NEXT:    sth %r0, 0(%r3)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i16
+  store volatile i16 %s, ptr %dst
+
+  %s2 = sext i8 %b to i64
+  store volatile i64 %s2, ptr %dst
+
+  ret void
+}
+
+define void @f16_c(ptr %src, ptr %dst) {
+; CHECK-LABEL: f16_c:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgc %r0, 0(%r2)
+; CHECK-NEXT:    sth %r0, 0(%r3)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %z = zext i8 %b to i16
+  store volatile i16 %z, ptr %dst
+
+  %z2 = zext i8 %b to i64
+  store volatile i64 %z2, ptr %dst
+
+  ret void
+}
+
+; Check that two i8 loads use a reg/reg op.
+define i8 @f16_d(ptr %src, ptr %src2) {
+; CHECK-LABEL: f16_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lb %r2, 0(%r2)
+; CHECK-NEXT:    lb %r0, 0(%r3)
+; CHECK-NEXT:    ar %r2, %r0
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %b2 = load atomic i8, ptr %src2 seq_cst, align 1
+  %add = add i8 %b, %b2
+  ret i8 %add
+}
+
+; Binary operations on a byte in memory, with an atomic load.
+define void @f17(ptr %ptr) {
+; CHECK-LABEL: f17:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ni 0(%r2), 1
+; CHECK-NEXT:    br %r14
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %xor = and i8 %val, -255
+  store i8 %xor, ptr %ptr
+  ret void
+}
+
+define void @f18(ptr %src) {
+; CHECK-LABEL: f18:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    oiy 4096(%r2), 1
+; CHECK-NEXT:    br %r14
+  %ptr = getelementptr i8, ptr %src, i64 4096
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %xor = or i8 %val, -255
+  store i8 %xor, ptr %ptr
+  ret void
+}
+
+define void @f19(ptr %src) {
+; CHECK-LABEL: f19:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xi 4095(%r2), 1
+; CHECK-NEXT:    br %r14
+  %ptr = getelementptr i8, ptr %src, i64 4095
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %xor = xor i8 %val, -255
+  store i8 %xor, ptr %ptr
+  ret void
+}
+
+; TM
+define double @f20(ptr %src, double %a, double %b) {
+; CHECK-LABEL: f20:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    tm 0(%r2), 1
+; CHECK-NEXT:    je .LBB25_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    ldr %f2, %f0
+; CHECK-NEXT:  .LBB25_2:
+; CHECK-NEXT:    ldr %f0, %f2
+; CHECK-NEXT:    br %r14
+  %byte = load atomic i8, ptr %src seq_cst, align 1
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; vector load and replicate
+define void @f21(ptr %src, ptr %dst) {
+; CHECK-LABEL: f21:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepb %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %v = insertelement <16 x i8> undef, i8 %b, i32 1
+  store volatile <16 x i8> %v, ptr %dst
+  ret void
+}
+
+define void @f22(ptr %src, ptr %dst) {
+; CHECK-LABEL: f22:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlreph %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %v = insertelement <8 x i16> undef, i16 %b, i32 1
+  store volatile <8 x i16> %v, ptr %dst
+  ret void
+}
+
+define void @f23(ptr %src, ptr %dst) {
+; CHECK-LABEL: f23:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepf %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic i32, ptr %src seq_cst, align 4
+  %v = insertelement <4 x i32> undef, i32 %b, i32 2
+  store volatile <4 x i32> %v, ptr %dst
+  ret void
+}
+
+define void @f24(ptr %src, ptr %dst) {
+; CHECK-LABEL: f24:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepg %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic i64, ptr %src seq_cst, align 8
+  %v = insertelement <2 x i64> undef, i64 %b, i32 0
+  store volatile <2 x i64> %v, ptr %dst
+  ret void
+}
+
+define void @f25(ptr %src, ptr %dst) {
+; CHECK-LABEL: f25:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepf %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic float, ptr %src seq_cst, align 4
+  %v = insertelement <4 x float> undef, float %b, i32 1
+  store volatile <4 x float> %v, ptr %dst
+  ret void
+}
+
+; Do *not* use vlrep for an extending load.
+define <4 x i32> @f25_c(ptr %ptr) {
+; CHECK-LABEL: f25_c:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lb %r0, 0(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r0
+; CHECK-NEXT:    vrepf %v24, %v0, 1
+; CHECK-NEXT:    br %r14
+  %L = load atomic i8, ptr %ptr seq_cst, align 4
+  %S = sext i8 %L to i32
+  %val = insertelement <4 x i32> undef, i32 %S, i32 0
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+; Do *not* use vlrep if there is another scalar use.
+define <4 x i32> @f25_d(ptr %ptr, ptr %dst) {
+; CHECK-LABEL: f25_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    l %r0, 0(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r0
+; CHECK-NEXT:    vrepf %v24, %v0, 1
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %L = load atomic i32, ptr %ptr seq_cst, align 4
+  store i32 %L, ptr %dst, align 4
+  %val = insertelement <4 x i32> undef, i32 %L, i32 0
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+define void @f26(ptr %src, ptr %dst) {
+; CHECK-LABEL: f26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepg %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic double, ptr %src seq_cst, align 8
+  %v = insertelement <2 x double> undef, double %b, i32 0
+  store volatile <2 x double> %v, ptr %dst
+  ret void
+}
+
+; Vector Load logical element and zero.
+define <16 x i8> @f27(ptr %ptr) {
+; CHECK-LABEL: f27:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezb %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7
+  ret <16 x i8> %ret
+}
+
+define <8 x i16> @f28(ptr %ptr) {
+; CHECK-LABEL: f28:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezh %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i16, ptr %ptr seq_cst, align 2
+  %ret = insertelement <8 x i16> zeroinitializer, i16 %val, i32 3
+  ret <8 x i16> %ret
+}
+
+define <4 x i32> @f29(ptr %ptr) {
+; CHECK-LABEL: f29:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezf %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i32, ptr %ptr seq_cst, align 4
+  %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 1
+  ret <4 x i32> %ret
+}
+
+define <2 x i64> @f30(ptr %ptr) {
+; CHECK-LABEL: f30:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezg %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i64, ptr %ptr seq_cst, align 8
+  %ret = insertelement <2 x i64> zeroinitializer, i64 %val, i32 0
+  ret <2 x i64> %ret
+}
+
+define <4 x i32> @f31(ptr %ptr) {
+; CHECK-LABEL: f31:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezlf %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i32, ptr %ptr seq_cst, align 4
+  %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
+  ret <4 x i32> %ret
+}
+
+define <4 x float> @f32(ptr %ptr) {
+; CHECK-LABEL: f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezlf %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic float, ptr %ptr seq_cst, align 4
+  %ret = insertelement <4 x float> zeroinitializer, float %val, i32 0
+  ret <4 x float> %ret
+}
+
+; Vector Load element.
+define <16 x i8> @f33(<16 x i8> %val, ptr %ptr) {
+; CHECK-LABEL: f33:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vleb %v24, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  %element = load atomic i8, ptr %ptr seq_cst, align 1
+  %ret = insertelement <16 x i8> %val, i8 %element, i32 0
+  ret <16 x i8> %ret
+}
+
+define <8 x i16> @f34(<8 x i16> %val, ptr %ptr) {
+; CHECK-LABEL: f34:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vleh %v24, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  %element = load atomic i16, ptr %ptr seq_cst, align 2
+  %ret = insertelement <8 x i16> %val, i16 %element, i32 0
+  ret <8 x i16> %ret
+}
+
+define <4 x i32> @f35(<4 x i32> %val, ptr %ptr) {
+; CHECK-LABEL: f35:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlef %v24, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  %element = load atomic i32, ptr %ptr seq_cst, align 4
+  %ret = insertelement <4 x i32> %val, i32 %element, i32 0
+  ret <4 x i32> %ret
+}
+
+define <2 x i64> @f36(<2 x i64> %val, ptr %ptr) {
+; CHECK-LABEL: f36:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vleg %v24, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  %element = load atomic i64, ptr %ptr seq_cst, align 8
+  %ret = insertelement <2 x i64> %val, i64 %element, i32 0
+  ret <2 x i64> %ret
+}
+
+; Test operation on memory involving atomic load and store.
+define void @f39(ptr %ptr) {
+; CHECK-LABEL: f39:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    oi 0(%r2), 1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %or = or i8 %val, -255
+  store atomic i8 %or, ptr %ptr seq_cst, align 1
+  ret void
+}
+
+; Some atomic stores of immediates.
+define void @f40(ptr %ptr) {
+; CHECK-LABEL: f40:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvi 0(%r2), 128
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic i8 128, ptr %ptr seq_cst, align 1
+  ret void
+}
+
+define void @f41(ptr %ptr) {
+; CHECK-LABEL: f41:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvhi 0(%r2), -1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic i32 4294967295, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @f42(ptr %ptr) {
+; CHECK-LABEL: f42:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvhi 0(%r2), -1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic i32 4294967295, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @f43(ptr %ptr) {
+; CHECK-LABEL: f43:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llihl %r0, 255
+; CHECK-NEXT:    oilf %r0, 4294967295
+; CHECK-NEXT:    stg %r0, 0(%r2)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic i64 1099511627775, ptr %ptr seq_cst, align 8
+  ret void
+}
+
+define void @f44(ptr %ptr) {
+; CHECK-LABEL: f44:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI49_0
+; CHECK-NEXT:    ld %f0, 0(%r1)
+; CHECK-NEXT:    std %f0, 0(%r2)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic double 0x3ff0000020000000, ptr %ptr seq_cst, align 8
+  ret void
+}
+
+; Vector Store Element.
+define void @f45(<16 x i8> %val, ptr %ptr) {
+; CHECK-LABEL: f45:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsteb %v24, 0(%r2), 0
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <16 x i8> %val, i32 0
+  store atomic i8 %element, ptr %ptr seq_cst, align 1
+  ret void
+}
+
+define void @f46(<8 x i16> %val, ptr %base) {
+; CHECK-LABEL: f46:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsteh %v24, 4094(%r2), 5
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %ptr = getelementptr i16, ptr %base, i32 2047
+  %element = extractelement <8 x i16> %val, i32 5
+  store atomic i16 %element, ptr %ptr seq_cst, align 2
+  ret void
+}
+
+define void @f47(<4 x i32> %val, ptr %ptr) {
+; CHECK-LABEL: f47:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vstef %v24, 0(%r2), 3
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <4 x i32> %val, i32 3
+  store atomic i32 %element, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @f48(<2 x i64> %val, ptr %ptr) {
+; CHECK-LABEL: f48:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsteg %v24, 0(%r2), 1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <2 x i64> %val, i32 1
+  store atomic i64 %element, ptr %ptr seq_cst, align 8
+  ret void
+}
+
+define void @f49(<4 x float> %val, ptr %ptr) {
+; CHECK-LABEL: f49:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vstef %v24, 0(%r2), 0
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <4 x float> %val, i32 0
+  store atomic float %element, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @f50(<2 x double> %val, ptr %ptr) {
+; CHECK-LABEL: f50:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsteg %v24, 0(%r2), 1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <2 x double> %val, i32 1
+  store atomic double %element, ptr %ptr seq_cst, align 8
+  ret void
+}
+
+define void @f51(ptr %src, ptr %dst) {
+; CHECK-LABEL: f51:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lpq %r0, 0(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vgmf %v1, 2, 8
+; CHECK-NEXT:    aebr %f0, %f1
+; CHECK-NEXT:    ste %f0, 0(%r3)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %atomic-load = load atomic i128, ptr %src seq_cst, align 16
+  %b0 = bitcast i128 %atomic-load to <4 x float>
+  %vecext = extractelement <4 x float> %b0, i64 0
+  %add = fadd float %vecext, 1.000000e+00
+  store atomic float %add, ptr %dst seq_cst, align 4
+  ret void
+}
+
+define void @f52(ptr %src, ptr %dst) {
+; CHECK-LABEL: f52:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lpq %r0, 0(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vgmg %v1, 2, 11
+; CHECK-NEXT:    adbr %f0, %f1
+; CHECK-NEXT:    std %f0, 0(%r3)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %atomic-load = load atomic i128, ptr %src seq_cst, align 16
+  %b0 = bitcast i128 %atomic-load to <2 x double>
+  %vecext = extractelement <2 x double> %b0, i64 0
+  %add = fadd double %vecext, 1.000000e+00
+  store atomic double %add, ptr %dst seq_cst, align 8
+  ret void
+}
+
+define void @fun58(ptr %ptr, i64 %arg) {
+; CHECK-LABEL: fun58:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    st %r3, 0(%r2)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %res = trunc i64 %arg to i32
+  store atomic i32 %res, ptr %ptr seq_cst, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
index b748bfc767a4d..91e324b0af1a9 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
@@ -6,10 +6,7 @@
 define void @f1(ptr %src, float %val) {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $f0s killed $f0s def $f0d
-; CHECK-NEXT:    lgdr %r0, %f0
-; CHECK-NEXT:    srlg %r0, %r0, 32
-; CHECK-NEXT:    st %r0, 0(%r2)
+; CHECK-NEXT:    ste %f0, 0(%r2)
 ; CHECK-NEXT:    bcr 15, %r0
 ; CHECK-NEXT:    br %r14
   store atomic float %val, ptr %src seq_cst, align 4
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-04.mir b/llvm/test/CodeGen/SystemZ/cond-move-04.mir
index 97aa00f582921..23fd2739698a4 100644
--- a/llvm/test/CodeGen/SystemZ/cond-move-04.mir
+++ b/llvm/test/CodeGen/SystemZ/cond-move-04.mir
@@ -53,6 +53,7 @@ registers:
   - { id: 10, class: gr64bit }
   - { id: 11, class: gr32bit }
 frameInfo:       
+  adjustsStack:    true
   hasCalls:        true
 body:             |
   bb.0 (%ir-block.1):
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-08.mir b/llvm/test/CodeGen/SystemZ/cond-move-08.mir
index 93aa5626b8e89..64c6d06979928 100644
--- a/llvm/test/CodeGen/SystemZ/cond-move-08.mir
+++ b/llvm/test/CodeGen/SystemZ/cond-move-08.mir
@@ -116,6 +116,7 @@ registers:
   - { id: 27, class: grx32bit }
   - { id: 28, class: addr64bit }
 frameInfo:
+  adjustsStack:    true
   hasCalls:        true
 body:             |
   bb.0.bb5:
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints-02.mir b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints-02.mir
index 37e29800fb1d6..2701a1dc034a2 100644
--- a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints-02.mir
+++ b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints-02.mir
@@ -30,6 +30,7 @@ registers:
   - { id: 11, class: gr32bit }
 frameInfo:
   maxAlignment:    1
+  adjustsStack:    true
   hasCalls:        true
 machineFunctionInfo: {}
 body:             |
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
index e7e1eaf8f8fdc..c98ffda837272 100644
--- a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
+++ b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
@@ -192,6 +192,7 @@ liveins:
   - { reg: '$r2d', virtual-reg: '%31' }
   - { reg: '$r3d', virtual-reg: '%32' }
 frameInfo:       
+  adjustsStack:    true
   hasCalls:        true
 body:             |
   bb.0.bb:
diff --git a/llvm/test/CodeGen/SystemZ/frame-28.mir b/llvm/test/CodeGen/SystemZ/frame-28.mir
index dd5933a9c7b4b..13337dba6ec53 100644
--- a/llvm/test/CodeGen/SystemZ/frame-28.mir
+++ b/llvm/test/CodeGen/SystemZ/frame-28.mir
@@ -162,6 +162,8 @@ body:             |
 ---
 name:            fun4
 tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, size: 5000 }
   - { id: 1, size: 2500 }
diff --git a/llvm/test/CodeGen/SystemZ/int-usub-12.ll b/llvm/test/CodeGen/SystemZ/int-usub-12.ll
index c39a6da37048d..147fbfd920a9d 100644
--- a/llvm/test/CodeGen/SystemZ/int-usub-12.ll
+++ b/llvm/test/CodeGen/SystemZ/int-usub-12.ll
@@ -11,6 +11,7 @@ define zeroext i1 @f1(i128 %a, i128 %b, ptr %res) {
 ; CHECK-NEXT:    vscbiq %v2, %v1, %v0
 ; CHECK-NEXT:    vlgvg %r2, %v2, 1
 ; CHECK-NEXT:    vsq %v0, %v1, %v0
+; CHECK-NEXT:    xilf %r2, 1
 ; CHECK-NEXT:    vst %v0, 0(%r4), 3
 ; CHECK-NEXT:    br %r14
   %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b)
@@ -27,6 +28,7 @@ define zeroext i1 @f2(i128 %a, i128 %b) {
 ; CHECK-NEXT:    vl %v1, 0(%r2), 3
 ; CHECK-NEXT:    vscbiq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvg %r2, %v0, 1
+; CHECK-NEXT:    xilf %r2, 1
 ; CHECK-NEXT:    br %r14
   %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b)
   %obit = extractvalue {i128, i1} %t, 1
@@ -46,5 +48,25 @@ define i128 @f3(i128 %a, i128 %b) {
   ret i128 %val
 }
 
+define i128 @f4(i128 %a, i128 %b) {
+; CHECK-LABEL: f4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl %v0, 0(%r4), 3
+; CHECK-NEXT:    vl %v1, 0(%r3), 3
+; CHECK-NEXT:    vscbiq %v2, %v1, %v0
+; CHECK-NEXT:    vlgvf %r0, %v2, 3
+; CHECK-NEXT:    vgbm %v2, 0
+; CHECK-NEXT:    xilf %r0, 1
+; CHECK-NEXT:    jl .LBB3_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vsq %v2, %v1, %v0
+; CHECK-NEXT:  .LBB3_2:
+; CHECK-NEXT:    vst %v2, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+  %val = call i128 @llvm.usub.sat.i128(i128 %a, i128 %b)
+  ret i128 %val
+}
+
 declare {i128, i1} @llvm.usub.with.overflow.i128(i128, i128) nounwind readnone
+declare i128 @llvm.usub.sat.i128(i128, i128) nounwind readnone
 
diff --git a/llvm/test/CodeGen/SystemZ/int-usub-13.ll b/llvm/test/CodeGen/SystemZ/int-usub-13.ll
index 637e1a81de996..794af3b73fbe2 100644
--- a/llvm/test/CodeGen/SystemZ/int-usub-13.ll
+++ b/llvm/test/CodeGen/SystemZ/int-usub-13.ll
@@ -15,6 +15,7 @@ define zeroext i1 @f1(i256 %a, i256 %b, ptr %res) {
 ; CHECK-NEXT:    vlgvg %r2, %v5, 1
 ; CHECK-NEXT:    vsbiq %v0, %v1, %v0, %v4
 ; CHECK-NEXT:    vsq %v1, %v3, %v2
+; CHECK-NEXT:    xilf %r2, 1
 ; CHECK-NEXT:    vst %v1, 16(%r4), 3
 ; CHECK-NEXT:    vst %v0, 0(%r4), 3
 ; CHECK-NEXT:    br %r14
@@ -35,6 +36,7 @@ define zeroext i1 @f2(i256 %a, i256 %b) {
 ; CHECK-NEXT:    vscbiq %v2, %v3, %v2
 ; CHECK-NEXT:    vsbcbiq %v0, %v1, %v0, %v2
 ; CHECK-NEXT:    vlgvg %r2, %v0, 1
+; CHECK-NEXT:    xilf %r2, 1
 ; CHECK-NEXT:    br %r14
   %t = call {i256, i1} @llvm.usub.with.overflow.i256(i256 %a, i256 %b)
   %obit = extractvalue {i256, i1} %t, 1
diff --git a/llvm/test/CodeGen/X86/apx/add.ll b/llvm/test/CodeGen/X86/apx/add.ll
index cdb29a70770e4..d3301ecdb72d0 100644
--- a/llvm/test/CodeGen/X86/apx/add.ll
+++ b/llvm/test/CodeGen/X86/apx/add.ll
@@ -298,9 +298,9 @@ define i8 @addflag8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: addflag8rr:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xf7]
-; CHECK-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; CHECK-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -311,10 +311,10 @@ entry:
 define i16 @addflag16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-LABEL: addflag16rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addw %si, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x01,0xf7]
-; CHECK-NEXT:    movl $65535, %eax # encoding: [0xb8,0xff,0xff,0x00,0x00]
+; CHECK-NEXT:    addw %si, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xf7]
+; CHECK-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0xFFFF
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -325,9 +325,9 @@ entry:
 define i32 @addflag32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK-LABEL: addflag32rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x01,0xf7]
-; CHECK-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7]
+; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
@@ -337,9 +337,9 @@ entry:
 define i64 @addflag64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK-LABEL: addflag64rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x01,0xf7]
-; CHECK-NEXT:    movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    addq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xf7]
+; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 %b)
@@ -350,9 +350,9 @@ define i8 @addflag8rm(i8 noundef %a, ptr %b) {
 ; CHECK-LABEL: addflag8rm:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
-; CHECK-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; CHECK-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -364,10 +364,10 @@ entry:
 define i16 @addflag16rm(i16 noundef %a, ptr %b) {
 ; CHECK-LABEL: addflag16rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addw (%rsi), %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x03,0x3e]
-; CHECK-NEXT:    movl $65535, %eax # encoding: [0xb8,0xff,0xff,0x00,0x00]
+; CHECK-NEXT:    addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
+; CHECK-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0xFFFF
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -379,9 +379,9 @@ entry:
 define i32 @addflag32rm(i32 noundef %a, ptr %b) {
 ; CHECK-LABEL: addflag32rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x03,0x3e]
-; CHECK-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
+; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
@@ -392,9 +392,9 @@ entry:
 define i64 @addflag64rm(i64 noundef %a, ptr %b) {
 ; CHECK-LABEL: addflag64rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x03,0x3e]
-; CHECK-NEXT:    movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
+; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
@@ -405,10 +405,10 @@ entry:
 define i16 @addflag16ri8(i16 noundef %a) {
 ; CHECK-LABEL: addflag16ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addw $123, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xc7,0x7b]
-; CHECK-NEXT:    movl $65535, %eax # encoding: [0xb8,0xff,0xff,0x00,0x00]
+; CHECK-NEXT:    addw $123, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0xc7,0x7b]
+; CHECK-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0xFFFF
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -419,9 +419,9 @@ entry:
 define i32 @addflag32ri8(i32 noundef %a) {
 ; CHECK-LABEL: addflag32ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xc7,0x7b]
-; CHECK-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b]
+; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123)
@@ -431,9 +431,9 @@ entry:
 define i64 @addflag64ri8(i64 noundef %a) {
 ; CHECK-LABEL: addflag64ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xc7,0x7b]
-; CHECK-NEXT:    movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b]
+; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123)
@@ -444,9 +444,9 @@ define i8 @addflag8ri(i8 noundef %a) {
 ; CHECK-LABEL: addflag8ri:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x7b]
-; CHECK-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; CHECK-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -457,11 +457,11 @@ entry:
 define i16 @addflag16ri(i16 noundef %a) {
 ; CHECK-LABEL: addflag16ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addw $1234, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xc7,0xd2,0x04]
+; CHECK-NEXT:    addw $1234, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xc7,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
-; CHECK-NEXT:    movl $65535, %eax # encoding: [0xb8,0xff,0xff,0x00,0x00]
+; CHECK-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0xFFFF
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -472,10 +472,10 @@ entry:
 define i32 @addflag32ri(i32 noundef %a) {
 ; CHECK-LABEL: addflag32ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; CHECK-NEXT:    addl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
-; CHECK-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123456)
@@ -485,10 +485,10 @@ entry:
 define i64 @addflag64ri(i64 noundef %a) {
 ; CHECK-LABEL: addflag64ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; CHECK-NEXT:    addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
-; CHECK-NEXT:    movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123456)
diff --git a/llvm/test/CodeGen/X86/apx/cfcmov.ll b/llvm/test/CodeGen/X86/apx/cfcmov.ll
new file mode 100644
index 0000000000000..f643120c9b50f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/cfcmov.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+cf -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+cf -x86-cmov-converter=false -verify-machineinstrs | FileCheck %s
+
+define i8 @cfcmov8rr(i8 %0) {
+; CHECK-LABEL: cfcmov8rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpb $1, %dil
+; CHECK-NEXT:    cfcmovel %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %2 = icmp eq i8 %0, 1
+  %3 = select i1 %2, i8 %0, i8 0
+  ret i8 %3
+}
+
+define i16 @cfcmov16rr(i16 %0) {
+; CHECK-LABEL: cfcmov16rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpw $1, %di
+; CHECK-NEXT:    cfcmovnel %edi, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
+  %2 = icmp ne i16 %0, 1
+  %3 = select i1 %2, i16 %0, i16 0
+  ret i16 %3
+}
+
+define i32 @cfcmov32rr(i32 %0) {
+; CHECK-LABEL: cfcmov32rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    cfcmovael %edi, %eax
+; CHECK-NEXT:    retq
+  %2 = icmp ugt i32 %0, 1
+  %3 = select i1 %2, i32 %0, i32 0
+  ret i32 %3
+}
+
+define i64 @cfcmov64rr(i64 %0) {
+; CHECK-LABEL: cfcmov64rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testq %rdi, %rdi
+; CHECK-NEXT:    cfcmoveq %rdi, %rax
+; CHECK-NEXT:    retq
+  %2 = icmp ult i64 %0, 1
+  %3 = select i1 %2, i64 %0, i64 0
+  ret i64 %3
+}
+
+define i8 @cfcmov8rr_inv(i8 %0) {
+; CHECK-LABEL: cfcmov8rr_inv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpb $1, %dil
+; CHECK-NEXT:    cfcmovnel %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %2 = icmp eq i8 %0, 1
+  %3 = select i1 %2, i8 0, i8 %0
+  ret i8 %3
+}
+
+define i16 @cfcmov16rr_inv(i16 %0) {
+; CHECK-LABEL: cfcmov16rr_inv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpw $1, %di
+; CHECK-NEXT:    cfcmovel %edi, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
+  %2 = icmp ne i16 %0, 1
+  %3 = select i1 %2, i16 0, i16 %0
+  ret i16 %3
+}
+
+define i32 @cfcmov32rr_inv(i32 %0) {
+; CHECK-LABEL: cfcmov32rr_inv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    cfcmovbl %edi, %eax
+; CHECK-NEXT:    retq
+  %2 = icmp ugt i32 %0, 1
+  %3 = select i1 %2, i32 0, i32 %0
+  ret i32 %3
+}
+
+define i64 @cfcmov64rr_inv(i64 %0) {
+; CHECK-LABEL: cfcmov64rr_inv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpq $2, %rdi
+; CHECK-NEXT:    cfcmovaeq %rdi, %rax
+; CHECK-NEXT:    retq
+  %2 = icmp ule i64 %0, 1
+  %3 = select i1 %2, i64 0, i64 %0
+  ret i64 %3
+}
diff --git a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.mir b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.mir
index d6a9cda1dc816..e81a4480ba44c 100644
--- a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.mir
+++ b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.mir
@@ -29,6 +29,18 @@
     call void @foo()
     ret void
   }
+
+  define void @test_cmov(i64 %a, i64 %b) {
+  entry:
+    call void @foo()
+    ret void
+  }
+
+  define void @test_cfcmov(i64 %a, i64 %b) {
+  entry:
+    call void @foo()
+    ret void
+  }
 ...
 ---
 name:            test_adc
@@ -166,3 +178,93 @@ body:             |
     RET 0
 
 ...
+---
+name:            test_cmov
+# CHECK-LABEL: name: test_cmov
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $rdi, $rsi
+
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY $rsi
+    CMP64rr %0, %1, implicit-def $eflags
+    %2:gr64 = COPY $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+  ; CHECK:      %[[A_REG:[^:]*]]:gr8 = SETCCr 7, implicit $eflags
+  ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETCCr 2, implicit $eflags
+  ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETCCr 4, implicit $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+    $eflags = COPY %2
+    %3:gr64 = CMOV64rr_ND %0, %1, 7, implicit $eflags
+    %4:gr64 = CMOV64rr_ND %0, %1, 2, implicit $eflags
+    %5:gr64 = CMOV64rr_ND %0, %1, 4, implicit $eflags
+    %6:gr64 = CMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NOT:     $eflags =
+  ; CHECK:         TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %3:gr64 = CMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[B_REG]], %[[B_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %4:gr64 = CMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %5:gr64 = CMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %6:gr64 = CMOV64rr_ND %0, %1, 4, implicit killed $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %6
+
+    RET 0
+...
+---
+name:            test_cfcmov
+# CHECK-LABEL: name: test_cfcmov
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $rdi, $rsi
+
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY $rsi
+    CMP64rr %0, %1, implicit-def $eflags
+    %2:gr64 = COPY $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+  ; CHECK:      %[[A_REG:[^:]*]]:gr8 = SETCCr 7, implicit $eflags
+  ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETCCr 2, implicit $eflags
+  ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETCCr 4, implicit $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+    $eflags = COPY %2
+    %3:gr64 = CFCMOV64rr %1, 7, implicit $eflags
+    %4:gr64 = CFCMOV64rr %1, 2, implicit $eflags
+    %5:gr64 = CFCMOV64rr_ND %0, %1, 4, implicit $eflags
+    %6:gr64 = CFCMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NOT:     $eflags =
+  ; CHECK:         TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %3:gr64 = CFCMOV64rr %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[B_REG]], %[[B_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %4:gr64 = CFCMOV64rr %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %5:gr64 = CFCMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %6:gr64 = CFCMOV64rr_ND %0, %1, 4, implicit killed $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %6
+
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/apx/inc.ll b/llvm/test/CodeGen/X86/apx/inc.ll
index 613f7866c9ac5..a9c6d740cf2ce 100644
--- a/llvm/test/CodeGen/X86/apx/inc.ll
+++ b/llvm/test/CodeGen/X86/apx/inc.ll
@@ -92,9 +92,9 @@ define i8 @uinc8r(i8 noundef %a) {
 ; CHECK-LABEL: uinc8r:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incb %dil, %al
-; CHECK-NEXT:    movzbl %al, %ecx
-; CHECK-NEXT:    movl $255, %eax
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movl $255, %ecx
+; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 entry:
@@ -105,9 +105,9 @@ entry:
 define i16 @uinc16r(i16 noundef %a) {
 ; CHECK-LABEL: uinc16r:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    incw %di, %cx
-; CHECK-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    incw %di, %ax
+; CHECK-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 entry:
@@ -118,9 +118,9 @@ entry:
 define i32 @uinc32r(i32 noundef %a) {
 ; CHECK-LABEL: uinc32r:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    incl %edi, %ecx
-; CHECK-NEXT:    movl $-1, %eax
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    incl %edi, %eax
+; CHECK-NEXT:    movl $-1, %ecx
+; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    retq
 entry:
   %inc = call i32 @llvm.uadd.sat.i32(i32 %a, i32 1)
@@ -130,9 +130,9 @@ entry:
 define i64 @uinc64r(i64 noundef %a) {
 ; CHECK-LABEL: uinc64r:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    incq %rdi, %rcx
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    cmovneq %rcx, %rax
+; CHECK-NEXT:    incq %rdi, %rax
+; CHECK-NEXT:    movq $-1, %rcx
+; CHECK-NEXT:    cmoveq %rcx, %rax
 ; CHECK-NEXT:    retq
 entry:
   %inc = call i64 @llvm.uadd.sat.i64(i64 %a, i64 1)
diff --git a/llvm/test/CodeGen/X86/apx/shift-eflags.ll b/llvm/test/CodeGen/X86/apx/shift-eflags.ll
index f34dc6c05dad9..932cdc189bf9f 100644
--- a/llvm/test/CodeGen/X86/apx/shift-eflags.ll
+++ b/llvm/test/CodeGen/X86/apx/shift-eflags.ll
@@ -7,9 +7,8 @@
 define i32 @ashr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: ashr_const:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    sarl $14, %edi, %edx
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    sarl $14, %edi, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = ashr i32 %a0, 14
   %c = icmp eq i32 %s, 0
@@ -21,9 +20,8 @@ define i32 @ashr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @lshr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: lshr_const:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    testl $-16384, %edi # imm = 0xC000
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = lshr i32 %a0, 14
   %c = icmp eq i32 %s, 0
@@ -35,9 +33,8 @@ define i32 @lshr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @shl_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: shl_const:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    testl $262143, %edi # imm = 0x3FFFF
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = shl i32 %a0, 14
   %c = icmp eq i32 %s, 0
@@ -88,9 +85,8 @@ define i32 @shl_const_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @ashr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: ashr_const1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    sarl %edi, %edx
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    sarl %edi, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = ashr i32 %a0, 1
   %c = icmp eq i32 %s, 0
@@ -102,9 +98,8 @@ define i32 @ashr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @lshr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: lshr_const1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    testl $-2, %edi
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = lshr i32 %a0, 1
   %c = icmp eq i32 %s, 0
@@ -116,9 +111,8 @@ define i32 @lshr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @shl_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: shl_const1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    testl $2147483647, %edi # imm = 0x7FFFFFFF
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = shl i32 %a0, 1
   %c = icmp eq i32 %s, 0
diff --git a/llvm/test/CodeGen/X86/apx/sub.ll b/llvm/test/CodeGen/X86/apx/sub.ll
index 4b0bd14872141..be0914c90b9fa 100644
--- a/llvm/test/CodeGen/X86/apx/sub.ll
+++ b/llvm/test/CodeGen/X86/apx/sub.ll
@@ -299,10 +299,10 @@ declare i64 @llvm.usub.sat.i64(i64, i64)
 define i8 @subflag8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: subflag8rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf7]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subb %sil, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x28,0xf7]
+; CHECK-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -313,9 +313,9 @@ entry:
 define i16 @subflag16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-LABEL: subflag16rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subw %si, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf7]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subw %si, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x29,0xf7]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -326,9 +326,9 @@ entry:
 define i32 @subflag32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK-LABEL: subflag32rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf7]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 %b)
@@ -340,7 +340,7 @@ define i64 @subflag64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 %b)
@@ -350,10 +350,10 @@ entry:
 define i8 @subflag8rm(i8 noundef %a, ptr %b) {
 ; CHECK-LABEL: subflag8rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x2a,0x3e]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subb (%rsi), %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x2a,0x3e]
+; CHECK-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -365,9 +365,9 @@ entry:
 define i16 @subflag16rm(i16 noundef %a, ptr %b) {
 ; CHECK-LABEL: subflag16rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x2b,0x3e]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subw (%rsi), %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x2b,0x3e]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -379,9 +379,9 @@ entry:
 define i32 @subflag32rm(i32 noundef %a, ptr %b) {
 ; CHECK-LABEL: subflag32rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x2b,0x3e]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
@@ -394,7 +394,7 @@ define i64 @subflag64rm(i64 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
@@ -405,9 +405,9 @@ entry:
 define i16 @subflag16ri8(i16 noundef %a) {
 ; CHECK-LABEL: subflag16ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subw $123, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0xef,0x7b]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subw $123, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xef,0x7b]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -418,9 +418,9 @@ entry:
 define i32 @subflag32ri8(i32 noundef %a) {
 ; CHECK-LABEL: subflag32ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xef,0x7b]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123)
@@ -432,7 +432,7 @@ define i64 @subflag64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123)
@@ -442,10 +442,10 @@ entry:
 define i8 @subflag8ri(i8 noundef %a) {
 ; CHECK-LABEL: subflag8ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xef,0x7b]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subb $123, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xef,0x7b]
+; CHECK-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -456,10 +456,10 @@ entry:
 define i16 @subflag16ri(i16 noundef %a) {
 ; CHECK-LABEL: subflag16ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subw $1234, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xef,0xd2,0x04]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subw $1234, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xef,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -470,10 +470,10 @@ entry:
 define i32 @subflag32ri(i32 noundef %a) {
 ; CHECK-LABEL: subflag32ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123456)
@@ -486,7 +486,7 @@ define i64 @subflag64ri(i64 noundef %a) {
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    subq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123456)
diff --git a/llvm/test/CodeGen/X86/asm-dialect-module.ll b/llvm/test/CodeGen/X86/asm-dialect-module.ll
new file mode 100644
index 0000000000000..2c00a44424c2c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/asm-dialect-module.ll
@@ -0,0 +1,10 @@
+;; Test that we respect the assembler dialect when parsing module-level inline asm.
+; RUN: not llc < %s -mtriple=x86_64 2>&1 | FileCheck %s --check-prefix=ERR
+; RUN: llc < %s -mtriple=x86_64 -x86-asm-syntax=intel | FileCheck %s
+
+; ERR: <inline asm>:1:1: error: unknown use of instruction mnemonic without a size suffix
+
+; CHECK: .intel_syntax noprefix
+; CHECK: mov eax, eax
+
+module asm "mov eax, eax"
diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll
new file mode 100644
index 0000000000000..4529ea275df9c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgceils.ll
@@ -0,0 +1,3821 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+;
+; 128-bit vectors
+;
+
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_fixed_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psubb %xmm0, %xmm2
+; SSE-NEXT:    paddb %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512-NEXT:    vpsubb %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i8> %a0, %a1
+  %xor = xor <16 x i8> %a0, %a1
+  %shift = ashr <16 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <16 x i8> %or, %shift
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE2-LABEL: test_ext_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    paddw %xmm2, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm3, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    psubw %xmm1, %xmm4
+; SSE2-NEXT:    psubw %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm3
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm0
+; SSE4-NEXT:    paddw %xmm2, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm1
+; SSE4-NEXT:    paddw %xmm3, %xmm1
+; SSE4-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE4-NEXT:    psubw %xmm2, %xmm0
+; SSE4-NEXT:    psubw %xmm2, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm2, %xmm0
+; SSE4-NEXT:    pand %xmm2, %xmm1
+; SSE4-NEXT:    packuswb %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i8> %a0 to <16 x i16>
+  %x1 = sext <16 x i8> %a1 to <16 x i16>
+  %sum = add <16 x i16> %x0, %x1
+  %inc = add <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = ashr <16 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <16 x i16> %shift to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_fixed_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    psraw $1, %xmm1
+; SSE-NEXT:    psubw %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %or = or <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a1, %a0
+  %shift = ashr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <8 x i16> %or, %shift
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE2-LABEL: test_ext_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psubd %xmm2, %xmm0
+; SSE2-NEXT:    psubd %xmm2, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm3
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm0
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm1
+; SSE4-NEXT:    paddd %xmm3, %xmm1
+; SSE4-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE4-NEXT:    psubd %xmm2, %xmm0
+; SSE4-NEXT:    psubd %xmm2, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm2, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE4-NEXT:    packusdw %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i16> %a0 to <8 x i32>
+  %x1 = sext <8 x i16> %a1 to <8 x i32>
+  %sum = add <8 x i32> %x0, %x1
+  %inc = add <8 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = ashr <8 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <8 x i32> %shift to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_fixed_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    psrad $1, %xmm1
+; SSE-NEXT:    psubd %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %or = or <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a1, %a0
+  %shift = ashr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = sub <4 x i32> %or, %shift
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE2-LABEL: test_ext_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    psubq %xmm1, %xmm4
+; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm2
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm4
+; SSE4-NEXT:    paddq %xmm2, %xmm4
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm0
+; SSE4-NEXT:    paddq %xmm3, %xmm0
+; SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT:    psubq %xmm1, %xmm4
+; SSE4-NEXT:    psubq %xmm1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <4 x i32> %a0 to <4 x i64>
+  %x1 = sext <4 x i32> %a1 to <4 x i64>
+  %sum = add <4 x i64> %x0, %x1
+  %inc = add <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1>
+  %shift = ashr <4 x i64> %inc, <i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <4 x i64> %shift to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psubq %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm0, %xmm2
+; SSE4-NEXT:    por %xmm1, %xmm2
+; SSE4-NEXT:    pxor %xmm0, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    psrad $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE4-NEXT:    psubq %xmm1, %xmm2
+; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpsraq $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+  %or = or <2 x i64> %a0, %a1
+  %xor = xor <2 x i64> %a1, %a0
+  %shift = ashr <2 x i64> %xor, <i64 1, i64 1>
+  %res = sub <2 x i64> %or, %shift
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_ext_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    movq %xmm1, %rdi
+; SSE2-NEXT:    movq %rdi, %r8
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r9
+; SSE2-NEXT:    movq %r9, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    addq %r9, %rdx
+; SSE2-NEXT:    adcq %rsi, %r10
+; SSE2-NEXT:    addq %rdi, %rax
+; SSE2-NEXT:    adcq %rcx, %r8
+; SSE2-NEXT:    addq $1, %rax
+; SSE2-NEXT:    adcq $0, %r8
+; SSE2-NEXT:    addq $1, %rdx
+; SSE2-NEXT:    adcq $0, %r10
+; SSE2-NEXT:    shldq $63, %rdx, %r10
+; SSE2-NEXT:    shldq $63, %rax, %r8
+; SSE2-NEXT:    movq %r8, %xmm0
+; SSE2-NEXT:    movq %r10, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pextrq $1, %xmm0, %rax
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %xmm0, %rdx
+; SSE4-NEXT:    movq %rdx, %rsi
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    pextrq $1, %xmm1, %rdi
+; SSE4-NEXT:    movq %rdi, %r8
+; SSE4-NEXT:    sarq $63, %r8
+; SSE4-NEXT:    movq %xmm1, %r9
+; SSE4-NEXT:    movq %r9, %r10
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    addq %r9, %rdx
+; SSE4-NEXT:    adcq %rsi, %r10
+; SSE4-NEXT:    addq %rdi, %rax
+; SSE4-NEXT:    adcq %rcx, %r8
+; SSE4-NEXT:    addq $1, %rax
+; SSE4-NEXT:    adcq $0, %r8
+; SSE4-NEXT:    addq $1, %rdx
+; SSE4-NEXT:    adcq $0, %r10
+; SSE4-NEXT:    shldq $63, %rdx, %r10
+; SSE4-NEXT:    shldq $63, %rax, %r8
+; SSE4-NEXT:    movq %r8, %xmm1
+; SSE4-NEXT:    movq %r10, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX-NEXT:    movq %rax, %rcx
+; AVX-NEXT:    sarq $63, %rcx
+; AVX-NEXT:    vmovq %xmm0, %rdx
+; AVX-NEXT:    movq %rdx, %rsi
+; AVX-NEXT:    sarq $63, %rsi
+; AVX-NEXT:    vpextrq $1, %xmm1, %rdi
+; AVX-NEXT:    movq %rdi, %r8
+; AVX-NEXT:    sarq $63, %r8
+; AVX-NEXT:    vmovq %xmm1, %r9
+; AVX-NEXT:    movq %r9, %r10
+; AVX-NEXT:    sarq $63, %r10
+; AVX-NEXT:    addq %r9, %rdx
+; AVX-NEXT:    adcq %rsi, %r10
+; AVX-NEXT:    addq %rdi, %rax
+; AVX-NEXT:    adcq %rcx, %r8
+; AVX-NEXT:    addq $1, %rax
+; AVX-NEXT:    adcq $0, %r8
+; AVX-NEXT:    addq $1, %rdx
+; AVX-NEXT:    adcq $0, %r10
+; AVX-NEXT:    shldq $63, %rdx, %r10
+; AVX-NEXT:    shldq $63, %rax, %r8
+; AVX-NEXT:    vmovq %r8, %xmm0
+; AVX-NEXT:    vmovq %r10, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %x0 = sext <2 x i64> %a0 to <2 x i128>
+  %x1 = sext <2 x i64> %a1 to <2 x i128>
+  %sum = add <2 x i128> %x0, %x1
+  %inc = add <2 x i128> %sum, <i128 1, i128 1>
+  %shift = ashr <2 x i128> %inc, <i128 1, i128 1>
+  %res = trunc <2 x i128> %shift to <2 x i64>
+  ret <2 x i64> %res
+}
+
+;
+; 256-bit vectors
+;
+
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_fixed_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psubb %xmm1, %xmm4
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm0
+; SSE-NEXT:    psubb %xmm0, %xmm5
+; SSE-NEXT:    paddb %xmm3, %xmm5
+; SSE-NEXT:    paddb %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512-NEXT:    vpsubb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <32 x i8> %a0, %a1
+  %xor = xor <32 x i8> %a0, %a1
+  %shift = ashr <32 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <32 x i8> %or, %shift
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE2-LABEL: test_ext_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm8
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    paddw %xmm5, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm6, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    paddw %xmm7, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm8, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    psubw %xmm3, %xmm4
+; SSE2-NEXT:    psubw %xmm3, %xmm0
+; SSE2-NEXT:    psubw %xmm3, %xmm2
+; SSE2-NEXT:    psubw %xmm3, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm4
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm5
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm7
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm0
+; SSE4-NEXT:    paddw %xmm4, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm2
+; SSE4-NEXT:    paddw %xmm5, %xmm2
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm1
+; SSE4-NEXT:    paddw %xmm6, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm3
+; SSE4-NEXT:    paddw %xmm7, %xmm3
+; SSE4-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE4-NEXT:    psubw %xmm4, %xmm0
+; SSE4-NEXT:    psubw %xmm4, %xmm2
+; SSE4-NEXT:    psubw %xmm4, %xmm1
+; SSE4-NEXT:    psubw %xmm4, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm4, %xmm0
+; SSE4-NEXT:    pand %xmm4, %xmm2
+; SSE4-NEXT:    packuswb %xmm2, %xmm0
+; SSE4-NEXT:    pand %xmm4, %xmm1
+; SSE4-NEXT:    pand %xmm4, %xmm3
+; SSE4-NEXT:    packuswb %xmm3, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmovsxbw %xmm5, %xmm6
+; AVX1-NEXT:    vpaddw %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm5, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <32 x i8> %a0 to <32 x i16>
+  %x1 = sext <32 x i8> %a1 to <32 x i16>
+  %sum = add <32 x i16> %x0, %x1
+  %inc = add <32 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = ashr <32 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <32 x i16> %shift to <32 x i8>
+  ret <32 x i8> %res
+}
+
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_fixed_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    psraw $1, %xmm3
+; SSE-NEXT:    psubw %xmm3, %xmm4
+; SSE-NEXT:    psraw $1, %xmm2
+; SSE-NEXT:    psubw %xmm2, %xmm5
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubw %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i16> %a0, %a1
+  %xor = xor <16 x i16> %a1, %a0
+  %shift = ashr <16 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <16 x i16> %or, %shift
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE2-LABEL: test_ext_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm6
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    paddd %xmm5, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm6, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    paddd %xmm7, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    psubd %xmm4, %xmm1
+; SSE2-NEXT:    psubd %xmm4, %xmm3
+; SSE2-NEXT:    psubd %xmm4, %xmm0
+; SSE2-NEXT:    psubd %xmm4, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm2, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm4
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm5
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm7
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm0
+; SSE4-NEXT:    paddd %xmm4, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm2
+; SSE4-NEXT:    paddd %xmm5, %xmm2
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm1
+; SSE4-NEXT:    paddd %xmm6, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm3
+; SSE4-NEXT:    paddd %xmm7, %xmm3
+; SSE4-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE4-NEXT:    psubd %xmm4, %xmm0
+; SSE4-NEXT:    psubd %xmm4, %xmm2
+; SSE4-NEXT:    psubd %xmm4, %xmm1
+; SSE4-NEXT:    psubd %xmm4, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm4, %xmm4
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm2, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm3, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmovsxwd %xmm5, %xmm6
+; AVX1-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubd %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i16> %a0 to <16 x i32>
+  %x1 = sext <16 x i16> %a1 to <16 x i32>
+  %sum = add <16 x i32> %x0, %x1
+  %inc = add <16 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = ashr <16 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <16 x i32> %shift to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_fixed_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    psrad $1, %xmm3
+; SSE-NEXT:    psubd %xmm3, %xmm4
+; SSE-NEXT:    psrad $1, %xmm2
+; SSE-NEXT:    psubd %xmm2, %xmm5
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <8 x i32> %a0, %a1
+  %xor = xor <8 x i32> %a1, %a0
+  %shift = ashr <8 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = sub <8 x i32> %or, %shift
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE2-LABEL: test_ext_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm3, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    psubq %xmm3, %xmm4
+; SSE2-NEXT:    psubq %xmm3, %xmm0
+; SSE2-NEXT:    psubq %xmm3, %xmm2
+; SSE2-NEXT:    psubq %xmm3, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm4, %xmm5
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm7
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm4
+; SSE4-NEXT:    paddq %xmm5, %xmm4
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm0
+; SSE4-NEXT:    paddq %xmm6, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm2
+; SSE4-NEXT:    paddq %xmm7, %xmm2
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm1
+; SSE4-NEXT:    paddq %xmm8, %xmm1
+; SSE4-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE4-NEXT:    psubq %xmm3, %xmm4
+; SSE4-NEXT:    psubq %xmm3, %xmm0
+; SSE4-NEXT:    psubq %xmm3, %xmm2
+; SSE4-NEXT:    psubq %xmm3, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm5
+; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpaddq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,2],ymm0[0,2],ymm2[4,6],ymm0[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i32> %a0 to <8 x i64>
+  %x1 = sext <8 x i32> %a1 to <8 x i64>
+  %sum = add <8 x i64> %x0, %x1
+  %inc = add <8 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shift = ashr <8 x i64> %inc, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <8 x i64> %shift to <8 x i32>
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psubq %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psubq %xmm1, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm1, %xmm4
+; SSE4-NEXT:    por %xmm3, %xmm4
+; SSE4-NEXT:    movdqa %xmm0, %xmm5
+; SSE4-NEXT:    por %xmm2, %xmm5
+; SSE4-NEXT:    pxor %xmm0, %xmm2
+; SSE4-NEXT:    pxor %xmm1, %xmm3
+; SSE4-NEXT:    movdqa %xmm3, %xmm0
+; SSE4-NEXT:    psrad $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
+; SSE4-NEXT:    psubq %xmm3, %xmm4
+; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    psrad $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE4-NEXT:    psubq %xmm2, %xmm5
+; SSE4-NEXT:    movdqa %xmm5, %xmm0
+; SSE4-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsraq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <4 x i64> %a0, %a1
+  %xor = xor <4 x i64> %a1, %a0
+  %shift = ashr <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
+  %res = sub <4 x i64> %or, %shift
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_ext_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    movq %xmm0, %r11
+; SSE2-NEXT:    movq %r11, %r12
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %rcx, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movq %xmm1, %rdx
+; SSE2-NEXT:    movq %rdx, %r14
+; SSE2-NEXT:    sarq $63, %r14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r9
+; SSE2-NEXT:    movq %r9, %r15
+; SSE2-NEXT:    sarq $63, %r15
+; SSE2-NEXT:    movq %xmm2, %rsi
+; SSE2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r13
+; SSE2-NEXT:    movq %r13, %r8
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    movq %xmm3, %rbp
+; SSE2-NEXT:    movq %rbp, %rdi
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    addq %rax, %r9
+; SSE2-NEXT:    adcq %r15, %r10
+; SSE2-NEXT:    addq %rbp, %rdx
+; SSE2-NEXT:    adcq %r14, %rdi
+; SSE2-NEXT:    addq %r13, %rcx
+; SSE2-NEXT:    adcq %rbx, %r8
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE2-NEXT:    adcq %r12, %rsi
+; SSE2-NEXT:    addq $1, %r11
+; SSE2-NEXT:    adcq $0, %rsi
+; SSE2-NEXT:    addq $1, %rcx
+; SSE2-NEXT:    adcq $0, %r8
+; SSE2-NEXT:    addq $1, %rdx
+; SSE2-NEXT:    adcq $0, %rdi
+; SSE2-NEXT:    addq $1, %r9
+; SSE2-NEXT:    adcq $0, %r10
+; SSE2-NEXT:    shldq $63, %r9, %r10
+; SSE2-NEXT:    shldq $63, %rdx, %rdi
+; SSE2-NEXT:    shldq $63, %rcx, %r8
+; SSE2-NEXT:    shldq $63, %r11, %rsi
+; SSE2-NEXT:    movq %rsi, %xmm0
+; SSE2-NEXT:    movq %r8, %xmm2
+; SSE2-NEXT:    movq %rdi, %xmm1
+; SSE2-NEXT:    movq %r10, %xmm3
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    pextrq $1, %xmm0, %r11
+; SSE4-NEXT:    movq %r11, %r12
+; SSE4-NEXT:    sarq $63, %r12
+; SSE4-NEXT:    movq %xmm0, %rcx
+; SSE4-NEXT:    movq %rcx, %rbx
+; SSE4-NEXT:    sarq $63, %rbx
+; SSE4-NEXT:    pextrq $1, %xmm1, %rdx
+; SSE4-NEXT:    movq %rdx, %r14
+; SSE4-NEXT:    sarq $63, %r14
+; SSE4-NEXT:    movq %xmm1, %r9
+; SSE4-NEXT:    movq %r9, %r15
+; SSE4-NEXT:    sarq $63, %r15
+; SSE4-NEXT:    pextrq $1, %xmm2, %rsi
+; SSE4-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    movq %xmm2, %r13
+; SSE4-NEXT:    movq %r13, %r8
+; SSE4-NEXT:    sarq $63, %r8
+; SSE4-NEXT:    pextrq $1, %xmm3, %rbp
+; SSE4-NEXT:    movq %rbp, %rdi
+; SSE4-NEXT:    sarq $63, %rdi
+; SSE4-NEXT:    movq %xmm3, %rax
+; SSE4-NEXT:    movq %rax, %r10
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    addq %rax, %r9
+; SSE4-NEXT:    adcq %r15, %r10
+; SSE4-NEXT:    addq %rbp, %rdx
+; SSE4-NEXT:    adcq %r14, %rdi
+; SSE4-NEXT:    addq %r13, %rcx
+; SSE4-NEXT:    adcq %rbx, %r8
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE4-NEXT:    adcq %r12, %rsi
+; SSE4-NEXT:    addq $1, %r11
+; SSE4-NEXT:    adcq $0, %rsi
+; SSE4-NEXT:    addq $1, %rcx
+; SSE4-NEXT:    adcq $0, %r8
+; SSE4-NEXT:    addq $1, %rdx
+; SSE4-NEXT:    adcq $0, %rdi
+; SSE4-NEXT:    addq $1, %r9
+; SSE4-NEXT:    adcq $0, %r10
+; SSE4-NEXT:    shldq $63, %r9, %r10
+; SSE4-NEXT:    shldq $63, %rdx, %rdi
+; SSE4-NEXT:    shldq $63, %rcx, %r8
+; SSE4-NEXT:    shldq $63, %r11, %rsi
+; SSE4-NEXT:    movq %rsi, %xmm2
+; SSE4-NEXT:    movq %r8, %xmm0
+; SSE4-NEXT:    movq %rdi, %xmm3
+; SSE4-NEXT:    movq %r10, %xmm1
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX1-NEXT:    movq %r11, %r12
+; AVX1-NEXT:    sarq $63, %r12
+; AVX1-NEXT:    vmovq %xmm2, %rcx
+; AVX1-NEXT:    movq %rcx, %rbx
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    movq %rdx, %r14
+; AVX1-NEXT:    sarq $63, %r14
+; AVX1-NEXT:    vmovq %xmm0, %r8
+; AVX1-NEXT:    movq %r8, %r15
+; AVX1-NEXT:    sarq $63, %r15
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    vmovq %xmm0, %r13
+; AVX1-NEXT:    movq %r13, %rdi
+; AVX1-NEXT:    sarq $63, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rbp
+; AVX1-NEXT:    movq %rbp, %r9
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    movq %rax, %r10
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    addq %rax, %r8
+; AVX1-NEXT:    adcq %r15, %r10
+; AVX1-NEXT:    addq %rbp, %rdx
+; AVX1-NEXT:    adcq %r14, %r9
+; AVX1-NEXT:    addq %r13, %rcx
+; AVX1-NEXT:    adcq %rbx, %rdi
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX1-NEXT:    adcq %r12, %rsi
+; AVX1-NEXT:    addq $1, %r11
+; AVX1-NEXT:    adcq $0, %rsi
+; AVX1-NEXT:    addq $1, %rcx
+; AVX1-NEXT:    adcq $0, %rdi
+; AVX1-NEXT:    addq $1, %rdx
+; AVX1-NEXT:    adcq $0, %r9
+; AVX1-NEXT:    addq $1, %r8
+; AVX1-NEXT:    adcq $0, %r10
+; AVX1-NEXT:    shldq $63, %r8, %r10
+; AVX1-NEXT:    shldq $63, %rdx, %r9
+; AVX1-NEXT:    shldq $63, %rcx, %rdi
+; AVX1-NEXT:    shldq $63, %r11, %rsi
+; AVX1-NEXT:    vmovq %rsi, %xmm0
+; AVX1-NEXT:    vmovq %rdi, %xmm1
+; AVX1-NEXT:    vmovq %r9, %xmm2
+; AVX1-NEXT:    vmovq %r10, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX2-NEXT:    movq %r11, %r12
+; AVX2-NEXT:    sarq $63, %r12
+; AVX2-NEXT:    vmovq %xmm2, %rcx
+; AVX2-NEXT:    movq %rcx, %rbx
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    movq %rdx, %r14
+; AVX2-NEXT:    sarq $63, %r14
+; AVX2-NEXT:    vmovq %xmm0, %r8
+; AVX2-NEXT:    movq %r8, %r15
+; AVX2-NEXT:    sarq $63, %r15
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    vmovq %xmm0, %r13
+; AVX2-NEXT:    movq %r13, %rdi
+; AVX2-NEXT:    sarq $63, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rbp
+; AVX2-NEXT:    movq %rbp, %r9
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    movq %rax, %r10
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    addq %rax, %r8
+; AVX2-NEXT:    adcq %r15, %r10
+; AVX2-NEXT:    addq %rbp, %rdx
+; AVX2-NEXT:    adcq %r14, %r9
+; AVX2-NEXT:    addq %r13, %rcx
+; AVX2-NEXT:    adcq %rbx, %rdi
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX2-NEXT:    adcq %r12, %rsi
+; AVX2-NEXT:    addq $1, %r11
+; AVX2-NEXT:    adcq $0, %rsi
+; AVX2-NEXT:    addq $1, %rcx
+; AVX2-NEXT:    adcq $0, %rdi
+; AVX2-NEXT:    addq $1, %rdx
+; AVX2-NEXT:    adcq $0, %r9
+; AVX2-NEXT:    addq $1, %r8
+; AVX2-NEXT:    adcq $0, %r10
+; AVX2-NEXT:    shldq $63, %r8, %r10
+; AVX2-NEXT:    shldq $63, %rdx, %r9
+; AVX2-NEXT:    shldq $63, %rcx, %rdi
+; AVX2-NEXT:    shldq $63, %r11, %rsi
+; AVX2-NEXT:    vmovq %rsi, %xmm0
+; AVX2-NEXT:    vmovq %rdi, %xmm1
+; AVX2-NEXT:    vmovq %r9, %xmm2
+; AVX2-NEXT:    vmovq %r10, %xmm3
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX512-NEXT:    movq %r11, %r12
+; AVX512-NEXT:    sarq $63, %r12
+; AVX512-NEXT:    vmovq %xmm2, %rcx
+; AVX512-NEXT:    movq %rcx, %rbx
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT:    sarq $63, %rbx
+; AVX512-NEXT:    movq %rdx, %r14
+; AVX512-NEXT:    sarq $63, %r14
+; AVX512-NEXT:    vmovq %xmm0, %rdi
+; AVX512-NEXT:    movq %rdi, %r15
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r15
+; AVX512-NEXT:    sarq $63, %rsi
+; AVX512-NEXT:    vmovq %xmm0, %r13
+; AVX512-NEXT:    movq %r13, %r8
+; AVX512-NEXT:    sarq $63, %r8
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rbp
+; AVX512-NEXT:    movq %rbp, %r9
+; AVX512-NEXT:    sarq $63, %r9
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    movq %rax, %r10
+; AVX512-NEXT:    sarq $63, %r10
+; AVX512-NEXT:    addq %rax, %rdi
+; AVX512-NEXT:    adcq %r15, %r10
+; AVX512-NEXT:    addq %rbp, %rdx
+; AVX512-NEXT:    adcq %r14, %r9
+; AVX512-NEXT:    addq %r13, %rcx
+; AVX512-NEXT:    adcq %rbx, %r8
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX512-NEXT:    adcq %r12, %rsi
+; AVX512-NEXT:    addq $1, %r11
+; AVX512-NEXT:    adcq $0, %rsi
+; AVX512-NEXT:    addq $1, %rcx
+; AVX512-NEXT:    adcq $0, %r8
+; AVX512-NEXT:    addq $1, %rdx
+; AVX512-NEXT:    adcq $0, %r9
+; AVX512-NEXT:    addq $1, %rdi
+; AVX512-NEXT:    adcq $0, %r10
+; AVX512-NEXT:    shldq $63, %rdi, %r10
+; AVX512-NEXT:    shldq $63, %rdx, %r9
+; AVX512-NEXT:    shldq $63, %rcx, %r8
+; AVX512-NEXT:    shldq $63, %r11, %rsi
+; AVX512-NEXT:    vmovq %rsi, %xmm0
+; AVX512-NEXT:    vmovq %r8, %xmm1
+; AVX512-NEXT:    vmovq %r9, %xmm2
+; AVX512-NEXT:    vmovq %r10, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = sext <4 x i64> %a0 to <4 x i128>
+  %x1 = sext <4 x i64> %a1 to <4 x i128>
+  %sum = add <4 x i128> %x0, %x1
+  %inc = add <4 x i128> %sum, <i128 1, i128 1, i128 1, i128 1>
+  %shift = ashr <4 x i128> %inc, <i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <4 x i128> %shift to <4 x i64>
+  ret <4 x i64> %res
+}
+
+;
+; 512-bit vectors
+;
+
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_fixed_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm11
+; SSE-NEXT:    movdqa %xmm2, %xmm8
+; SSE-NEXT:    movdqa %xmm1, %xmm9
+; SSE-NEXT:    movdqa %xmm0, %xmm10
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm6, %xmm2
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    por %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm4, %xmm10
+; SSE-NEXT:    pxor %xmm5, %xmm9
+; SSE-NEXT:    pxor %xmm6, %xmm8
+; SSE-NEXT:    pxor %xmm7, %xmm11
+; SSE-NEXT:    psrlw $1, %xmm11
+; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm5, %xmm11
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm4, %xmm11
+; SSE-NEXT:    psubb %xmm11, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm8
+; SSE-NEXT:    pand %xmm5, %xmm8
+; SSE-NEXT:    pxor %xmm4, %xmm8
+; SSE-NEXT:    psubb %xmm8, %xmm2
+; SSE-NEXT:    psrlw $1, %xmm9
+; SSE-NEXT:    pand %xmm5, %xmm9
+; SSE-NEXT:    pxor %xmm4, %xmm9
+; SSE-NEXT:    psubb %xmm9, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    pxor %xmm4, %xmm10
+; SSE-NEXT:    psubb %xmm10, %xmm0
+; SSE-NEXT:    paddb %xmm4, %xmm0
+; SSE-NEXT:    paddb %xmm4, %xmm1
+; SSE-NEXT:    paddb %xmm4, %xmm2
+; SSE-NEXT:    paddb %xmm4, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm7, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512-NEXT:    vpsubb %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <64 x i8> %a0, %a1
+  %xor = xor <64 x i8> %a0, %a1
+  %shift = ashr <64 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <64 x i8> %or, %shift
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE2-LABEL: test_ext_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm3, %xmm8
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm14
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm15
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm13
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15]
+; SSE2-NEXT:    psraw $8, %xmm12
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; SSE2-NEXT:    psraw $8, %xmm11
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
+; SSE2-NEXT:    psraw $8, %xmm10
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; SSE2-NEXT:    psraw $8, %xmm9
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE2-NEXT:    psraw $8, %xmm8
+; SSE2-NEXT:    paddw %xmm14, %xmm8
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm15, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    paddw %xmm3, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm13, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    paddw %xmm12, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    paddw %xmm11, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    paddw %xmm10, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    paddw %xmm9, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE2-NEXT:    psubw %xmm7, %xmm8
+; SSE2-NEXT:    psubw %xmm7, %xmm0
+; SSE2-NEXT:    psubw %xmm7, %xmm4
+; SSE2-NEXT:    psubw %xmm7, %xmm1
+; SSE2-NEXT:    psubw %xmm7, %xmm5
+; SSE2-NEXT:    psubw %xmm7, %xmm2
+; SSE2-NEXT:    psubw %xmm7, %xmm6
+; SSE2-NEXT:    psubw %xmm7, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm6
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm7, %xmm8
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    packuswb %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    packuswb %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    packuswb %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm7, %xmm6
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    packuswb %xmm6, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v64i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm9
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm11
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm13
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm15
+; SSE4-NEXT:    pmovsxbw %xmm4, %xmm0
+; SSE4-NEXT:    paddw %xmm8, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm4
+; SSE4-NEXT:    paddw %xmm9, %xmm4
+; SSE4-NEXT:    pmovsxbw %xmm5, %xmm1
+; SSE4-NEXT:    paddw %xmm10, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm5
+; SSE4-NEXT:    paddw %xmm11, %xmm5
+; SSE4-NEXT:    pmovsxbw %xmm6, %xmm2
+; SSE4-NEXT:    paddw %xmm12, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm6
+; SSE4-NEXT:    paddw %xmm13, %xmm6
+; SSE4-NEXT:    pmovsxbw %xmm7, %xmm3
+; SSE4-NEXT:    paddw %xmm14, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm7, %xmm7
+; SSE4-NEXT:    paddw %xmm15, %xmm7
+; SSE4-NEXT:    pcmpeqd %xmm8, %xmm8
+; SSE4-NEXT:    psubw %xmm8, %xmm0
+; SSE4-NEXT:    psubw %xmm8, %xmm4
+; SSE4-NEXT:    psubw %xmm8, %xmm1
+; SSE4-NEXT:    psubw %xmm8, %xmm5
+; SSE4-NEXT:    psubw %xmm8, %xmm2
+; SSE4-NEXT:    psubw %xmm8, %xmm6
+; SSE4-NEXT:    psubw %xmm8, %xmm3
+; SSE4-NEXT:    psubw %xmm8, %xmm7
+; SSE4-NEXT:    psrlw $1, %xmm7
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm6
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm5
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm8, %xmm0
+; SSE4-NEXT:    pand %xmm8, %xmm4
+; SSE4-NEXT:    packuswb %xmm4, %xmm0
+; SSE4-NEXT:    pand %xmm8, %xmm1
+; SSE4-NEXT:    pand %xmm8, %xmm5
+; SSE4-NEXT:    packuswb %xmm5, %xmm1
+; SSE4-NEXT:    pand %xmm8, %xmm2
+; SSE4-NEXT:    pand %xmm8, %xmm6
+; SSE4-NEXT:    packuswb %xmm6, %xmm2
+; SSE4-NEXT:    pand %xmm8, %xmm3
+; SSE4-NEXT:    pand %xmm8, %xmm7
+; SSE4-NEXT:    packuswb %xmm7, %xmm3
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpmovsxbw %xmm7, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm10
+; AVX1-NEXT:    vpmovsxbw %xmm10, %xmm11
+; AVX1-NEXT:    vpaddw %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm10, %xmm10
+; AVX1-NEXT:    vpaddw %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm10
+; AVX1-NEXT:    vpaddw %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm10
+; AVX1-NEXT:    vpaddw %xmm10, %xmm8, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm7
+; AVX1-NEXT:    vpaddw %xmm7, %xmm9, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubw %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm8, %xmm8
+; AVX1-NEXT:    vpsubw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm7, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm5
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpand %xmm6, %xmm8, %xmm5
+; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm8, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpand %xmm1, %xmm8, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpmovsxbw %xmm4, %ymm4
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT:    vpmovsxbw %xmm6, %ymm6
+; AVX2-NEXT:    vpaddw %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpaddw %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpsubw %ymm3, %ymm4, %ymm4
+; AVX2-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm4, %ymm3
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm3
+; AVX512-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubw %zmm1, %zmm2, %zmm2
+; AVX512-NEXT:    vpsubw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <64 x i8> %a0 to <64 x i16>
+  %x1 = sext <64 x i8> %a1 to <64 x i16>
+  %sum = add <64 x i16> %x0, %x1
+  %inc = add <64 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = ashr <64 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <64 x i16> %shift to <64 x i8>
+  ret <64 x i8> %res
+}
+
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_fixed_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    por %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    por %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    por %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm8, %xmm7
+; SSE-NEXT:    psraw $1, %xmm7
+; SSE-NEXT:    psubw %xmm7, %xmm3
+; SSE-NEXT:    psraw $1, %xmm6
+; SSE-NEXT:    psubw %xmm6, %xmm9
+; SSE-NEXT:    psraw $1, %xmm5
+; SSE-NEXT:    psubw %xmm5, %xmm10
+; SSE-NEXT:    psraw $1, %xmm4
+; SSE-NEXT:    psubw %xmm4, %xmm11
+; SSE-NEXT:    movdqa %xmm11, %xmm0
+; SSE-NEXT:    movdqa %xmm10, %xmm1
+; SSE-NEXT:    movdqa %xmm9, %xmm2
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubw %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsraw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsraw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubw %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <32 x i16> %a0, %a1
+  %xor = xor <32 x i16> %a1, %a0
+  %shift = ashr <32 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <32 x i16> %or, %shift
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE2-LABEL: test_ext_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
+; SSE2-NEXT:    psrad $16, %xmm13
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7]
+; SSE2-NEXT:    psrad $16, %xmm14
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; SSE2-NEXT:    psrad $16, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
+; SSE2-NEXT:    psrad $16, %xmm12
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm11
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm10
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    paddd %xmm13, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    paddd %xmm14, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    paddd %xmm15, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm6
+; SSE2-NEXT:    paddd %xmm12, %xmm6
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm11, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    paddd %xmm10, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm9, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    paddd %xmm8, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm8
+; SSE2-NEXT:    psubd %xmm8, %xmm3
+; SSE2-NEXT:    psubd %xmm8, %xmm7
+; SSE2-NEXT:    psubd %xmm8, %xmm2
+; SSE2-NEXT:    psubd %xmm8, %xmm6
+; SSE2-NEXT:    psubd %xmm8, %xmm1
+; SSE2-NEXT:    psubd %xmm8, %xmm5
+; SSE2-NEXT:    psubd %xmm8, %xmm0
+; SSE2-NEXT:    psubd %xmm8, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm4, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm5, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm6
+; SSE2-NEXT:    psrad $16, %xmm6
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm6, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm7
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    pslld $15, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    packssdw %xmm7, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm9
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm11
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm13
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm15
+; SSE4-NEXT:    pmovsxwd %xmm4, %xmm0
+; SSE4-NEXT:    paddd %xmm8, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm4
+; SSE4-NEXT:    paddd %xmm9, %xmm4
+; SSE4-NEXT:    pmovsxwd %xmm5, %xmm1
+; SSE4-NEXT:    paddd %xmm10, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm5
+; SSE4-NEXT:    paddd %xmm11, %xmm5
+; SSE4-NEXT:    pmovsxwd %xmm6, %xmm2
+; SSE4-NEXT:    paddd %xmm12, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm6
+; SSE4-NEXT:    paddd %xmm13, %xmm6
+; SSE4-NEXT:    pmovsxwd %xmm7, %xmm3
+; SSE4-NEXT:    paddd %xmm14, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm7, %xmm7
+; SSE4-NEXT:    paddd %xmm15, %xmm7
+; SSE4-NEXT:    pcmpeqd %xmm8, %xmm8
+; SSE4-NEXT:    psubd %xmm8, %xmm0
+; SSE4-NEXT:    psubd %xmm8, %xmm4
+; SSE4-NEXT:    psubd %xmm8, %xmm1
+; SSE4-NEXT:    psubd %xmm8, %xmm5
+; SSE4-NEXT:    psubd %xmm8, %xmm2
+; SSE4-NEXT:    psubd %xmm8, %xmm6
+; SSE4-NEXT:    psubd %xmm8, %xmm3
+; SSE4-NEXT:    psubd %xmm8, %xmm7
+; SSE4-NEXT:    psrld $1, %xmm7
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm6
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm5
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm8, %xmm8
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2],xmm8[3],xmm0[4],xmm8[5],xmm0[6],xmm8[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2],xmm8[3],xmm4[4],xmm8[5],xmm4[6],xmm8[7]
+; SSE4-NEXT:    packusdw %xmm4, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2],xmm8[3],xmm1[4],xmm8[5],xmm1[6],xmm8[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4],xmm8[5],xmm5[6],xmm8[7]
+; SSE4-NEXT:    packusdw %xmm5, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2],xmm8[3],xmm2[4],xmm8[5],xmm2[6],xmm8[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4],xmm8[5],xmm6[6],xmm8[7]
+; SSE4-NEXT:    packusdw %xmm6, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2],xmm8[3],xmm3[4],xmm8[5],xmm3[6],xmm8[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2],xmm8[3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]
+; SSE4-NEXT:    packusdw %xmm7, %xmm3
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm4, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpmovsxwd %xmm7, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm10
+; AVX1-NEXT:    vpmovsxwd %xmm10, %xmm11
+; AVX1-NEXT:    vpaddd %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm10, %xmm10
+; AVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm10
+; AVX1-NEXT:    vpaddd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm10
+; AVX1-NEXT:    vpaddd %xmm10, %xmm8, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm7
+; AVX1-NEXT:    vpaddd %xmm7, %xmm9, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm3, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubd %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm8, %xmm8
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm7, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4],xmm8[5],xmm5[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2],xmm8[3],xmm4[4],xmm8[5],xmm4[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4],xmm8[5],xmm6[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2],xmm8[3],xmm0[4],xmm8[5],xmm0[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0],xmm8[1],xmm7[2],xmm8[3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2],xmm8[3],xmm2[4],xmm8[5],xmm2[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2],xmm8[3],xmm3[4],xmm8[5],xmm3[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2],xmm8[3],xmm1[4],xmm8[5],xmm1[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpmovsxwd %xmm4, %ymm4
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpmovsxwd %xmm5, %ymm5
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT:    vpmovsxwd %xmm6, %ymm6
+; AVX2-NEXT:    vpaddd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT:    vpaddd %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpsubd %ymm3, %ymm4, %ymm4
+; AVX2-NEXT:    vpsubd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7],ymm3[8],ymm4[9],ymm3[10],ymm4[11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7],ymm2[8],ymm4[9],ymm2[10],ymm4[11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm3
+; AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubd %zmm1, %zmm2, %zmm2
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <32 x i16> %a0 to <32 x i32>
+  %x1 = sext <32 x i16> %a1 to <32 x i32>
+  %sum = add <32 x i32> %x0, %x1
+  %inc = add <32 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = ashr <32 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <32 x i32> %shift to <32 x i16>
+  ret <32 x i16> %res
+}
+
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE-LABEL: test_fixed_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    por %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    por %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    por %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm8, %xmm7
+; SSE-NEXT:    psrad $1, %xmm7
+; SSE-NEXT:    psubd %xmm7, %xmm3
+; SSE-NEXT:    psrad $1, %xmm6
+; SSE-NEXT:    psubd %xmm6, %xmm9
+; SSE-NEXT:    psrad $1, %xmm5
+; SSE-NEXT:    psubd %xmm5, %xmm10
+; SSE-NEXT:    psrad $1, %xmm4
+; SSE-NEXT:    psubd %xmm4, %xmm11
+; SSE-NEXT:    movdqa %xmm11, %xmm0
+; SSE-NEXT:    movdqa %xmm10, %xmm1
+; SSE-NEXT:    movdqa %xmm9, %xmm2
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrad $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i32> %a0, %a1
+  %xor = xor <16 x i32> %a1, %a0
+  %shift = ashr <16 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = sub <16 x i32> %or, %shift
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE2-LABEL: test_ext_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm13, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm12, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm14, %xmm14
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm14
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1]
+; SSE2-NEXT:    paddq %xmm13, %xmm8
+; SSE2-NEXT:    pxor %xmm13, %xmm13
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm13
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm13, %xmm13
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm13
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
+; SSE2-NEXT:    paddq %xmm12, %xmm4
+; SSE2-NEXT:    pxor %xmm12, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm12
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
+; SSE2-NEXT:    paddq %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm12, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm12
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
+; SSE2-NEXT:    paddq %xmm11, %xmm5
+; SSE2-NEXT:    pxor %xmm11, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm11
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm11, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm11
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
+; SSE2-NEXT:    paddq %xmm10, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE2-NEXT:    psubq %xmm7, %xmm8
+; SSE2-NEXT:    psubq %xmm7, %xmm0
+; SSE2-NEXT:    psubq %xmm7, %xmm4
+; SSE2-NEXT:    psubq %xmm7, %xmm1
+; SSE2-NEXT:    psubq %xmm7, %xmm5
+; SSE2-NEXT:    psubq %xmm7, %xmm2
+; SSE2-NEXT:    psubq %xmm7, %xmm6
+; SSE2-NEXT:    psubq %xmm7, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm8
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm8, %xmm9
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm11
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm13
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm15
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm8
+; SSE4-NEXT:    paddq %xmm9, %xmm8
+; SSE4-NEXT:    pmovsxdq %xmm4, %xmm0
+; SSE4-NEXT:    paddq %xmm10, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm4
+; SSE4-NEXT:    paddq %xmm11, %xmm4
+; SSE4-NEXT:    pmovsxdq %xmm5, %xmm1
+; SSE4-NEXT:    paddq %xmm12, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm5
+; SSE4-NEXT:    paddq %xmm13, %xmm5
+; SSE4-NEXT:    pmovsxdq %xmm6, %xmm2
+; SSE4-NEXT:    paddq %xmm14, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm6
+; SSE4-NEXT:    paddq %xmm15, %xmm6
+; SSE4-NEXT:    pmovsxdq %xmm7, %xmm3
+; SSE4-NEXT:    paddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE4-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE4-NEXT:    psubq %xmm7, %xmm8
+; SSE4-NEXT:    psubq %xmm7, %xmm0
+; SSE4-NEXT:    psubq %xmm7, %xmm4
+; SSE4-NEXT:    psubq %xmm7, %xmm1
+; SSE4-NEXT:    psubq %xmm7, %xmm5
+; SSE4-NEXT:    psubq %xmm7, %xmm2
+; SSE4-NEXT:    psubq %xmm7, %xmm6
+; SSE4-NEXT:    psubq %xmm7, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm8
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm8
+; AVX1-NEXT:    vpmovsxdq %xmm8, %xmm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm8, %xmm8
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT:    vpaddq %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm10
+; AVX1-NEXT:    vpmovsxdq %xmm10, %xmm11
+; AVX1-NEXT:    vpaddq %xmm6, %xmm11, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm5
+; AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
+; AVX1-NEXT:    vpmovsxdq %xmm7, %xmm10
+; AVX1-NEXT:    vpaddq %xmm10, %xmm9, %xmm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubq %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubq %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubq %xmm7, %xmm9, %xmm8
+; AVX1-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrlq $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[0,2],ymm0[0,2],ymm4[4,6],ymm0[4,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[0,2],ymm2[4,6],ymm1[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm4
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm6
+; AVX2-NEXT:    vpaddq %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm6
+; AVX2-NEXT:    vpaddq %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubq %ymm2, %ymm4, %ymm3
+; AVX2-NEXT:    vpsubq %ymm2, %ymm5, %ymm4
+; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm3
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm1, %zmm2, %zmm2
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i32> %a0 to <16 x i64>
+  %x1 = sext <16 x i32> %a1 to <16 x i64>
+  %sum = add <16 x i64> %x0, %x1
+  %inc = add <16 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shift = ashr <16 x i64> %inc, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <16 x i64> %shift to <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm3, %xmm8
+; SSE2-NEXT:    movdqa %xmm2, %xmm9
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    movdqa %xmm0, %xmm11
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm11, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm8
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; SSE2-NEXT:    psubq %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; SSE2-NEXT:    psubq %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm6
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSE2-NEXT:    psubq %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT:    psubq %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm8
+; SSE4-NEXT:    movdqa %xmm2, %xmm9
+; SSE4-NEXT:    movdqa %xmm1, %xmm10
+; SSE4-NEXT:    movdqa %xmm0, %xmm11
+; SSE4-NEXT:    por %xmm7, %xmm3
+; SSE4-NEXT:    por %xmm6, %xmm2
+; SSE4-NEXT:    por %xmm5, %xmm1
+; SSE4-NEXT:    por %xmm4, %xmm0
+; SSE4-NEXT:    pxor %xmm11, %xmm4
+; SSE4-NEXT:    pxor %xmm10, %xmm5
+; SSE4-NEXT:    pxor %xmm9, %xmm6
+; SSE4-NEXT:    pxor %xmm8, %xmm7
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    psrad $1, %xmm8
+; SSE4-NEXT:    psrlq $1, %xmm7
+; SSE4-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7]
+; SSE4-NEXT:    psubq %xmm7, %xmm3
+; SSE4-NEXT:    movdqa %xmm6, %xmm7
+; SSE4-NEXT:    psrad $1, %xmm7
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
+; SSE4-NEXT:    psubq %xmm6, %xmm2
+; SSE4-NEXT:    movdqa %xmm5, %xmm6
+; SSE4-NEXT:    psrad $1, %xmm6
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
+; SSE4-NEXT:    psubq %xmm5, %xmm1
+; SSE4-NEXT:    movdqa %xmm4, %xmm5
+; SSE4-NEXT:    psrad $1, %xmm5
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE4-NEXT:    psubq %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpsubq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsraq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <8 x i64> %a0, %a1
+  %xor = xor <8 x i64> %a1, %a0
+  %shift = ashr <8 x i64> %xor, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = sub <8 x i64> %or, %shift
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_ext_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 64
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rax
+; SSE2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm3, %rbx
+; SSE2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    movq %rdi, %rbp
+; SSE2-NEXT:    sarq $63, %rbp
+; SSE2-NEXT:    movq %xmm4, %r8
+; SSE2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r11
+; SSE2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r11
+; SSE2-NEXT:    movq %xmm5, %r10
+; SSE2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r15
+; SSE2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r15
+; SSE2-NEXT:    movq %xmm6, %r9
+; SSE2-NEXT:    movq %r9, %r14
+; SSE2-NEXT:    sarq $63, %r14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    movq %rsi, %r13
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    movq %xmm7, %rdx
+; SSE2-NEXT:    movq %rdx, %r12
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    addq %rax, %rdi
+; SSE2-NEXT:    adcq %rbp, %rcx
+; SSE2-NEXT:    addq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    adcq %rbx, %r12
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE2-NEXT:    addq %rsi, %rbp
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    addq %r9, %rbx
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE2-NEXT:    adcq (%rsp), %r8 # 8-byte Folded Reload
+; SSE2-NEXT:    addq $1, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    adcq $0, %r8
+; SSE2-NEXT:    addq $1, %rdx
+; SSE2-NEXT:    adcq $0, %r11
+; SSE2-NEXT:    addq $1, %rsi
+; SSE2-NEXT:    adcq $0, %r10
+; SSE2-NEXT:    addq $1, %r9
+; SSE2-NEXT:    adcq $0, %r15
+; SSE2-NEXT:    addq $1, %rbx
+; SSE2-NEXT:    adcq $0, %r14
+; SSE2-NEXT:    addq $1, %rbp
+; SSE2-NEXT:    adcq $0, %r13
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    addq $1, %rax
+; SSE2-NEXT:    adcq $0, %r12
+; SSE2-NEXT:    addq $1, %rdi
+; SSE2-NEXT:    adcq $0, %rcx
+; SSE2-NEXT:    shldq $63, %rdi, %rcx
+; SSE2-NEXT:    shldq $63, %rax, %r12
+; SSE2-NEXT:    shldq $63, %rbp, %r13
+; SSE2-NEXT:    shldq $63, %rbx, %r14
+; SSE2-NEXT:    shldq $63, %r9, %r15
+; SSE2-NEXT:    shldq $63, %rsi, %r10
+; SSE2-NEXT:    shldq $63, %rdx, %r11
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rax, %r8
+; SSE2-NEXT:    movq %r8, %xmm0
+; SSE2-NEXT:    movq %r11, %xmm4
+; SSE2-NEXT:    movq %r10, %xmm1
+; SSE2-NEXT:    movq %r15, %xmm5
+; SSE2-NEXT:    movq %r14, %xmm2
+; SSE2-NEXT:    movq %r13, %xmm6
+; SSE2-NEXT:    movq %r12, %xmm3
+; SSE2-NEXT:    movq %rcx, %xmm7
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    subq $16, %rsp
+; SSE4-NEXT:    .cfi_def_cfa_offset 72
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    pextrq $1, %xmm0, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm0, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm1, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm1, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm2, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm2, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm3, %r13
+; SSE4-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r13
+; SSE4-NEXT:    movq %xmm3, %rax
+; SSE4-NEXT:    movq %rax, %rsi
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    pextrq $1, %xmm4, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm4, %r11
+; SSE4-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r11
+; SSE4-NEXT:    pextrq $1, %xmm5, %r10
+; SSE4-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    movq %xmm5, %rax
+; SSE4-NEXT:    movq %rax, %r14
+; SSE4-NEXT:    sarq $63, %r14
+; SSE4-NEXT:    pextrq $1, %xmm6, %rdi
+; SSE4-NEXT:    movq %rdi, %rbx
+; SSE4-NEXT:    sarq $63, %rbx
+; SSE4-NEXT:    movq %xmm6, %rdx
+; SSE4-NEXT:    movq %rdx, %r12
+; SSE4-NEXT:    sarq $63, %r12
+; SSE4-NEXT:    pextrq $1, %xmm7, %r15
+; SSE4-NEXT:    movq %r15, %r9
+; SSE4-NEXT:    sarq $63, %r9
+; SSE4-NEXT:    movq %xmm7, %rbp
+; SSE4-NEXT:    movq %rbp, %r8
+; SSE4-NEXT:    sarq $63, %r8
+; SSE4-NEXT:    addq %rbp, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    adcq %rsi, %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    addq %r15, %rcx
+; SSE4-NEXT:    adcq %r13, %r9
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE4-NEXT:    addq %rdx, %rbp
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; SSE4-NEXT:    addq %rdi, %r13
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    addq %rax, %r15
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE4-NEXT:    adcq (%rsp), %r11 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE4-NEXT:    addq $1, %rdx
+; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    adcq $0, %rax
+; SSE4-NEXT:    movq %rax, %rdx
+; SSE4-NEXT:    addq $1, %rsi
+; SSE4-NEXT:    adcq $0, %r11
+; SSE4-NEXT:    addq $1, %rdi
+; SSE4-NEXT:    adcq $0, %r10
+; SSE4-NEXT:    addq $1, %r15
+; SSE4-NEXT:    adcq $0, %r14
+; SSE4-NEXT:    addq $1, %r13
+; SSE4-NEXT:    adcq $0, %rbx
+; SSE4-NEXT:    addq $1, %rbp
+; SSE4-NEXT:    adcq $0, %r12
+; SSE4-NEXT:    addq $1, %rcx
+; SSE4-NEXT:    adcq $0, %r9
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    addq $1, %rax
+; SSE4-NEXT:    adcq $0, %r8
+; SSE4-NEXT:    shldq $63, %rax, %r8
+; SSE4-NEXT:    shldq $63, %rcx, %r9
+; SSE4-NEXT:    shldq $63, %rbp, %r12
+; SSE4-NEXT:    shldq $63, %r13, %rbx
+; SSE4-NEXT:    shldq $63, %r15, %r14
+; SSE4-NEXT:    shldq $63, %rdi, %r10
+; SSE4-NEXT:    shldq $63, %rsi, %r11
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rcx, %rdx
+; SSE4-NEXT:    movq %rdx, %xmm4
+; SSE4-NEXT:    movq %r11, %xmm0
+; SSE4-NEXT:    movq %r10, %xmm5
+; SSE4-NEXT:    movq %r14, %xmm1
+; SSE4-NEXT:    movq %rbx, %xmm6
+; SSE4-NEXT:    movq %r12, %xmm2
+; SSE4-NEXT:    movq %r9, %xmm7
+; SSE4-NEXT:    movq %r8, %xmm3
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE4-NEXT:    addq $16, %rsp
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    pushq %rax
+; AVX1-NEXT:    .cfi_def_cfa_offset 64
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rax
+; AVX1-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm4, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX1-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vmovq %xmm1, %r8
+; AVX1-NEXT:    movq %r8, %rbp
+; AVX1-NEXT:    sarq $63, %rbp
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX1-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vmovq %xmm0, %r10
+; AVX1-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX1-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r11
+; AVX1-NEXT:    vmovq %xmm2, %r15
+; AVX1-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r15
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX1-NEXT:    movq %rdi, %r14
+; AVX1-NEXT:    sarq $63, %r14
+; AVX1-NEXT:    vmovq %xmm0, %rsi
+; AVX1-NEXT:    movq %rsi, %r12
+; AVX1-NEXT:    sarq $63, %r12
+; AVX1-NEXT:    vpextrq $1, %xmm3, %r13
+; AVX1-NEXT:    movq %r13, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    addq %rax, %r8
+; AVX1-NEXT:    adcq %rbp, %rcx
+; AVX1-NEXT:    addq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    adcq %rbx, %rdx
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT:    addq %rsi, %rbp
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX1-NEXT:    addq %rdi, %r13
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX1-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX1-NEXT:    addq $1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    adcq $0, %r9
+; AVX1-NEXT:    addq $1, %rsi
+; AVX1-NEXT:    adcq $0, %r10
+; AVX1-NEXT:    addq $1, %rdi
+; AVX1-NEXT:    adcq $0, %r11
+; AVX1-NEXT:    addq $1, %rbx
+; AVX1-NEXT:    adcq $0, %r15
+; AVX1-NEXT:    addq $1, %r13
+; AVX1-NEXT:    adcq $0, %r14
+; AVX1-NEXT:    addq $1, %rbp
+; AVX1-NEXT:    adcq $0, %r12
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    addq $1, %rax
+; AVX1-NEXT:    adcq $0, %rdx
+; AVX1-NEXT:    addq $1, %r8
+; AVX1-NEXT:    adcq $0, %rcx
+; AVX1-NEXT:    shldq $63, %r8, %rcx
+; AVX1-NEXT:    shldq $63, %rax, %rdx
+; AVX1-NEXT:    shldq $63, %rbp, %r12
+; AVX1-NEXT:    shldq $63, %r13, %r14
+; AVX1-NEXT:    shldq $63, %rbx, %r15
+; AVX1-NEXT:    shldq $63, %rdi, %r11
+; AVX1-NEXT:    shldq $63, %rsi, %r10
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %r9
+; AVX1-NEXT:    vmovq %r9, %xmm0
+; AVX1-NEXT:    vmovq %r10, %xmm1
+; AVX1-NEXT:    vmovq %r11, %xmm2
+; AVX1-NEXT:    vmovq %r15, %xmm3
+; AVX1-NEXT:    vmovq %r14, %xmm4
+; AVX1-NEXT:    vmovq %r12, %xmm5
+; AVX1-NEXT:    vmovq %rdx, %xmm6
+; AVX1-NEXT:    vmovq %rcx, %xmm7
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    addq $8, %rsp
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    .cfi_def_cfa_offset 64
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rax
+; AVX2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm4, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vmovq %xmm1, %r8
+; AVX2-NEXT:    movq %r8, %rbp
+; AVX2-NEXT:    sarq $63, %rbp
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vmovq %xmm0, %r10
+; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r11
+; AVX2-NEXT:    vmovq %xmm2, %r15
+; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r15
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    movq %rdi, %r14
+; AVX2-NEXT:    sarq $63, %r14
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    movq %rsi, %r12
+; AVX2-NEXT:    sarq $63, %r12
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r13
+; AVX2-NEXT:    movq %r13, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vmovq %xmm3, %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    addq %rax, %r8
+; AVX2-NEXT:    adcq %rbp, %rcx
+; AVX2-NEXT:    addq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    adcq %rbx, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX2-NEXT:    addq %rsi, %rbp
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX2-NEXT:    addq %rdi, %r13
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX2-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX2-NEXT:    addq $1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    adcq $0, %r9
+; AVX2-NEXT:    addq $1, %rsi
+; AVX2-NEXT:    adcq $0, %r10
+; AVX2-NEXT:    addq $1, %rdi
+; AVX2-NEXT:    adcq $0, %r11
+; AVX2-NEXT:    addq $1, %rbx
+; AVX2-NEXT:    adcq $0, %r15
+; AVX2-NEXT:    addq $1, %r13
+; AVX2-NEXT:    adcq $0, %r14
+; AVX2-NEXT:    addq $1, %rbp
+; AVX2-NEXT:    adcq $0, %r12
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    addq $1, %rax
+; AVX2-NEXT:    adcq $0, %rdx
+; AVX2-NEXT:    addq $1, %r8
+; AVX2-NEXT:    adcq $0, %rcx
+; AVX2-NEXT:    shldq $63, %r8, %rcx
+; AVX2-NEXT:    shldq $63, %rax, %rdx
+; AVX2-NEXT:    shldq $63, %rbp, %r12
+; AVX2-NEXT:    shldq $63, %r13, %r14
+; AVX2-NEXT:    shldq $63, %rbx, %r15
+; AVX2-NEXT:    shldq $63, %rdi, %r11
+; AVX2-NEXT:    shldq $63, %rsi, %r10
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rax, %r9
+; AVX2-NEXT:    vmovq %r9, %xmm0
+; AVX2-NEXT:    vmovq %r10, %xmm1
+; AVX2-NEXT:    vmovq %r11, %xmm2
+; AVX2-NEXT:    vmovq %r15, %xmm3
+; AVX2-NEXT:    vmovq %r14, %xmm4
+; AVX2-NEXT:    vmovq %r12, %xmm5
+; AVX2-NEXT:    vmovq %rdx, %xmm6
+; AVX2-NEXT:    vmovq %rcx, %xmm7
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    pushq %rax
+; AVX512-NEXT:    .cfi_def_cfa_offset 64
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %rax
+; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX512-NEXT:    vmovq %xmm3, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rbx
+; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %rbx
+; AVX512-NEXT:    vmovq %xmm0, %r8
+; AVX512-NEXT:    movq %r8, %r13
+; AVX512-NEXT:    sarq $63, %r13
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r9
+; AVX512-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r9
+; AVX512-NEXT:    vmovq %xmm2, %r10
+; AVX512-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r10
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r11
+; AVX512-NEXT:    vmovq %xmm0, %r14
+; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r14
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512-NEXT:    movq %rdi, %r15
+; AVX512-NEXT:    sarq $63, %r15
+; AVX512-NEXT:    vmovq %xmm0, %rsi
+; AVX512-NEXT:    movq %rsi, %r12
+; AVX512-NEXT:    sarq $63, %r12
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rbp
+; AVX512-NEXT:    movq %rbp, %rdx
+; AVX512-NEXT:    sarq $63, %rdx
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    addq %rax, %r8
+; AVX512-NEXT:    adcq %r13, %rcx
+; AVX512-NEXT:    addq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    adcq %rbx, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT:    addq %rsi, %rbp
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT:    addq %rdi, %r13
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX512-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX512-NEXT:    addq $1, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    adcq $0, %r9
+; AVX512-NEXT:    addq $1, %rsi
+; AVX512-NEXT:    adcq $0, %r10
+; AVX512-NEXT:    addq $1, %rdi
+; AVX512-NEXT:    adcq $0, %r11
+; AVX512-NEXT:    addq $1, %rbx
+; AVX512-NEXT:    adcq $0, %r14
+; AVX512-NEXT:    addq $1, %r13
+; AVX512-NEXT:    adcq $0, %r15
+; AVX512-NEXT:    addq $1, %rbp
+; AVX512-NEXT:    adcq $0, %r12
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    addq $1, %rax
+; AVX512-NEXT:    adcq $0, %rdx
+; AVX512-NEXT:    addq $1, %r8
+; AVX512-NEXT:    adcq $0, %rcx
+; AVX512-NEXT:    shldq $63, %r8, %rcx
+; AVX512-NEXT:    shldq $63, %rax, %rdx
+; AVX512-NEXT:    shldq $63, %rbp, %r12
+; AVX512-NEXT:    shldq $63, %r13, %r15
+; AVX512-NEXT:    shldq $63, %rbx, %r14
+; AVX512-NEXT:    shldq $63, %rdi, %r11
+; AVX512-NEXT:    shldq $63, %rsi, %r10
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rax, %r9
+; AVX512-NEXT:    vmovq %r9, %xmm0
+; AVX512-NEXT:    vmovq %r10, %xmm1
+; AVX512-NEXT:    vmovq %r11, %xmm2
+; AVX512-NEXT:    vmovq %r14, %xmm3
+; AVX512-NEXT:    vmovq %r15, %xmm4
+; AVX512-NEXT:    vmovq %r12, %xmm5
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vmovq %rcx, %xmm7
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    addq $8, %rsp
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i64> %a0 to <8 x i128>
+  %x1 = sext <8 x i64> %a1 to <8 x i128>
+  %sum = add <8 x i128> %x0, %x1
+  %inc = add <8 x i128> %sum, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %shift = ashr <8 x i128> %inc, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <8 x i128> %shift to <8 x i64>
+  ret <8 x i64> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/avgceilu.ll b/llvm/test/CodeGen/X86/avgceilu.ll
new file mode 100644
index 0000000000000..3a74fca237737
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgceilu.ll
@@ -0,0 +1,2219 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+;
+; 128-bit vectors
+;
+
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_fixed_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %or = or <16 x i8> %a0, %a1
+  %xor = xor <16 x i8> %a0, %a1
+  %shift = lshr <16 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <16 x i8> %or, %shift
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_ext_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x0 = zext <16 x i8> %a0 to <16 x i16>
+  %x1 = zext <16 x i8> %a1 to <16 x i16>
+  %sum = add <16 x i16> %x0, %x1
+  %inc = add <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = lshr <16 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <16 x i16> %shift to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_fixed_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %or = or <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a1, %a0
+  %shift = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <8 x i16> %or, %shift
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_ext_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x0 = zext <8 x i16> %a0 to <8 x i32>
+  %x1 = zext <8 x i16> %a1 to <8 x i32>
+  %sum = add <8 x i32> %x0, %x1
+  %inc = add <8 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = lshr <8 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <8 x i32> %shift to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_fixed_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    psrld $1, %xmm1
+; SSE-NEXT:    psubd %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %or = or <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a1, %a0
+  %shift = lshr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = sub <4 x i32> %or, %shift
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE2-LABEL: test_ext_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    psubq %xmm0, %xmm2
+; SSE2-NEXT:    psubq %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm3, %xmm3
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE4-NEXT:    paddq %xmm0, %xmm1
+; SSE4-NEXT:    paddq %xmm4, %xmm2
+; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE4-NEXT:    psubq %xmm0, %xmm1
+; SSE4-NEXT:    psubq %xmm0, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE4-NEXT:    movaps %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = zext <4 x i32> %a0 to <4 x i64>
+  %x1 = zext <4 x i32> %a1 to <4 x i64>
+  %sum = add <4 x i64> %x0, %x1
+  %inc = add <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1>
+  %shift = lshr <4 x i64> %inc, <i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <4 x i64> %shift to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE-LABEL: test_fixed_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    psrlq $1, %xmm1
+; SSE-NEXT:    psubq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %or = or <2 x i64> %a0, %a1
+  %xor = xor <2 x i64> %a1, %a0
+  %shift = lshr <2 x i64> %xor, <i64 1, i64 1>
+  %res = sub <2 x i64> %or, %shift
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_ext_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rcx
+; SSE2-NEXT:    movb $1, %dl
+; SSE2-NEXT:    movb $1, %sil
+; SSE2-NEXT:    addb $-1, %sil
+; SSE2-NEXT:    leaq 1(%rax,%rcx), %rsi
+; SSE2-NEXT:    adcq %rcx, %rax
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    addb $-1, %dl
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %xmm1, %rdx
+; SSE2-NEXT:    leaq 1(%rcx,%rdx), %rdi
+; SSE2-NEXT:    adcq %rdx, %rcx
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    movzbl %cl, %ecx
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    shrdq $1, %rcx, %rdi
+; SSE2-NEXT:    shrdq $1, %rax, %rsi
+; SSE2-NEXT:    movq %rdi, %xmm0
+; SSE2-NEXT:    movq %rsi, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm0, %rax
+; SSE4-NEXT:    movq %xmm1, %rcx
+; SSE4-NEXT:    movb $1, %dl
+; SSE4-NEXT:    movb $1, %sil
+; SSE4-NEXT:    addb $-1, %sil
+; SSE4-NEXT:    leaq 1(%rax,%rcx), %rsi
+; SSE4-NEXT:    adcq %rcx, %rax
+; SSE4-NEXT:    setb %al
+; SSE4-NEXT:    addb $-1, %dl
+; SSE4-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE4-NEXT:    pextrq $1, %xmm1, %rdx
+; SSE4-NEXT:    leaq 1(%rcx,%rdx), %rdi
+; SSE4-NEXT:    adcq %rdx, %rcx
+; SSE4-NEXT:    setb %cl
+; SSE4-NEXT:    movzbl %cl, %ecx
+; SSE4-NEXT:    movzbl %al, %eax
+; SSE4-NEXT:    shrdq $1, %rcx, %rdi
+; SSE4-NEXT:    shrdq $1, %rax, %rsi
+; SSE4-NEXT:    movq %rdi, %xmm1
+; SSE4-NEXT:    movq %rsi, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovq %xmm1, %rcx
+; AVX-NEXT:    movb $1, %dl
+; AVX-NEXT:    movb $1, %sil
+; AVX-NEXT:    addb $-1, %sil
+; AVX-NEXT:    leaq 1(%rax,%rcx), %rsi
+; AVX-NEXT:    adcq %rcx, %rax
+; AVX-NEXT:    setb %al
+; AVX-NEXT:    addb $-1, %dl
+; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX-NEXT:    leaq 1(%rcx,%rdx), %rdi
+; AVX-NEXT:    adcq %rdx, %rcx
+; AVX-NEXT:    setb %cl
+; AVX-NEXT:    movzbl %cl, %ecx
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    shrdq $1, %rcx, %rdi
+; AVX-NEXT:    shrdq $1, %rax, %rsi
+; AVX-NEXT:    vmovq %rdi, %xmm0
+; AVX-NEXT:    vmovq %rsi, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %x0 = zext <2 x i64> %a0 to <2 x i128>
+  %x1 = zext <2 x i64> %a1 to <2 x i128>
+  %sum = add <2 x i128> %x0, %x1
+  %inc = add <2 x i128> %sum, <i128 1, i128 1>
+  %shift = lshr <2 x i128> %inc, <i128 1, i128 1>
+  %res = trunc <2 x i128> %shift to <2 x i64>
+  ret <2 x i64> %res
+}
+
+;
+; 256-bit vectors
+;
+
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_fixed_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm2, %xmm0
+; SSE-NEXT:    pavgb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <32 x i8> %a0, %a1
+  %xor = xor <32 x i8> %a0, %a1
+  %shift = lshr <32 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <32 x i8> %or, %shift
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_ext_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm2, %xmm0
+; SSE-NEXT:    pavgb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <32 x i8> %a0 to <32 x i16>
+  %x1 = zext <32 x i8> %a1 to <32 x i16>
+  %sum = add <32 x i16> %x0, %x1
+  %inc = add <32 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = lshr <32 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <32 x i16> %shift to <32 x i8>
+  ret <32 x i8> %res
+}
+
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_fixed_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm2, %xmm0
+; SSE-NEXT:    pavgw %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i16> %a0, %a1
+  %xor = xor <16 x i16> %a1, %a0
+  %shift = lshr <16 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <16 x i16> %or, %shift
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_ext_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm2, %xmm0
+; SSE-NEXT:    pavgw %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpavgw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i16> %a0 to <16 x i32>
+  %x1 = zext <16 x i16> %a1 to <16 x i32>
+  %sum = add <16 x i32> %x0, %x1
+  %inc = add <16 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = lshr <16 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <16 x i32> %shift to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_fixed_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    psrld $1, %xmm3
+; SSE-NEXT:    psubd %xmm3, %xmm4
+; SSE-NEXT:    psrld $1, %xmm2
+; SSE-NEXT:    psubd %xmm2, %xmm5
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <8 x i32> %a0, %a1
+  %xor = xor <8 x i32> %a1, %a0
+  %shift = lshr <8 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = sub <8 x i32> %or, %shift
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE2-LABEL: test_ext_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm7
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm0
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT:    paddq %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm4
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE2-NEXT:    paddq %xmm1, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    psubq %xmm1, %xmm2
+; SSE2-NEXT:    psubq %xmm1, %xmm4
+; SSE2-NEXT:    psubq %xmm1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE2-NEXT:    movaps %xmm4, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm0, %xmm4
+; SSE4-NEXT:    pxor %xmm5, %xmm5
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE4-NEXT:    paddq %xmm4, %xmm2
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE4-NEXT:    paddq %xmm1, %xmm3
+; SSE4-NEXT:    paddq %xmm6, %xmm0
+; SSE4-NEXT:    paddq %xmm7, %xmm4
+; SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT:    psubq %xmm1, %xmm2
+; SSE4-NEXT:    psubq %xmm1, %xmm3
+; SSE4-NEXT:    psubq %xmm1, %xmm0
+; SSE4-NEXT:    psubq %xmm1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE4-NEXT:    movaps %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpaddq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i32> %a0 to <8 x i64>
+  %x1 = zext <8 x i32> %a1 to <8 x i64>
+  %sum = add <8 x i64> %x0, %x1
+  %inc = add <8 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shift = lshr <8 x i64> %inc, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <8 x i64> %shift to <8 x i32>
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE-LABEL: test_fixed_v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    psrlq $1, %xmm3
+; SSE-NEXT:    psubq %xmm3, %xmm4
+; SSE-NEXT:    psrlq $1, %xmm2
+; SSE-NEXT:    psubq %xmm2, %xmm5
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <4 x i64> %a0, %a1
+  %xor = xor <4 x i64> %a1, %a0
+  %shift = lshr <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
+  %res = sub <4 x i64> %or, %shift
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_ext_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm4, %rcx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm4, %rdx
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    movb $1, %sil
+; SSE2-NEXT:    addb $-1, %sil
+; SSE2-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; SSE2-NEXT:    adcq %rdx, %rcx
+; SSE2-NEXT:    setb %dl
+; SSE2-NEXT:    movb $1, %cl
+; SSE2-NEXT:    addb $-1, %cl
+; SSE2-NEXT:    movq %xmm1, %rdi
+; SSE2-NEXT:    movq %xmm3, %r8
+; SSE2-NEXT:    leaq 1(%rdi,%r8), %rcx
+; SSE2-NEXT:    adcq %r8, %rdi
+; SSE2-NEXT:    setb %dil
+; SSE2-NEXT:    movb $1, %r8b
+; SSE2-NEXT:    addb $-1, %r8b
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    leaq 1(%r8,%r9), %r10
+; SSE2-NEXT:    adcq %r9, %r8
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %xmm2, %r9
+; SSE2-NEXT:    leaq 1(%rax,%r9), %r11
+; SSE2-NEXT:    adcq %r9, %rax
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    movzbl %r8b, %r8d
+; SSE2-NEXT:    movzbl %dil, %edi
+; SSE2-NEXT:    movzbl %dl, %edx
+; SSE2-NEXT:    shrdq $1, %rax, %r11
+; SSE2-NEXT:    shrdq $1, %r8, %r10
+; SSE2-NEXT:    shrdq $1, %rdi, %rcx
+; SSE2-NEXT:    shrdq $1, %rdx, %rsi
+; SSE2-NEXT:    movq %r11, %xmm0
+; SSE2-NEXT:    movq %r10, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movq %rcx, %xmm1
+; SSE2-NEXT:    movq %rsi, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm1, %rcx
+; SSE4-NEXT:    movq %xmm3, %rdx
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    movb $1, %sil
+; SSE4-NEXT:    addb $-1, %sil
+; SSE4-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; SSE4-NEXT:    adcq %rdx, %rcx
+; SSE4-NEXT:    setb %dl
+; SSE4-NEXT:    movb $1, %cl
+; SSE4-NEXT:    addb $-1, %cl
+; SSE4-NEXT:    pextrq $1, %xmm1, %rdi
+; SSE4-NEXT:    pextrq $1, %xmm3, %r8
+; SSE4-NEXT:    leaq 1(%rdi,%r8), %rcx
+; SSE4-NEXT:    adcq %r8, %rdi
+; SSE4-NEXT:    setb %dil
+; SSE4-NEXT:    movb $1, %r8b
+; SSE4-NEXT:    addb $-1, %r8b
+; SSE4-NEXT:    movq %xmm0, %r8
+; SSE4-NEXT:    movq %xmm2, %r9
+; SSE4-NEXT:    leaq 1(%r8,%r9), %r10
+; SSE4-NEXT:    adcq %r9, %r8
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    pextrq $1, %xmm0, %rax
+; SSE4-NEXT:    pextrq $1, %xmm2, %r9
+; SSE4-NEXT:    leaq 1(%rax,%r9), %r11
+; SSE4-NEXT:    adcq %r9, %rax
+; SSE4-NEXT:    setb %al
+; SSE4-NEXT:    movzbl %al, %eax
+; SSE4-NEXT:    movzbl %r8b, %r8d
+; SSE4-NEXT:    movzbl %dil, %edi
+; SSE4-NEXT:    movzbl %dl, %edx
+; SSE4-NEXT:    shrdq $1, %rax, %r11
+; SSE4-NEXT:    shrdq $1, %r8, %r10
+; SSE4-NEXT:    shrdq $1, %rdi, %rcx
+; SSE4-NEXT:    shrdq $1, %rdx, %rsi
+; SSE4-NEXT:    movq %r11, %xmm1
+; SSE4-NEXT:    movq %r10, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    movq %rcx, %xmm2
+; SSE4-NEXT:    movq %rsi, %xmm1
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rcx
+; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    movb $1, %sil
+; AVX1-NEXT:    addb $-1, %sil
+; AVX1-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; AVX1-NEXT:    adcq %rdx, %rcx
+; AVX1-NEXT:    setb %dl
+; AVX1-NEXT:    movb $1, %cl
+; AVX1-NEXT:    addb $-1, %cl
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX1-NEXT:    leaq 1(%rdi,%r8), %rcx
+; AVX1-NEXT:    adcq %r8, %rdi
+; AVX1-NEXT:    setb %dil
+; AVX1-NEXT:    movb $1, %r8b
+; AVX1-NEXT:    addb $-1, %r8b
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r8
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %r9
+; AVX1-NEXT:    leaq 1(%r8,%r9), %r10
+; AVX1-NEXT:    adcq %r9, %r8
+; AVX1-NEXT:    setb %r8b
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX1-NEXT:    leaq 1(%rax,%r9), %r11
+; AVX1-NEXT:    adcq %r9, %rax
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    movzbl %al, %eax
+; AVX1-NEXT:    movzbl %r8b, %r8d
+; AVX1-NEXT:    movzbl %dil, %edi
+; AVX1-NEXT:    movzbl %dl, %edx
+; AVX1-NEXT:    shrdq $1, %rax, %r11
+; AVX1-NEXT:    shrdq $1, %r8, %r10
+; AVX1-NEXT:    shrdq $1, %rdi, %rcx
+; AVX1-NEXT:    shrdq $1, %rdx, %rsi
+; AVX1-NEXT:    vmovq %r11, %xmm0
+; AVX1-NEXT:    vmovq %r10, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vmovq %rsi, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    movb $1, %sil
+; AVX2-NEXT:    addb $-1, %sil
+; AVX2-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; AVX2-NEXT:    adcq %rdx, %rcx
+; AVX2-NEXT:    setb %dl
+; AVX2-NEXT:    movb $1, %cl
+; AVX2-NEXT:    addb $-1, %cl
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX2-NEXT:    leaq 1(%rdi,%r8), %rcx
+; AVX2-NEXT:    adcq %r8, %rdi
+; AVX2-NEXT:    setb %dil
+; AVX2-NEXT:    movb $1, %r8b
+; AVX2-NEXT:    addb $-1, %r8b
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r8
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %r9
+; AVX2-NEXT:    leaq 1(%r8,%r9), %r10
+; AVX2-NEXT:    adcq %r9, %r8
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX2-NEXT:    leaq 1(%rax,%r9), %r11
+; AVX2-NEXT:    adcq %r9, %rax
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    movzbl %r8b, %r8d
+; AVX2-NEXT:    movzbl %dil, %edi
+; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    shrdq $1, %rax, %r11
+; AVX2-NEXT:    shrdq $1, %r8, %r10
+; AVX2-NEXT:    shrdq $1, %rdi, %rcx
+; AVX2-NEXT:    shrdq $1, %rdx, %rsi
+; AVX2-NEXT:    vmovq %r11, %xmm0
+; AVX2-NEXT:    vmovq %r10, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vmovq %rcx, %xmm1
+; AVX2-NEXT:    vmovq %rsi, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    vmovq %xmm1, %rdx
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    movb $1, %sil
+; AVX512-NEXT:    addb $-1, %sil
+; AVX512-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; AVX512-NEXT:    adcq %rdx, %rcx
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    movb $1, %cl
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512-NEXT:    addb $-1, %cl
+; AVX512-NEXT:    leaq 1(%rdi,%r8), %rcx
+; AVX512-NEXT:    adcq %r8, %rdi
+; AVX512-NEXT:    setb %dil
+; AVX512-NEXT:    movb $1, %r8b
+; AVX512-NEXT:    addb $-1, %r8b
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r8
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %r9
+; AVX512-NEXT:    leaq 1(%r8,%r9), %r10
+; AVX512-NEXT:    adcq %r9, %r8
+; AVX512-NEXT:    setb %r8b
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX512-NEXT:    leaq 1(%rax,%r9), %r11
+; AVX512-NEXT:    adcq %r9, %rax
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    movzbl %al, %eax
+; AVX512-NEXT:    movzbl %r8b, %r8d
+; AVX512-NEXT:    movzbl %dil, %edi
+; AVX512-NEXT:    movzbl %dl, %edx
+; AVX512-NEXT:    shrdq $1, %rax, %r11
+; AVX512-NEXT:    shrdq $1, %r8, %r10
+; AVX512-NEXT:    shrdq $1, %rdi, %rcx
+; AVX512-NEXT:    shrdq $1, %rdx, %rsi
+; AVX512-NEXT:    vmovq %r11, %xmm0
+; AVX512-NEXT:    vmovq %r10, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    vmovq %rsi, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <4 x i64> %a0 to <4 x i128>
+  %x1 = zext <4 x i64> %a1 to <4 x i128>
+  %sum = add <4 x i128> %x0, %x1
+  %inc = add <4 x i128> %sum, <i128 1, i128 1, i128 1, i128 1>
+  %shift = lshr <4 x i128> %inc, <i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <4 x i128> %shift to <4 x i64>
+  ret <4 x i64> %res
+}
+
+;
+; 512-bit vectors
+;
+
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_fixed_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm4, %xmm0
+; SSE-NEXT:    pavgb %xmm5, %xmm1
+; SSE-NEXT:    pavgb %xmm6, %xmm2
+; SSE-NEXT:    pavgb %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubb %xmm6, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <64 x i8> %a0, %a1
+  %xor = xor <64 x i8> %a0, %a1
+  %shift = lshr <64 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <64 x i8> %or, %shift
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_ext_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm4, %xmm0
+; SSE-NEXT:    pavgb %xmm5, %xmm1
+; SSE-NEXT:    pavgb %xmm6, %xmm2
+; SSE-NEXT:    pavgb %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpavgb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <64 x i8> %a0 to <64 x i16>
+  %x1 = zext <64 x i8> %a1 to <64 x i16>
+  %sum = add <64 x i16> %x0, %x1
+  %inc = add <64 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = lshr <64 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <64 x i16> %shift to <64 x i8>
+  ret <64 x i8> %res
+}
+
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_fixed_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm4, %xmm0
+; SSE-NEXT:    pavgw %xmm5, %xmm1
+; SSE-NEXT:    pavgw %xmm6, %xmm2
+; SSE-NEXT:    pavgw %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubw %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <32 x i16> %a0, %a1
+  %xor = xor <32 x i16> %a1, %a0
+  %shift = lshr <32 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <32 x i16> %or, %shift
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_ext_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm4, %xmm0
+; SSE-NEXT:    pavgw %xmm5, %xmm1
+; SSE-NEXT:    pavgw %xmm6, %xmm2
+; SSE-NEXT:    pavgw %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpavgw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpavgw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpavgw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpavgw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <32 x i16> %a0 to <32 x i32>
+  %x1 = zext <32 x i16> %a1 to <32 x i32>
+  %sum = add <32 x i32> %x0, %x1
+  %inc = add <32 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = lshr <32 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <32 x i32> %shift to <32 x i16>
+  ret <32 x i16> %res
+}
+
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE-LABEL: test_fixed_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    por %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    por %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    por %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm8, %xmm7
+; SSE-NEXT:    psrld $1, %xmm7
+; SSE-NEXT:    psubd %xmm7, %xmm3
+; SSE-NEXT:    psrld $1, %xmm6
+; SSE-NEXT:    psubd %xmm6, %xmm9
+; SSE-NEXT:    psrld $1, %xmm5
+; SSE-NEXT:    psubd %xmm5, %xmm10
+; SSE-NEXT:    psrld $1, %xmm4
+; SSE-NEXT:    psubd %xmm4, %xmm11
+; SSE-NEXT:    movdqa %xmm11, %xmm0
+; SSE-NEXT:    movdqa %xmm10, %xmm1
+; SSE-NEXT:    movdqa %xmm9, %xmm2
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i32> %a0, %a1
+  %xor = xor <16 x i32> %a1, %a0
+  %shift = lshr <16 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = sub <16 x i32> %or, %shift
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE2-LABEL: test_ext_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm2, %xmm8
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    movdqa %xmm0, %xmm10
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm11
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; SSE2-NEXT:    movdqa %xmm8, %xmm12
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm13
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm10, %xmm0
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; SSE2-NEXT:    paddq %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm11, %xmm1
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
+; SSE2-NEXT:    paddq %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm6, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm12, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; SSE2-NEXT:    paddq %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm7, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm13, %xmm8
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; SSE2-NEXT:    paddq %xmm3, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    psubq %xmm3, %xmm0
+; SSE2-NEXT:    psubq %xmm3, %xmm4
+; SSE2-NEXT:    psubq %xmm3, %xmm1
+; SSE2-NEXT:    psubq %xmm3, %xmm5
+; SSE2-NEXT:    psubq %xmm3, %xmm2
+; SSE2-NEXT:    psubq %xmm3, %xmm6
+; SSE2-NEXT:    psubq %xmm3, %xmm8
+; SSE2-NEXT:    psubq %xmm3, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm8
+; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm7[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT:    movaps %xmm8, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm8
+; SSE4-NEXT:    movdqa %xmm2, %xmm3
+; SSE4-NEXT:    movdqa %xmm1, %xmm2
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    pxor %xmm10, %xmm10
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm11 = xmm2[0],zero,xmm2[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm13 = xmm8[0],zero,xmm8[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
+; SSE4-NEXT:    paddq %xmm1, %xmm4
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; SSE4-NEXT:    paddq %xmm2, %xmm5
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3]
+; SSE4-NEXT:    paddq %xmm3, %xmm6
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3]
+; SSE4-NEXT:    paddq %xmm8, %xmm7
+; SSE4-NEXT:    paddq %xmm9, %xmm0
+; SSE4-NEXT:    paddq %xmm11, %xmm1
+; SSE4-NEXT:    paddq %xmm12, %xmm2
+; SSE4-NEXT:    paddq %xmm13, %xmm3
+; SSE4-NEXT:    pcmpeqd %xmm8, %xmm8
+; SSE4-NEXT:    psubq %xmm8, %xmm4
+; SSE4-NEXT:    psubq %xmm8, %xmm5
+; SSE4-NEXT:    psubq %xmm8, %xmm6
+; SSE4-NEXT:    psubq %xmm8, %xmm7
+; SSE4-NEXT:    psubq %xmm8, %xmm0
+; SSE4-NEXT:    psubq %xmm8, %xmm1
+; SSE4-NEXT:    psubq %xmm8, %xmm2
+; SSE4-NEXT:    psubq %xmm8, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm7
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX1-NEXT:    vpaddq %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm11
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; AVX1-NEXT:    vpaddq %xmm7, %xmm12, %xmm7
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX1-NEXT:    vpaddq %xmm12, %xmm8, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm12
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX1-NEXT:    vpaddq %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm11[0],zero,xmm11[1],zero
+; AVX1-NEXT:    vpaddq %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero
+; AVX1-NEXT:    vpaddq %xmm3, %xmm9, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubq %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubq %xmm6, %xmm8, %xmm8
+; AVX1-NEXT:    vpsubq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $1, %xmm8, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm7, %xmm7
+; AVX1-NEXT:    vpsrlq $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vpaddq %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpaddq %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubq %ymm2, %ymm4, %ymm3
+; AVX2-NEXT:    vpsubq %ymm2, %ymm5, %ymm4
+; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm1, %zmm2, %zmm2
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i32> %a0 to <16 x i64>
+  %x1 = zext <16 x i32> %a1 to <16 x i64>
+  %sum = add <16 x i64> %x0, %x1
+  %inc = add <16 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shift = lshr <16 x i64> %inc, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <16 x i64> %shift to <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE-LABEL: test_fixed_v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    por %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    por %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    por %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm8, %xmm7
+; SSE-NEXT:    psrlq $1, %xmm7
+; SSE-NEXT:    psubq %xmm7, %xmm3
+; SSE-NEXT:    psrlq $1, %xmm6
+; SSE-NEXT:    psubq %xmm6, %xmm9
+; SSE-NEXT:    psrlq $1, %xmm5
+; SSE-NEXT:    psubq %xmm5, %xmm10
+; SSE-NEXT:    psrlq $1, %xmm4
+; SSE-NEXT:    psubq %xmm4, %xmm11
+; SSE-NEXT:    movdqa %xmm11, %xmm0
+; SSE-NEXT:    movdqa %xmm10, %xmm1
+; SSE-NEXT:    movdqa %xmm9, %xmm2
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <8 x i64> %a0, %a1
+  %xor = xor <8 x i64> %a1, %a0
+  %shift = lshr <8 x i64> %xor, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = sub <8 x i64> %or, %shift
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_ext_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm8, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
+; SSE2-NEXT:    movq %xmm8, %rdx
+; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    adcq %rdx, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %xmm3, %r12
+; SSE2-NEXT:    movq %xmm7, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %r12, %rax
+; SSE2-NEXT:    adcq %rcx, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %r11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %rbx
+; SSE2-NEXT:    movq %r11, %rax
+; SSE2-NEXT:    adcq %rbx, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %xmm2, %r14
+; SSE2-NEXT:    movq %xmm6, %r15
+; SSE2-NEXT:    movq %r14, %rax
+; SSE2-NEXT:    adcq %r15, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %r13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %r10
+; SSE2-NEXT:    movq %r13, %rax
+; SSE2-NEXT:    adcq %r10, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    movq %xmm5, %r8
+; SSE2-NEXT:    movq %r9, %rax
+; SSE2-NEXT:    adcq %r8, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rdi
+; SSE2-NEXT:    movq %xmm2, %rsi
+; SSE2-NEXT:    movq %rdi, %rdx
+; SSE2-NEXT:    adcq %rsi, %rdx
+; SSE2-NEXT:    movb $1, %dl
+; SSE2-NEXT:    setb %bpl
+; SSE2-NEXT:    addb $-1, %dl
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %xmm4, %rax
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    adcq %rax, %rdx
+; SSE2-NEXT:    leaq 1(%rcx,%rax), %rdx
+; SSE2-NEXT:    leaq 1(%rdi,%rsi), %rax
+; SSE2-NEXT:    leaq 1(%r9,%r8), %rcx
+; SSE2-NEXT:    leaq 1(%r13,%r10), %rdi
+; SSE2-NEXT:    leaq 1(%r14,%r15), %rsi
+; SSE2-NEXT:    leaq 1(%r11,%rbx), %r11
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    leaq 1(%r12,%r8), %r9
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT:    leaq 1(%r8,%r10), %r10
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    movzbl %r8b, %r8d
+; SSE2-NEXT:    shrdq $1, %r8, %rdx
+; SSE2-NEXT:    movzbl %bpl, %r8d
+; SSE2-NEXT:    shrdq $1, %r8, %rax
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %rcx
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %rdi
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %rsi
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %r11
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %r9
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %r10
+; SSE2-NEXT:    movq %rdx, %xmm0
+; SSE2-NEXT:    movq %rax, %xmm4
+; SSE2-NEXT:    movq %rcx, %xmm1
+; SSE2-NEXT:    movq %rdi, %xmm5
+; SSE2-NEXT:    movq %rsi, %xmm2
+; SSE2-NEXT:    movq %r11, %xmm6
+; SSE2-NEXT:    movq %r9, %xmm3
+; SSE2-NEXT:    movq %r10, %xmm7
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    movq %xmm3, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm7, %rdx
+; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    movq %rcx, %rax
+; SSE4-NEXT:    adcq %rdx, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    pextrq $1, %xmm3, %r12
+; SSE4-NEXT:    pextrq $1, %xmm7, %rbp
+; SSE4-NEXT:    movq %r12, %rax
+; SSE4-NEXT:    adcq %rbp, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    movq %xmm2, %r11
+; SSE4-NEXT:    movq %xmm6, %rbx
+; SSE4-NEXT:    movq %r11, %rax
+; SSE4-NEXT:    adcq %rbx, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    pextrq $1, %xmm2, %r14
+; SSE4-NEXT:    pextrq $1, %xmm6, %r15
+; SSE4-NEXT:    movq %r14, %rax
+; SSE4-NEXT:    adcq %r15, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    movq %xmm1, %r13
+; SSE4-NEXT:    movq %xmm5, %r10
+; SSE4-NEXT:    movq %r13, %rax
+; SSE4-NEXT:    adcq %r10, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    pextrq $1, %xmm1, %r9
+; SSE4-NEXT:    pextrq $1, %xmm5, %r8
+; SSE4-NEXT:    movq %r9, %rax
+; SSE4-NEXT:    adcq %r8, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    movq %xmm0, %rdi
+; SSE4-NEXT:    movq %xmm4, %rsi
+; SSE4-NEXT:    movq %rdi, %rdx
+; SSE4-NEXT:    adcq %rsi, %rdx
+; SSE4-NEXT:    movb $1, %dl
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    addb $-1, %dl
+; SSE4-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE4-NEXT:    pextrq $1, %xmm4, %rax
+; SSE4-NEXT:    movq %rcx, %rdx
+; SSE4-NEXT:    adcq %rax, %rdx
+; SSE4-NEXT:    leaq 1(%rcx,%rax), %rdx
+; SSE4-NEXT:    leaq 1(%rdi,%rsi), %rax
+; SSE4-NEXT:    leaq 1(%r9,%r8), %rcx
+; SSE4-NEXT:    leaq 1(%r13,%r10), %rdi
+; SSE4-NEXT:    leaq 1(%r14,%r15), %rsi
+; SSE4-NEXT:    leaq 1(%r11,%rbx), %r11
+; SSE4-NEXT:    leaq 1(%r12,%rbp), %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    leaq 1(%r9,%r10), %r9
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    movzbl %r10b, %r10d
+; SSE4-NEXT:    shrdq $1, %r10, %rdx
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %rax
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %rcx
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %rdi
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %rsi
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %r11
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %r8
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %r9
+; SSE4-NEXT:    movq %rdx, %xmm4
+; SSE4-NEXT:    movq %rax, %xmm0
+; SSE4-NEXT:    movq %rcx, %xmm5
+; SSE4-NEXT:    movq %rdi, %xmm1
+; SSE4-NEXT:    movq %rsi, %xmm6
+; SSE4-NEXT:    movq %r11, %xmm2
+; SSE4-NEXT:    movq %r8, %xmm7
+; SSE4-NEXT:    movq %r9, %xmm3
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vmovq %xmm1, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm3, %rdx
+; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    movq %rcx, %rax
+; AVX1-NEXT:    adcq %rdx, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r12
+; AVX1-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %r12, %rax
+; AVX1-NEXT:    adcq %rcx, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %r11
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vmovq %xmm3, %rbx
+; AVX1-NEXT:    movq %r11, %rax
+; AVX1-NEXT:    adcq %rbx, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX1-NEXT:    vpextrq $1, %xmm3, %r15
+; AVX1-NEXT:    movq %r14, %rax
+; AVX1-NEXT:    adcq %r15, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vmovq %xmm0, %r13
+; AVX1-NEXT:    vmovq %xmm2, %r10
+; AVX1-NEXT:    movq %r13, %rax
+; AVX1-NEXT:    adcq %r10, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r8
+; AVX1-NEXT:    movq %r9, %rax
+; AVX1-NEXT:    adcq %r8, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT:    vmovq %xmm0, %rdi
+; AVX1-NEXT:    vmovq %xmm1, %rsi
+; AVX1-NEXT:    movq %rdi, %rcx
+; AVX1-NEXT:    adcq %rsi, %rcx
+; AVX1-NEXT:    movb $1, %cl
+; AVX1-NEXT:    setb %bpl
+; AVX1-NEXT:    addb $-1, %cl
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    movq %rdx, %rcx
+; AVX1-NEXT:    adcq %rax, %rcx
+; AVX1-NEXT:    leaq 1(%rdx,%rax), %rcx
+; AVX1-NEXT:    leaq 1(%rdi,%rsi), %rax
+; AVX1-NEXT:    leaq 1(%r9,%r8), %rdx
+; AVX1-NEXT:    leaq 1(%r13,%r10), %rdi
+; AVX1-NEXT:    leaq 1(%r14,%r15), %rsi
+; AVX1-NEXT:    leaq 1(%r11,%rbx), %r11
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX1-NEXT:    leaq 1(%r12,%r8), %r9
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX1-NEXT:    leaq 1(%r8,%r10), %r8
+; AVX1-NEXT:    setb %r10b
+; AVX1-NEXT:    movzbl %r10b, %r10d
+; AVX1-NEXT:    shrdq $1, %r10, %rcx
+; AVX1-NEXT:    movzbl %bpl, %r10d
+; AVX1-NEXT:    shrdq $1, %r10, %rax
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %rdx
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %rdi
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %rsi
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %r11
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %r9
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %r8
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vmovq %rdx, %xmm2
+; AVX1-NEXT:    vmovq %rdi, %xmm3
+; AVX1-NEXT:    vmovq %rsi, %xmm4
+; AVX1-NEXT:    vmovq %r11, %xmm5
+; AVX1-NEXT:    vmovq %r9, %xmm6
+; AVX1-NEXT:    vmovq %r8, %xmm7
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vmovq %xmm1, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm3, %rdx
+; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    adcq %rdx, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r12
+; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %r12, %rax
+; AVX2-NEXT:    adcq %rcx, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %r11
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vmovq %xmm3, %rbx
+; AVX2-NEXT:    movq %r11, %rax
+; AVX2-NEXT:    adcq %rbx, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r15
+; AVX2-NEXT:    movq %r14, %rax
+; AVX2-NEXT:    adcq %r15, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vmovq %xmm0, %r13
+; AVX2-NEXT:    vmovq %xmm2, %r10
+; AVX2-NEXT:    movq %r13, %rax
+; AVX2-NEXT:    adcq %r10, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r8
+; AVX2-NEXT:    movq %r9, %rax
+; AVX2-NEXT:    adcq %r8, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT:    vmovq %xmm0, %rdi
+; AVX2-NEXT:    vmovq %xmm1, %rsi
+; AVX2-NEXT:    movq %rdi, %rcx
+; AVX2-NEXT:    adcq %rsi, %rcx
+; AVX2-NEXT:    movb $1, %cl
+; AVX2-NEXT:    setb %bpl
+; AVX2-NEXT:    addb $-1, %cl
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    movq %rdx, %rcx
+; AVX2-NEXT:    adcq %rax, %rcx
+; AVX2-NEXT:    leaq 1(%rdx,%rax), %rcx
+; AVX2-NEXT:    leaq 1(%rdi,%rsi), %rax
+; AVX2-NEXT:    leaq 1(%r9,%r8), %rdx
+; AVX2-NEXT:    leaq 1(%r13,%r10), %rdi
+; AVX2-NEXT:    leaq 1(%r14,%r15), %rsi
+; AVX2-NEXT:    leaq 1(%r11,%rbx), %r11
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT:    leaq 1(%r12,%r8), %r9
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT:    leaq 1(%r8,%r10), %r8
+; AVX2-NEXT:    setb %r10b
+; AVX2-NEXT:    movzbl %r10b, %r10d
+; AVX2-NEXT:    shrdq $1, %r10, %rcx
+; AVX2-NEXT:    movzbl %bpl, %r10d
+; AVX2-NEXT:    shrdq $1, %r10, %rax
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %rdx
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %rdi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %rsi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %r11
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %r9
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %r8
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vmovq %rdx, %xmm2
+; AVX2-NEXT:    vmovq %rdi, %xmm3
+; AVX2-NEXT:    vmovq %rsi, %xmm4
+; AVX2-NEXT:    vmovq %r11, %xmm5
+; AVX2-NEXT:    vmovq %r9, %xmm6
+; AVX2-NEXT:    vmovq %r8, %xmm7
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vmovq %xmm1, %rdx
+; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    movq %rcx, %rax
+; AVX512-NEXT:    adcq %rdx, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    movq %r12, %rax
+; AVX512-NEXT:    adcq %rcx, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %r11
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512-NEXT:    vmovq %xmm3, %rbx
+; AVX512-NEXT:    movq %r11, %rax
+; AVX512-NEXT:    adcq %rbx, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r14
+; AVX512-NEXT:    vpextrq $1, %xmm3, %r15
+; AVX512-NEXT:    movq %r14, %rax
+; AVX512-NEXT:    adcq %r15, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vmovq %xmm0, %r13
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vmovq %xmm1, %r10
+; AVX512-NEXT:    movq %r13, %rax
+; AVX512-NEXT:    adcq %r10, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512-NEXT:    movq %r9, %rax
+; AVX512-NEXT:    adcq %r8, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT:    vmovq %xmm0, %rdi
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    movq %rdi, %rcx
+; AVX512-NEXT:    adcq %rsi, %rcx
+; AVX512-NEXT:    movb $1, %cl
+; AVX512-NEXT:    setb %bpl
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512-NEXT:    addb $-1, %cl
+; AVX512-NEXT:    movq %rdx, %rcx
+; AVX512-NEXT:    adcq %rax, %rcx
+; AVX512-NEXT:    leaq 1(%rdx,%rax), %rcx
+; AVX512-NEXT:    leaq 1(%rdi,%rsi), %rax
+; AVX512-NEXT:    leaq 1(%r9,%r8), %rdx
+; AVX512-NEXT:    leaq 1(%r13,%r10), %rdi
+; AVX512-NEXT:    leaq 1(%r14,%r15), %rsi
+; AVX512-NEXT:    leaq 1(%r11,%rbx), %r11
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT:    leaq 1(%r12,%r8), %r9
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT:    leaq 1(%r8,%r10), %r8
+; AVX512-NEXT:    setb %r10b
+; AVX512-NEXT:    movzbl %r10b, %r10d
+; AVX512-NEXT:    shrdq $1, %r10, %rcx
+; AVX512-NEXT:    movzbl %bpl, %r10d
+; AVX512-NEXT:    shrdq $1, %r10, %rax
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %rdx
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %rdi
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %rsi
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %r11
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %r9
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %r8
+; AVX512-NEXT:    vmovq %rcx, %xmm0
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vmovq %rdx, %xmm2
+; AVX512-NEXT:    vmovq %rdi, %xmm3
+; AVX512-NEXT:    vmovq %rsi, %xmm4
+; AVX512-NEXT:    vmovq %r11, %xmm5
+; AVX512-NEXT:    vmovq %r9, %xmm6
+; AVX512-NEXT:    vmovq %r8, %xmm7
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i64> %a0 to <8 x i128>
+  %x1 = zext <8 x i64> %a1 to <8 x i128>
+  %sum = add <8 x i128> %x0, %x1
+  %inc = add <8 x i128> %sum, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %shift = lshr <8 x i128> %inc, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <8 x i128> %shift to <8 x i64>
+  ret <8 x i64> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
new file mode 100644
index 0000000000000..a3864ab4bb44e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -0,0 +1,3437 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+;
+; 128-bit vectors
+;
+
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_fixed_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    paddb %xmm2, %xmm0
+; SSE-NEXT:    psubb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i8> %a0, %a1
+  %xor = xor <16 x i8> %a0, %a1
+  %shift = ashr <16 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <16 x i8> %and, %shift
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE2-LABEL: test_ext_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm3, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm2
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm4
+; SSE4-NEXT:    paddw %xmm2, %xmm4
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm0
+; SSE4-NEXT:    paddw %xmm3, %xmm0
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm1, %xmm0
+; SSE4-NEXT:    pand %xmm1, %xmm4
+; SSE4-NEXT:    packuswb %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i8> %a0 to <16 x i16>
+  %x1 = sext <16 x i8> %a1 to <16 x i16>
+  %sum = add <16 x i16> %x0, %x1
+  %shift = ashr <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <16 x i16> %shift to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_fixed_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a1, %a0
+  %shift = ashr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <8 x i16> %and, %shift
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE2-LABEL: test_ext_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    paddd %xmm2, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm2
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm4
+; SSE4-NEXT:    paddd %xmm2, %xmm4
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm0
+; SSE4-NEXT:    paddd %xmm3, %xmm0
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
+; SSE4-NEXT:    packusdw %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i16> %a0 to <8 x i32>
+  %x1 = sext <8 x i16> %a1 to <8 x i32>
+  %sum = add <8 x i32> %x0, %x1
+  %shift = ashr <8 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <8 x i32> %shift to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_fixed_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrad $1, %xmm0
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a1, %a0
+  %shift = ashr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = add <4 x i32> %and, %shift
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE2-LABEL: test_ext_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm3
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm0
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm1
+; SSE4-NEXT:    paddq %xmm3, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <4 x i32> %a0 to <4 x i64>
+  %x1 = sext <4 x i32> %a1 to <4 x i64>
+  %sum = add <4 x i64> %x0, %x1
+  %shift = ashr <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <4 x i64> %shift to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm0, %xmm2
+; SSE4-NEXT:    pand %xmm1, %xmm2
+; SSE4-NEXT:    pxor %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    psrad $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpsraq $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+  %and = and <2 x i64> %a0, %a1
+  %xor = xor <2 x i64> %a1, %a0
+  %shift = ashr <2 x i64> %xor, <i64 1, i64 1>
+  %res = add <2 x i64> %and, %shift
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_ext_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    movq %rdi, %r8
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    movq %r9, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    addq %r9, %rdx
+; SSE2-NEXT:    adcq %rsi, %r10
+; SSE2-NEXT:    addq %rdi, %rax
+; SSE2-NEXT:    adcq %rcx, %r8
+; SSE2-NEXT:    shldq $63, %rax, %r8
+; SSE2-NEXT:    shldq $63, %rdx, %r10
+; SSE2-NEXT:    movq %r10, %xmm0
+; SSE2-NEXT:    movq %r8, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm0, %rax
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    pextrq $1, %xmm0, %rdx
+; SSE4-NEXT:    movq %rdx, %rsi
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    movq %xmm1, %rdi
+; SSE4-NEXT:    movq %rdi, %r8
+; SSE4-NEXT:    sarq $63, %r8
+; SSE4-NEXT:    pextrq $1, %xmm1, %r9
+; SSE4-NEXT:    movq %r9, %r10
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    addq %r9, %rdx
+; SSE4-NEXT:    adcq %rsi, %r10
+; SSE4-NEXT:    addq %rdi, %rax
+; SSE4-NEXT:    adcq %rcx, %r8
+; SSE4-NEXT:    shldq $63, %rax, %r8
+; SSE4-NEXT:    shldq $63, %rdx, %r10
+; SSE4-NEXT:    movq %r10, %xmm1
+; SSE4-NEXT:    movq %r8, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    movq %rax, %rcx
+; AVX-NEXT:    sarq $63, %rcx
+; AVX-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX-NEXT:    movq %rdx, %rsi
+; AVX-NEXT:    sarq $63, %rsi
+; AVX-NEXT:    vmovq %xmm1, %rdi
+; AVX-NEXT:    movq %rdi, %r8
+; AVX-NEXT:    sarq $63, %r8
+; AVX-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX-NEXT:    movq %r9, %r10
+; AVX-NEXT:    sarq $63, %r10
+; AVX-NEXT:    addq %r9, %rdx
+; AVX-NEXT:    adcq %rsi, %r10
+; AVX-NEXT:    addq %rdi, %rax
+; AVX-NEXT:    adcq %rcx, %r8
+; AVX-NEXT:    shldq $63, %rax, %r8
+; AVX-NEXT:    shldq $63, %rdx, %r10
+; AVX-NEXT:    vmovq %r10, %xmm0
+; AVX-NEXT:    vmovq %r8, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %x0 = sext <2 x i64> %a0 to <2 x i128>
+  %x1 = sext <2 x i64> %a1 to <2 x i128>
+  %sum = add <2 x i128> %x0, %x1
+  %shift = ashr <2 x i128> %sum, <i128 1, i128 1>
+  %res = trunc <2 x i128> %shift to <2 x i64>
+  ret <2 x i64> %res
+}
+
+;
+; 256-bit vectors
+;
+
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_fixed_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    paddb %xmm4, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm0
+; SSE-NEXT:    paddb %xmm5, %xmm0
+; SSE-NEXT:    psubb %xmm3, %xmm0
+; SSE-NEXT:    psubb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <32 x i8> %a0, %a1
+  %xor = xor <32 x i8> %a0, %a1
+  %shift = ashr <32 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <32 x i8> %and, %shift
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE2-LABEL: test_ext_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm4, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    paddw %xmm5, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm6, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    paddw %xmm7, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    packuswb %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm4, %xmm5
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm7
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm4
+; SSE4-NEXT:    paddw %xmm5, %xmm4
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm1
+; SSE4-NEXT:    paddw %xmm6, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm3
+; SSE4-NEXT:    paddw %xmm7, %xmm3
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm0
+; SSE4-NEXT:    paddw %xmm8, %xmm0
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm2, %xmm0
+; SSE4-NEXT:    pand %xmm2, %xmm3
+; SSE4-NEXT:    packuswb %xmm3, %xmm0
+; SSE4-NEXT:    pand %xmm2, %xmm1
+; SSE4-NEXT:    pand %xmm2, %xmm4
+; SSE4-NEXT:    packuswb %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm5, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm5, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <32 x i8> %a0 to <32 x i16>
+  %x1 = sext <32 x i8> %a1 to <32 x i16>
+  %sum = add <32 x i16> %x0, %x1
+  %shift = ashr <32 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <32 x i16> %shift to <32 x i8>
+  ret <32 x i8> %res
+}
+
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_fixed_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psraw $1, %xmm1
+; SSE-NEXT:    paddw %xmm4, %xmm1
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i16> %a0, %a1
+  %xor = xor <16 x i16> %a1, %a0
+  %shift = ashr <16 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <16 x i16> %and, %shift
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE2-LABEL: test_ext_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm6
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    paddd %xmm4, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm5, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm7, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm8
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm8, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm4, %xmm5
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm7
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm4
+; SSE4-NEXT:    paddd %xmm5, %xmm4
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm1
+; SSE4-NEXT:    paddd %xmm6, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm3
+; SSE4-NEXT:    paddd %xmm7, %xmm3
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm0
+; SSE4-NEXT:    paddd %xmm8, %xmm0
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm2, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; SSE4-NEXT:    packusdw %xmm3, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
+; SSE4-NEXT:    packusdw %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpackusdw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i16> %a0 to <16 x i32>
+  %x1 = sext <16 x i16> %a1 to <16 x i32>
+  %sum = add <16 x i32> %x0, %x1
+  %shift = ashr <16 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <16 x i32> %shift to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_fixed_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrad $1, %xmm1
+; SSE-NEXT:    paddd %xmm4, %xmm1
+; SSE-NEXT:    psrad $1, %xmm0
+; SSE-NEXT:    paddd %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <8 x i32> %a0, %a1
+  %xor = xor <8 x i32> %a1, %a0
+  %shift = ashr <8 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = add <8 x i32> %and, %shift
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE2-LABEL: test_ext_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm8
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm8
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm4
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm5
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm7
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm1
+; SSE4-NEXT:    paddq %xmm4, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm3
+; SSE4-NEXT:    paddq %xmm5, %xmm3
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm0
+; SSE4-NEXT:    paddq %xmm6, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm2
+; SSE4-NEXT:    paddq %xmm7, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm6
+; AVX1-NEXT:    vpaddq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm6
+; AVX1-NEXT:    vpaddq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i32> %a0 to <8 x i64>
+  %x1 = sext <8 x i32> %a1 to <8 x i64>
+  %sum = add <8 x i64> %x0, %x1
+  %shift = ashr <8 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <8 x i64> %shift to <8 x i32>
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm5, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm1, %xmm4
+; SSE4-NEXT:    pand %xmm3, %xmm4
+; SSE4-NEXT:    movdqa %xmm0, %xmm5
+; SSE4-NEXT:    pand %xmm2, %xmm5
+; SSE4-NEXT:    pxor %xmm2, %xmm0
+; SSE4-NEXT:    pxor %xmm3, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm2
+; SSE4-NEXT:    psrad $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE4-NEXT:    paddq %xmm4, %xmm1
+; SSE4-NEXT:    movdqa %xmm0, %xmm2
+; SSE4-NEXT:    psrad $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE4-NEXT:    paddq %xmm5, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsraq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <4 x i64> %a0, %a1
+  %xor = xor <4 x i64> %a1, %a0
+  %shift = ashr <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
+  %res = add <4 x i64> %and, %shift
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_ext_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm4, %rdx
+; SSE2-NEXT:    movq %rdx, %r14
+; SSE2-NEXT:    sarq $63, %r14
+; SSE2-NEXT:    movq %xmm1, %rcx
+; SSE2-NEXT:    movq %rcx, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rsi
+; SSE2-NEXT:    movq %rsi, %r11
+; SSE2-NEXT:    sarq $63, %r11
+; SSE2-NEXT:    movq %xmm0, %r8
+; SSE2-NEXT:    movq %r8, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    movq %xmm3, %r15
+; SSE2-NEXT:    movq %r15, %r9
+; SSE2-NEXT:    sarq $63, %r9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r12
+; SSE2-NEXT:    movq %r12, %r13
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, %rbp
+; SSE2-NEXT:    sarq $63, %rbp
+; SSE2-NEXT:    addq %rax, %r8
+; SSE2-NEXT:    adcq %rbx, %rbp
+; SSE2-NEXT:    addq %r12, %rsi
+; SSE2-NEXT:    adcq %r11, %r13
+; SSE2-NEXT:    addq %r15, %rcx
+; SSE2-NEXT:    adcq %r10, %r9
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; SSE2-NEXT:    adcq %r14, %rdi
+; SSE2-NEXT:    shldq $63, %rdx, %rdi
+; SSE2-NEXT:    shldq $63, %rcx, %r9
+; SSE2-NEXT:    shldq $63, %rsi, %r13
+; SSE2-NEXT:    shldq $63, %r8, %rbp
+; SSE2-NEXT:    movq %rbp, %xmm0
+; SSE2-NEXT:    movq %r13, %xmm2
+; SSE2-NEXT:    movq %r9, %xmm1
+; SSE2-NEXT:    movq %rdi, %xmm3
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    movq %xmm1, %rdi
+; SSE4-NEXT:    movq %rdi, %r14
+; SSE4-NEXT:    sarq $63, %r14
+; SSE4-NEXT:    pextrq $1, %xmm1, %rcx
+; SSE4-NEXT:    movq %rcx, %r10
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    movq %xmm0, %rsi
+; SSE4-NEXT:    movq %rsi, %r11
+; SSE4-NEXT:    sarq $63, %r11
+; SSE4-NEXT:    pextrq $1, %xmm0, %r8
+; SSE4-NEXT:    movq %r8, %rbx
+; SSE4-NEXT:    sarq $63, %rbx
+; SSE4-NEXT:    movq %xmm3, %rdx
+; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rdx
+; SSE4-NEXT:    pextrq $1, %xmm3, %r15
+; SSE4-NEXT:    movq %r15, %r9
+; SSE4-NEXT:    sarq $63, %r9
+; SSE4-NEXT:    movq %xmm2, %r12
+; SSE4-NEXT:    movq %r12, %r13
+; SSE4-NEXT:    sarq $63, %r13
+; SSE4-NEXT:    pextrq $1, %xmm2, %rax
+; SSE4-NEXT:    movq %rax, %rbp
+; SSE4-NEXT:    sarq $63, %rbp
+; SSE4-NEXT:    addq %rax, %r8
+; SSE4-NEXT:    adcq %rbx, %rbp
+; SSE4-NEXT:    addq %r12, %rsi
+; SSE4-NEXT:    adcq %r11, %r13
+; SSE4-NEXT:    addq %r15, %rcx
+; SSE4-NEXT:    adcq %r10, %r9
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; SSE4-NEXT:    adcq %r14, %rdx
+; SSE4-NEXT:    shldq $63, %rdi, %rdx
+; SSE4-NEXT:    shldq $63, %rcx, %r9
+; SSE4-NEXT:    shldq $63, %rsi, %r13
+; SSE4-NEXT:    shldq $63, %r8, %rbp
+; SSE4-NEXT:    movq %rbp, %xmm2
+; SSE4-NEXT:    movq %r13, %xmm0
+; SSE4-NEXT:    movq %r9, %xmm3
+; SSE4-NEXT:    movq %rdx, %xmm1
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vmovq %xmm0, %rdx
+; AVX1-NEXT:    movq %rdx, %r14
+; AVX1-NEXT:    sarq $63, %r14
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT:    movq %rcx, %r10
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rsi
+; AVX1-NEXT:    movq %rsi, %r11
+; AVX1-NEXT:    sarq $63, %r11
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX1-NEXT:    movq %rdi, %rbx
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vmovq %xmm1, %r8
+; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r15
+; AVX1-NEXT:    movq %r15, %r9
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r12
+; AVX1-NEXT:    movq %r12, %r13
+; AVX1-NEXT:    sarq $63, %r13
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    movq %rax, %rbp
+; AVX1-NEXT:    sarq $63, %rbp
+; AVX1-NEXT:    addq %rax, %rdi
+; AVX1-NEXT:    adcq %rbx, %rbp
+; AVX1-NEXT:    addq %r12, %rsi
+; AVX1-NEXT:    adcq %r11, %r13
+; AVX1-NEXT:    addq %r15, %rcx
+; AVX1-NEXT:    adcq %r10, %r9
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; AVX1-NEXT:    adcq %r14, %r8
+; AVX1-NEXT:    shldq $63, %rdx, %r8
+; AVX1-NEXT:    shldq $63, %rcx, %r9
+; AVX1-NEXT:    shldq $63, %rsi, %r13
+; AVX1-NEXT:    shldq $63, %rdi, %rbp
+; AVX1-NEXT:    vmovq %rbp, %xmm0
+; AVX1-NEXT:    vmovq %r13, %xmm1
+; AVX1-NEXT:    vmovq %r9, %xmm2
+; AVX1-NEXT:    vmovq %r8, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    movq %rdx, %r14
+; AVX2-NEXT:    sarq $63, %r14
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    movq %rcx, %r10
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    movq %rsi, %r11
+; AVX2-NEXT:    sarq $63, %r11
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vmovq %xmm1, %r8
+; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r15
+; AVX2-NEXT:    movq %r15, %r9
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r12
+; AVX2-NEXT:    movq %r12, %r13
+; AVX2-NEXT:    sarq $63, %r13
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    movq %rax, %rbp
+; AVX2-NEXT:    sarq $63, %rbp
+; AVX2-NEXT:    addq %rax, %rdi
+; AVX2-NEXT:    adcq %rbx, %rbp
+; AVX2-NEXT:    addq %r12, %rsi
+; AVX2-NEXT:    adcq %r11, %r13
+; AVX2-NEXT:    addq %r15, %rcx
+; AVX2-NEXT:    adcq %r10, %r9
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; AVX2-NEXT:    adcq %r14, %r8
+; AVX2-NEXT:    shldq $63, %rdx, %r8
+; AVX2-NEXT:    shldq $63, %rcx, %r9
+; AVX2-NEXT:    shldq $63, %rsi, %r13
+; AVX2-NEXT:    shldq $63, %rdi, %rbp
+; AVX2-NEXT:    vmovq %rbp, %xmm0
+; AVX2-NEXT:    vmovq %r13, %xmm1
+; AVX2-NEXT:    vmovq %r9, %xmm2
+; AVX2-NEXT:    vmovq %r8, %xmm3
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vmovq %xmm0, %rdx
+; AVX512-NEXT:    movq %rdx, %r14
+; AVX512-NEXT:    sarq $63, %r14
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    movq %rcx, %r10
+; AVX512-NEXT:    sarq $63, %r10
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rsi
+; AVX512-NEXT:    movq %rsi, %r11
+; AVX512-NEXT:    sarq $63, %r11
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512-NEXT:    movq %rdi, %rbx
+; AVX512-NEXT:    sarq $63, %rbx
+; AVX512-NEXT:    vmovq %xmm1, %r8
+; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r8
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r15
+; AVX512-NEXT:    movq %r15, %r9
+; AVX512-NEXT:    sarq $63, %r9
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r12
+; AVX512-NEXT:    movq %r12, %r13
+; AVX512-NEXT:    sarq $63, %r13
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    movq %rax, %rbp
+; AVX512-NEXT:    sarq $63, %rbp
+; AVX512-NEXT:    addq %rax, %rdi
+; AVX512-NEXT:    adcq %rbx, %rbp
+; AVX512-NEXT:    addq %r12, %rsi
+; AVX512-NEXT:    adcq %r11, %r13
+; AVX512-NEXT:    addq %r15, %rcx
+; AVX512-NEXT:    adcq %r10, %r9
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; AVX512-NEXT:    adcq %r14, %r8
+; AVX512-NEXT:    shldq $63, %rdx, %r8
+; AVX512-NEXT:    shldq $63, %rcx, %r9
+; AVX512-NEXT:    shldq $63, %rsi, %r13
+; AVX512-NEXT:    shldq $63, %rdi, %rbp
+; AVX512-NEXT:    vmovq %rbp, %xmm0
+; AVX512-NEXT:    vmovq %r13, %xmm1
+; AVX512-NEXT:    vmovq %r9, %xmm2
+; AVX512-NEXT:    vmovq %r8, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = sext <4 x i64> %a0 to <4 x i128>
+  %x1 = sext <4 x i64> %a1 to <4 x i128>
+  %sum = add <4 x i128> %x0, %x1
+  %shift = ashr <4 x i128> %sum, <i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <4 x i128> %shift to <4 x i64>
+  ret <4 x i64> %res
+}
+
+;
+; 512-bit vectors
+;
+
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_fixed_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm10
+; SSE-NEXT:    pand %xmm7, %xmm10
+; SSE-NEXT:    movdqa %xmm2, %xmm11
+; SSE-NEXT:    pand %xmm6, %xmm11
+; SSE-NEXT:    movdqa %xmm1, %xmm9
+; SSE-NEXT:    pand %xmm5, %xmm9
+; SSE-NEXT:    movdqa %xmm0, %xmm8
+; SSE-NEXT:    pand %xmm4, %xmm8
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm5, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm4, %xmm3
+; SSE-NEXT:    paddb %xmm10, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm2
+; SSE-NEXT:    pand %xmm5, %xmm2
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    paddb %xmm11, %xmm2
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm4, %xmm1
+; SSE-NEXT:    paddb %xmm9, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm5, %xmm0
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    paddb %xmm8, %xmm0
+; SSE-NEXT:    psubb %xmm4, %xmm0
+; SSE-NEXT:    psubb %xmm4, %xmm1
+; SSE-NEXT:    psubb %xmm4, %xmm2
+; SSE-NEXT:    psubb %xmm4, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm5, %xmm7, %xmm3
+; AVX1-NEXT:    vpsubb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <64 x i8> %a0, %a1
+  %xor = xor <64 x i8> %a0, %a1
+  %shift = ashr <64 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <64 x i8> %and, %shift
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE2-LABEL: test_ext_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; SSE2-NEXT:    psraw $8, %xmm13
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15]
+; SSE2-NEXT:    psraw $8, %xmm14
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
+; SSE2-NEXT:    psraw $8, %xmm15
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15]
+; SSE2-NEXT:    psraw $8, %xmm12
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm11
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm10
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm9
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm8
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    paddw %xmm13, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm7
+; SSE2-NEXT:    paddw %xmm14, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    paddw %xmm15, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    paddw %xmm12, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm11, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    paddw %xmm10, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm9, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    paddw %xmm8, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm7
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm6
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    packuswb %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    packuswb %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm7
+; SSE2-NEXT:    pand %xmm8, %xmm3
+; SSE2-NEXT:    packuswb %xmm7, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v64i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm8, %xmm9
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm11
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm13
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm15
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm8
+; SSE4-NEXT:    paddw %xmm9, %xmm8
+; SSE4-NEXT:    pmovsxbw %xmm7, %xmm3
+; SSE4-NEXT:    paddw %xmm10, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm7
+; SSE4-NEXT:    paddw %xmm11, %xmm7
+; SSE4-NEXT:    pmovsxbw %xmm6, %xmm2
+; SSE4-NEXT:    paddw %xmm12, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm6
+; SSE4-NEXT:    paddw %xmm13, %xmm6
+; SSE4-NEXT:    pmovsxbw %xmm5, %xmm1
+; SSE4-NEXT:    paddw %xmm14, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm5
+; SSE4-NEXT:    paddw %xmm15, %xmm5
+; SSE4-NEXT:    pmovsxbw %xmm4, %xmm0
+; SSE4-NEXT:    paddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE4-NEXT:    psrlw $1, %xmm8
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm7
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm6
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm5
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm4, %xmm0
+; SSE4-NEXT:    pand %xmm4, %xmm5
+; SSE4-NEXT:    packuswb %xmm5, %xmm0
+; SSE4-NEXT:    pand %xmm4, %xmm1
+; SSE4-NEXT:    pand %xmm4, %xmm6
+; SSE4-NEXT:    packuswb %xmm6, %xmm1
+; SSE4-NEXT:    pand %xmm4, %xmm2
+; SSE4-NEXT:    pand %xmm4, %xmm7
+; SSE4-NEXT:    packuswb %xmm7, %xmm2
+; SSE4-NEXT:    pand %xmm4, %xmm3
+; SSE4-NEXT:    pand %xmm4, %xmm8
+; SSE4-NEXT:    packuswb %xmm8, %xmm3
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm6, %xmm6
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm10, %xmm10
+; AVX1-NEXT:    vpaddw %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm10
+; AVX1-NEXT:    vpaddw %xmm5, %xmm10, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm10, %xmm10
+; AVX1-NEXT:    vpaddw %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm7
+; AVX1-NEXT:    vpaddw %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm8, %xmm8
+; AVX1-NEXT:    vpaddw %xmm8, %xmm9, %xmm8
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrlw $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpackuswb %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm6, %xmm8, %xmm6
+; AVX1-NEXT:    vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm8, %xmm1
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm8, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm6
+; AVX2-NEXT:    vpaddw %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm4, %ymm2
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpackuswb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovsxbw %ymm2, %zmm2
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovsxbw %ymm3, %zmm3
+; AVX512-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <64 x i8> %a0 to <64 x i16>
+  %x1 = sext <64 x i8> %a1 to <64 x i16>
+  %sum = add <64 x i16> %x0, %x1
+  %shift = ashr <64 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <64 x i16> %shift to <64 x i8>
+  ret <64 x i8> %res
+}
+
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_fixed_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psraw $1, %xmm3
+; SSE-NEXT:    paddw %xmm8, %xmm3
+; SSE-NEXT:    psraw $1, %xmm2
+; SSE-NEXT:    paddw %xmm9, %xmm2
+; SSE-NEXT:    psraw $1, %xmm1
+; SSE-NEXT:    paddw %xmm10, %xmm1
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddw %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsraw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsraw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <32 x i16> %a0, %a1
+  %xor = xor <32 x i16> %a1, %a0
+  %shift = ashr <32 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <32 x i16> %and, %shift
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE2-LABEL: test_ext_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm3, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm14
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
+; SSE2-NEXT:    psrad $16, %xmm13
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
+; SSE2-NEXT:    psrad $16, %xmm12
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; SSE2-NEXT:    psrad $16, %xmm11
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; SSE2-NEXT:    psrad $16, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; SSE2-NEXT:    psrad $16, %xmm9
+; SSE2-NEXT:    paddd %xmm8, %xmm9
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm15, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    paddd %xmm3, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm14, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    paddd %xmm13, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    paddd %xmm12, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    paddd %xmm11, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    paddd %xmm10, %xmm3
+; SSE2-NEXT:    pslld $15, %xmm9
+; SSE2-NEXT:    psrad $16, %xmm9
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm9, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm8
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm8, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm5, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    packssdw %xmm4, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm8, %xmm9
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm11
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm13
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm15
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm8
+; SSE4-NEXT:    paddd %xmm9, %xmm8
+; SSE4-NEXT:    pmovsxwd %xmm7, %xmm3
+; SSE4-NEXT:    paddd %xmm10, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm7
+; SSE4-NEXT:    paddd %xmm11, %xmm7
+; SSE4-NEXT:    pmovsxwd %xmm6, %xmm2
+; SSE4-NEXT:    paddd %xmm12, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm6
+; SSE4-NEXT:    paddd %xmm13, %xmm6
+; SSE4-NEXT:    pmovsxwd %xmm5, %xmm1
+; SSE4-NEXT:    paddd %xmm14, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm5
+; SSE4-NEXT:    paddd %xmm15, %xmm5
+; SSE4-NEXT:    pmovsxwd %xmm4, %xmm0
+; SSE4-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE4-NEXT:    psrld $1, %xmm8
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm7
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm6
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm5
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm4, %xmm4
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm5, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm6, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2],xmm4[3],xmm7[4],xmm4[5],xmm7[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm7, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4],xmm4[5],xmm8[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm8, %xmm3
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm6, %xmm6
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm10, %xmm10
+; AVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm10
+; AVX1-NEXT:    vpaddd %xmm5, %xmm10, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm10, %xmm10
+; AVX1-NEXT:    vpaddd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm7
+; AVX1-NEXT:    vpaddd %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm8, %xmm8
+; AVX1-NEXT:    vpaddd %xmm8, %xmm9, %xmm8
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm4
+; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2],xmm8[3],xmm0[4],xmm8[5],xmm0[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2],xmm8[3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4],xmm8[5],xmm6[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2],xmm8[3],xmm3[4],xmm8[5],xmm3[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2],xmm8[3],xmm1[4],xmm8[5],xmm1[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4],xmm8[5],xmm5[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm8[1],xmm4[2],xmm8[3],xmm4[4],xmm8[5],xmm4[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2],xmm8[3],xmm2[4],xmm8[5],xmm2[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm6
+; AVX2-NEXT:    vpaddd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm2
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7],ymm3[8],ymm4[9],ymm3[10],ymm4[11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-NEXT:    vpackusdw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7],ymm2[8],ymm4[9],ymm2[10],ymm4[11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovsxwd %ymm3, %zmm3
+; AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <32 x i16> %a0 to <32 x i32>
+  %x1 = sext <32 x i16> %a1 to <32 x i32>
+  %sum = add <32 x i32> %x0, %x1
+  %shift = ashr <32 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <32 x i32> %shift to <32 x i16>
+  ret <32 x i16> %res
+}
+
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE-LABEL: test_fixed_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrad $1, %xmm3
+; SSE-NEXT:    paddd %xmm8, %xmm3
+; SSE-NEXT:    psrad $1, %xmm2
+; SSE-NEXT:    paddd %xmm9, %xmm2
+; SSE-NEXT:    psrad $1, %xmm1
+; SSE-NEXT:    paddd %xmm10, %xmm1
+; SSE-NEXT:    psrad $1, %xmm0
+; SSE-NEXT:    paddd %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrad $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i32> %a0, %a1
+  %xor = xor <16 x i32> %a1, %a0
+  %shift = ashr <16 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = add <16 x i32> %and, %shift
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE2-LABEL: test_ext_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm3[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm13, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm12, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm10, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
+; SSE2-NEXT:    pxor %xmm10, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; SSE2-NEXT:    pxor %xmm14, %xmm14
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm7
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
+; SSE2-NEXT:    paddq %xmm13, %xmm10
+; SSE2-NEXT:    pxor %xmm13, %xmm13
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; SSE2-NEXT:    paddq %xmm12, %xmm7
+; SSE2-NEXT:    pxor %xmm12, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
+; SSE2-NEXT:    paddq %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm11, %xmm6
+; SSE2-NEXT:    pxor %xmm11, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm9, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm10
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm9
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm11
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm13
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm15
+; SSE4-NEXT:    pmovsxdq %xmm7, %xmm3
+; SSE4-NEXT:    paddq %xmm8, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm7
+; SSE4-NEXT:    paddq %xmm9, %xmm7
+; SSE4-NEXT:    pmovsxdq %xmm6, %xmm2
+; SSE4-NEXT:    paddq %xmm10, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm6
+; SSE4-NEXT:    paddq %xmm11, %xmm6
+; SSE4-NEXT:    pmovsxdq %xmm5, %xmm1
+; SSE4-NEXT:    paddq %xmm12, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm5
+; SSE4-NEXT:    paddq %xmm13, %xmm5
+; SSE4-NEXT:    pmovsxdq %xmm4, %xmm0
+; SSE4-NEXT:    paddq %xmm14, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm4, %xmm4
+; SSE4-NEXT:    paddq %xmm15, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm7
+; SSE4-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm6
+; AVX1-NEXT:    vpmovsxdq %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm8, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxdq %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm11, %xmm11
+; AVX1-NEXT:    vpaddq %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm11, %xmm11
+; AVX1-NEXT:    vpaddq %xmm6, %xmm11, %xmm6
+; AVX1-NEXT:    vpmovsxdq %xmm10, %xmm10
+; AVX1-NEXT:    vpaddq %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm10, %xmm10
+; AVX1-NEXT:    vpaddq %xmm10, %xmm8, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm10, %xmm10
+; AVX1-NEXT:    vpaddq %xmm10, %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm5, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm8, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm9, %xmm7
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm3
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT:    vpmovsxdq %xmm6, %ymm6
+; AVX2-NEXT:    vpaddq %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT:    vpmovsxdq %xmm6, %ymm6
+; AVX2-NEXT:    vpaddq %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm5[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],ymm4[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovsxdq %ymm2, %zmm2
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovsxdq %ymm3, %zmm3
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i32> %a0 to <16 x i64>
+  %x1 = sext <16 x i32> %a1 to <16 x i64>
+  %sum = add <16 x i64> %x0, %x1
+  %shift = ashr <16 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <16 x i64> %shift to <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm3, %xmm11
+; SSE2-NEXT:    pand %xmm7, %xmm11
+; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    pand %xmm6, %xmm10
+; SSE2-NEXT:    movdqa %xmm1, %xmm9
+; SSE2-NEXT:    pand %xmm5, %xmm9
+; SSE2-NEXT:    movdqa %xmm0, %xmm8
+; SSE2-NEXT:    pand %xmm4, %xmm8
+; SSE2-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm11, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm10, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm8, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm10
+; SSE4-NEXT:    pand %xmm7, %xmm10
+; SSE4-NEXT:    movdqa %xmm2, %xmm11
+; SSE4-NEXT:    pand %xmm6, %xmm11
+; SSE4-NEXT:    movdqa %xmm1, %xmm9
+; SSE4-NEXT:    pand %xmm5, %xmm9
+; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    pand %xmm4, %xmm8
+; SSE4-NEXT:    pxor %xmm4, %xmm0
+; SSE4-NEXT:    pxor %xmm5, %xmm1
+; SSE4-NEXT:    pxor %xmm6, %xmm2
+; SSE4-NEXT:    pxor %xmm7, %xmm3
+; SSE4-NEXT:    movdqa %xmm3, %xmm4
+; SSE4-NEXT:    psrad $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; SSE4-NEXT:    paddq %xmm10, %xmm3
+; SSE4-NEXT:    movdqa %xmm2, %xmm4
+; SSE4-NEXT:    psrad $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; SSE4-NEXT:    paddq %xmm11, %xmm2
+; SSE4-NEXT:    movdqa %xmm1, %xmm4
+; SSE4-NEXT:    psrad $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; SSE4-NEXT:    paddq %xmm9, %xmm1
+; SSE4-NEXT:    movdqa %xmm0, %xmm4
+; SSE4-NEXT:    psrad $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; SSE4-NEXT:    paddq %xmm8, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpaddq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsraq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <8 x i64> %a0, %a1
+  %xor = xor <8 x i64> %a1, %a0
+  %shift = ashr <8 x i64> %xor, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = add <8 x i64> %and, %shift
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_ext_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 64
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm8, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm1, %rbp
+; SSE2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rbp
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rbx
+; SSE2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movq %xmm0, %r15
+; SSE2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r15
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r10
+; SSE2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    movq %xmm7, %r9
+; SSE2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r12
+; SSE2-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    movq %xmm6, %r13
+; SSE2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r14
+; SSE2-NEXT:    movq %r14, %rsi
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    movq %xmm5, %r11
+; SSE2-NEXT:    movq %r11, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r8
+; SSE2-NEXT:    movq %r8, %rdi
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    movq %xmm4, %rcx
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    sarq $63, %rax
+; SSE2-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    adcq %r15, %rax
+; SSE2-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    adcq %rbx, %rdi
+; SSE2-NEXT:    addq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    adcq %rbp, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE2-NEXT:    addq %r14, %r15
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; SSE2-NEXT:    adcq (%rsp), %r10 # 8-byte Folded Reload
+; SSE2-NEXT:    shldq $63, %rcx, %r10
+; SSE2-NEXT:    shldq $63, %r8, %r9
+; SSE2-NEXT:    shldq $63, %r11, %r12
+; SSE2-NEXT:    shldq $63, %rbx, %r13
+; SSE2-NEXT:    shldq $63, %r15, %rsi
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rcx, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rcx, %rdi
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rcx, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movq %rdi, %xmm4
+; SSE2-NEXT:    movq %rdx, %xmm1
+; SSE2-NEXT:    movq %rsi, %xmm5
+; SSE2-NEXT:    movq %r13, %xmm2
+; SSE2-NEXT:    movq %r12, %xmm6
+; SSE2-NEXT:    movq %r9, %xmm3
+; SSE2-NEXT:    movq %r10, %xmm7
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    pushq %rax
+; SSE4-NEXT:    .cfi_def_cfa_offset 64
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    movq %xmm3, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm3, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm2, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm2, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm1, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm1, %rbp
+; SSE4-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rbp
+; SSE4-NEXT:    movq %xmm0, %rbx
+; SSE4-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rbx
+; SSE4-NEXT:    pextrq $1, %xmm0, %r14
+; SSE4-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r14
+; SSE4-NEXT:    movq %xmm7, %r10
+; SSE4-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    pextrq $1, %xmm7, %r9
+; SSE4-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r9
+; SSE4-NEXT:    movq %xmm6, %r15
+; SSE4-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r15
+; SSE4-NEXT:    pextrq $1, %xmm6, %r13
+; SSE4-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r13
+; SSE4-NEXT:    movq %xmm5, %r12
+; SSE4-NEXT:    movq %r12, %rsi
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    pextrq $1, %xmm5, %r11
+; SSE4-NEXT:    movq %r11, %rdx
+; SSE4-NEXT:    sarq $63, %rdx
+; SSE4-NEXT:    movq %xmm4, %r8
+; SSE4-NEXT:    movq %r8, %rdi
+; SSE4-NEXT:    sarq $63, %rdi
+; SSE4-NEXT:    pextrq $1, %xmm4, %rcx
+; SSE4-NEXT:    movq %rcx, %rax
+; SSE4-NEXT:    sarq $63, %rax
+; SSE4-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE4-NEXT:    adcq %r14, %rax
+; SSE4-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE4-NEXT:    adcq %rbx, %rdi
+; SSE4-NEXT:    addq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE4-NEXT:    adcq %rbp, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    addq %r12, %r14
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; SSE4-NEXT:    adcq (%rsp), %r10 # 8-byte Folded Reload
+; SSE4-NEXT:    shldq $63, %rcx, %r10
+; SSE4-NEXT:    shldq $63, %r8, %r9
+; SSE4-NEXT:    shldq $63, %r11, %r15
+; SSE4-NEXT:    shldq $63, %rbx, %r13
+; SSE4-NEXT:    shldq $63, %r14, %rsi
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rcx, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rcx, %rdi
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rcx, %rax
+; SSE4-NEXT:    movq %rax, %xmm4
+; SSE4-NEXT:    movq %rdi, %xmm0
+; SSE4-NEXT:    movq %rdx, %xmm5
+; SSE4-NEXT:    movq %rsi, %xmm1
+; SSE4-NEXT:    movq %r13, %xmm6
+; SSE4-NEXT:    movq %r15, %xmm2
+; SSE4-NEXT:    movq %r9, %xmm7
+; SSE4-NEXT:    movq %r10, %xmm3
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE4-NEXT:    addq $8, %rsp
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    pushq %rax
+; AVX1-NEXT:    .cfi_def_cfa_offset 64
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rbp
+; AVX1-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rbp
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rbx
+; AVX1-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r15
+; AVX1-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r15
+; AVX1-NEXT:    vmovq %xmm3, %r9
+; AVX1-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX1-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r12
+; AVX1-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r12
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r13
+; AVX1-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r13
+; AVX1-NEXT:    vmovq %xmm2, %r14
+; AVX1-NEXT:    movq %r14, %rsi
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX1-NEXT:    movq %r11, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r8
+; AVX1-NEXT:    movq %r8, %rdi
+; AVX1-NEXT:    sarq $63, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT:    movq %rcx, %rax
+; AVX1-NEXT:    sarq $63, %rax
+; AVX1-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    adcq %r15, %rax
+; AVX1-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    adcq %rbx, %rdi
+; AVX1-NEXT:    addq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    adcq %rbp, %rdx
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX1-NEXT:    addq %r14, %r15
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX1-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX1-NEXT:    shldq $63, %rcx, %r9
+; AVX1-NEXT:    shldq $63, %r8, %r10
+; AVX1-NEXT:    shldq $63, %r11, %r12
+; AVX1-NEXT:    shldq $63, %rbx, %r13
+; AVX1-NEXT:    shldq $63, %r15, %rsi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rcx, %rdx
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rcx, %rdi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rcx, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vmovq %rdi, %xmm1
+; AVX1-NEXT:    vmovq %rdx, %xmm2
+; AVX1-NEXT:    vmovq %rsi, %xmm3
+; AVX1-NEXT:    vmovq %r13, %xmm4
+; AVX1-NEXT:    vmovq %r12, %xmm5
+; AVX1-NEXT:    vmovq %r10, %xmm6
+; AVX1-NEXT:    vmovq %r9, %xmm7
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    addq $8, %rsp
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    .cfi_def_cfa_offset 64
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rbp
+; AVX2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rbp
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rbx
+; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r15
+; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r15
+; AVX2-NEXT:    vmovq %xmm3, %r9
+; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r12
+; AVX2-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r12
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r13
+; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r13
+; AVX2-NEXT:    vmovq %xmm2, %r14
+; AVX2-NEXT:    movq %r14, %rsi
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX2-NEXT:    movq %r11, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r8
+; AVX2-NEXT:    movq %r8, %rdi
+; AVX2-NEXT:    sarq $63, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    sarq $63, %rax
+; AVX2-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    adcq %r15, %rax
+; AVX2-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    adcq %rbx, %rdi
+; AVX2-NEXT:    addq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    adcq %rbp, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT:    addq %r14, %r15
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX2-NEXT:    shldq $63, %rcx, %r9
+; AVX2-NEXT:    shldq $63, %r8, %r10
+; AVX2-NEXT:    shldq $63, %r11, %r12
+; AVX2-NEXT:    shldq $63, %rbx, %r13
+; AVX2-NEXT:    shldq $63, %r15, %rsi
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rcx, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rcx, %rdi
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rcx, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm0
+; AVX2-NEXT:    vmovq %rdi, %xmm1
+; AVX2-NEXT:    vmovq %rdx, %xmm2
+; AVX2-NEXT:    vmovq %rsi, %xmm3
+; AVX2-NEXT:    vmovq %r13, %xmm4
+; AVX2-NEXT:    vmovq %r12, %xmm5
+; AVX2-NEXT:    vmovq %r10, %xmm6
+; AVX2-NEXT:    vmovq %r9, %xmm7
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    pushq %rax
+; AVX512-NEXT:    .cfi_def_cfa_offset 64
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r13
+; AVX512-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r13
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r14
+; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r14
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r15
+; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r15
+; AVX512-NEXT:    vmovq %xmm1, %r9
+; AVX512-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r9
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r11
+; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r11
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r12
+; AVX512-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r12
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rbp
+; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %rbp
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512-NEXT:    vmovq %xmm0, %rbx
+; AVX512-NEXT:    movq %rbx, %rsi
+; AVX512-NEXT:    sarq $63, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX512-NEXT:    movq %r10, %rdx
+; AVX512-NEXT:    sarq $63, %rdx
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r8
+; AVX512-NEXT:    movq %r8, %rdi
+; AVX512-NEXT:    sarq $63, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    movq %rcx, %rax
+; AVX512-NEXT:    sarq $63, %rax
+; AVX512-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    adcq %r15, %rax
+; AVX512-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    adcq %r14, %rdi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT:    addq %r10, %r15
+; AVX512-NEXT:    adcq %r13, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT:    addq %rbx, %r14
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX512-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX512-NEXT:    shldq $63, %rcx, %r9
+; AVX512-NEXT:    shldq $63, %r8, %r11
+; AVX512-NEXT:    shldq $63, %r10, %r12
+; AVX512-NEXT:    shldq $63, %rbx, %rbp
+; AVX512-NEXT:    shldq $63, %r14, %rsi
+; AVX512-NEXT:    shldq $63, %r15, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rcx, %rdi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rcx, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vmovq %rdi, %xmm1
+; AVX512-NEXT:    vmovq %rdx, %xmm2
+; AVX512-NEXT:    vmovq %rsi, %xmm3
+; AVX512-NEXT:    vmovq %rbp, %xmm4
+; AVX512-NEXT:    vmovq %r12, %xmm5
+; AVX512-NEXT:    vmovq %r11, %xmm6
+; AVX512-NEXT:    vmovq %r9, %xmm7
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    addq $8, %rsp
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i64> %a0 to <8 x i128>
+  %x1 = sext <8 x i64> %a1 to <8 x i128>
+  %sum = add <8 x i128> %x0, %x1
+  %shift = ashr <8 x i128> %sum, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <8 x i128> %shift to <8 x i64>
+  ret <8 x i64> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/avgflooru.ll b/llvm/test/CodeGen/X86/avgflooru.ll
new file mode 100644
index 0000000000000..e07c1f55991e8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgflooru.ll
@@ -0,0 +1,2629 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+;
+; 128-bit vectors
+;
+
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_fixed_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    paddb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i8> %a0, %a1
+  %xor = xor <16 x i8> %a0, %a1
+  %shift = lshr <16 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <16 x i8> %and, %shift
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE2-LABEL: test_ext_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE2-NEXT:    paddw %xmm3, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    paddw %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm3, %xmm3
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE4-NEXT:    paddw %xmm0, %xmm1
+; SSE4-NEXT:    paddw %xmm4, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    packuswb %xmm1, %xmm2
+; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i8> %a0 to <16 x i16>
+  %x1 = zext <16 x i8> %a1 to <16 x i16>
+  %sum = add <16 x i16> %x0, %x1
+  %shift = lshr <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <16 x i16> %shift to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_fixed_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a1, %a0
+  %shift = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <8 x i16> %and, %shift
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE2-LABEL: test_ext_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT:    paddd %xmm3, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm3, %xmm3
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE4-NEXT:    paddd %xmm0, %xmm1
+; SSE4-NEXT:    paddd %xmm4, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    packusdw %xmm1, %xmm2
+; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i16> %a0 to <8 x i32>
+  %x1 = zext <8 x i16> %a1 to <8 x i32>
+  %sum = add <8 x i32> %x0, %x1
+  %shift = lshr <8 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <8 x i32> %shift to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_fixed_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a1, %a0
+  %shift = lshr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = add <4 x i32> %and, %shift
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE2-LABEL: test_ext_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    paddq %xmm3, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm3, %xmm3
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE4-NEXT:    paddq %xmm0, %xmm1
+; SSE4-NEXT:    paddq %xmm4, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE4-NEXT:    movaps %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = zext <4 x i32> %a0 to <4 x i64>
+  %x1 = zext <4 x i32> %a1 to <4 x i64>
+  %sum = add <4 x i64> %x0, %x1
+  %shift = lshr <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <4 x i64> %shift to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE-LABEL: test_fixed_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlq $1, %xmm0
+; SSE-NEXT:    paddq %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <2 x i64> %a0, %a1
+  %xor = xor <2 x i64> %a1, %a0
+  %shift = lshr <2 x i64> %xor, <i64 1, i64 1>
+  %res = add <2 x i64> %and, %shift
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_ext_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %xmm1, %rsi
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    addq %rcx, %rsi
+; SSE2-NEXT:    setb %dil
+; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    addq %rax, %rdx
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    shldq $63, %rdx, %rcx
+; SSE2-NEXT:    shldq $63, %rsi, %rdi
+; SSE2-NEXT:    movq %rdi, %xmm0
+; SSE2-NEXT:    movq %rcx, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm0, %rax
+; SSE4-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE4-NEXT:    movq %xmm1, %rdx
+; SSE4-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE4-NEXT:    xorl %edi, %edi
+; SSE4-NEXT:    addq %rcx, %rsi
+; SSE4-NEXT:    setb %dil
+; SSE4-NEXT:    xorl %ecx, %ecx
+; SSE4-NEXT:    addq %rax, %rdx
+; SSE4-NEXT:    setb %cl
+; SSE4-NEXT:    shldq $63, %rdx, %rcx
+; SSE4-NEXT:    shldq $63, %rsi, %rdi
+; SSE4-NEXT:    movq %rdi, %xmm1
+; SSE4-NEXT:    movq %rcx, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT:    xorl %edi, %edi
+; AVX1-NEXT:    addq %rcx, %rsi
+; AVX1-NEXT:    setb %dil
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    addq %rax, %rdx
+; AVX1-NEXT:    setb %cl
+; AVX1-NEXT:    shldq $63, %rdx, %rcx
+; AVX1-NEXT:    shldq $63, %rsi, %rdi
+; AVX1-NEXT:    vmovq %rdi, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    addq %rcx, %rsi
+; AVX2-NEXT:    setb %dil
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    addq %rax, %rdx
+; AVX2-NEXT:    setb %cl
+; AVX2-NEXT:    shldq $63, %rdx, %rcx
+; AVX2-NEXT:    shldq $63, %rsi, %rdi
+; AVX2-NEXT:    vmovq %rdi, %xmm0
+; AVX2-NEXT:    vmovq %rcx, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    xorl %edi, %edi
+; AVX512-NEXT:    addq %rcx, %rdx
+; AVX512-NEXT:    setb %dil
+; AVX512-NEXT:    xorl %ecx, %ecx
+; AVX512-NEXT:    addq %rax, %rsi
+; AVX512-NEXT:    setb %cl
+; AVX512-NEXT:    shldq $63, %rsi, %rcx
+; AVX512-NEXT:    shldq $63, %rdx, %rdi
+; AVX512-NEXT:    vmovq %rdi, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    retq
+  %x0 = zext <2 x i64> %a0 to <2 x i128>
+  %x1 = zext <2 x i64> %a1 to <2 x i128>
+  %sum = add <2 x i128> %x0, %x1
+  %shift = lshr <2 x i128> %sum, <i128 1, i128 1>
+  %res = trunc <2 x i128> %shift to <2 x i64>
+  ret <2 x i64> %res
+}
+
+;
+; 256-bit vectors
+;
+
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_fixed_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    paddb %xmm4, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    paddb %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <32 x i8> %a0, %a1
+  %xor = xor <32 x i8> %a0, %a1
+  %shift = lshr <32 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <32 x i8> %and, %shift
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE2-LABEL: test_ext_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm7
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE2-NEXT:    paddw %xmm5, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    paddw %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT:    paddw %xmm6, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm7
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    packuswb %xmm7, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    packuswb %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm5, %xmm5
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; SSE4-NEXT:    paddw %xmm1, %xmm3
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; SSE4-NEXT:    paddw %xmm0, %xmm2
+; SSE4-NEXT:    paddw %xmm6, %xmm4
+; SSE4-NEXT:    paddw %xmm7, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    packuswb %xmm3, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    packuswb %xmm2, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vpaddw %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; AVX1-NEXT:    vpaddw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <32 x i8> %a0 to <32 x i16>
+  %x1 = zext <32 x i8> %a1 to <32 x i16>
+  %sum = add <32 x i16> %x0, %x1
+  %shift = lshr <32 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <32 x i16> %shift to <32 x i8>
+  ret <32 x i8> %res
+}
+
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_fixed_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    paddw %xmm4, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i16> %a0, %a1
+  %xor = xor <16 x i16> %a1, %a0
+  %shift = lshr <16 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <16 x i16> %and, %shift
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE2-LABEL: test_ext_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT:    paddd %xmm5, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm7
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm7, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm5, %xmm5
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE4-NEXT:    paddd %xmm1, %xmm3
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE4-NEXT:    paddd %xmm0, %xmm2
+; SSE4-NEXT:    paddd %xmm6, %xmm4
+; SSE4-NEXT:    paddd %xmm7, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    packusdw %xmm3, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    packusdw %xmm2, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX1-NEXT:    vpaddd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i16> %a0 to <16 x i32>
+  %x1 = zext <16 x i16> %a1 to <16 x i32>
+  %sum = add <16 x i32> %x0, %x1
+  %shift = lshr <16 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <16 x i32> %shift to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_fixed_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrld $1, %xmm1
+; SSE-NEXT:    paddd %xmm4, %xmm1
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <8 x i32> %a0, %a1
+  %xor = xor <8 x i32> %a1, %a0
+  %shift = lshr <8 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = add <8 x i32> %and, %shift
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE2-LABEL: test_ext_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    movdqa %xmm3, %xmm7
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT:    paddq %xmm5, %xmm7
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    paddq %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    paddq %xmm6, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm5, %xmm5
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE4-NEXT:    paddq %xmm1, %xmm3
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE4-NEXT:    paddq %xmm0, %xmm2
+; SSE4-NEXT:    paddq %xmm6, %xmm4
+; SSE4-NEXT:    paddq %xmm7, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE4-NEXT:    movaps %xmm1, %xmm0
+; SSE4-NEXT:    movaps %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX1-NEXT:    vpaddq %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i32> %a0 to <8 x i64>
+  %x1 = zext <8 x i32> %a1 to <8 x i64>
+  %sum = add <8 x i64> %x0, %x1
+  %shift = lshr <8 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <8 x i64> %shift to <8 x i32>
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE-LABEL: test_fixed_v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlq $1, %xmm1
+; SSE-NEXT:    paddq %xmm4, %xmm1
+; SSE-NEXT:    psrlq $1, %xmm0
+; SSE-NEXT:    paddq %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <4 x i64> %a0, %a1
+  %xor = xor <4 x i64> %a1, %a0
+  %shift = lshr <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
+  %res = add <4 x i64> %and, %shift
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_ext_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm4, %rdi
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r10
+; SSE2-NEXT:    movq %xmm0, %r11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r8
+; SSE2-NEXT:    movq %xmm3, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    addq %r11, %rax
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    xorl %r11d, %r11d
+; SSE2-NEXT:    addq %r10, %rdx
+; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    xorl %r10d, %r10d
+; SSE2-NEXT:    addq %r9, %rsi
+; SSE2-NEXT:    setb %r10b
+; SSE2-NEXT:    xorl %r9d, %r9d
+; SSE2-NEXT:    addq %rdi, %r8
+; SSE2-NEXT:    setb %r9b
+; SSE2-NEXT:    shldq $63, %r8, %r9
+; SSE2-NEXT:    shldq $63, %rsi, %r10
+; SSE2-NEXT:    shldq $63, %rdx, %r11
+; SSE2-NEXT:    shldq $63, %rax, %rcx
+; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    movq %r11, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movq %r10, %xmm1
+; SSE2-NEXT:    movq %r9, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm1, %r8
+; SSE4-NEXT:    pextrq $1, %xmm1, %r9
+; SSE4-NEXT:    movq %xmm0, %r10
+; SSE4-NEXT:    pextrq $1, %xmm0, %r11
+; SSE4-NEXT:    movq %xmm3, %rdi
+; SSE4-NEXT:    pextrq $1, %xmm3, %rsi
+; SSE4-NEXT:    movq %xmm2, %rdx
+; SSE4-NEXT:    pextrq $1, %xmm2, %rax
+; SSE4-NEXT:    xorl %ecx, %ecx
+; SSE4-NEXT:    addq %r11, %rax
+; SSE4-NEXT:    setb %cl
+; SSE4-NEXT:    xorl %r11d, %r11d
+; SSE4-NEXT:    addq %r10, %rdx
+; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    xorl %r10d, %r10d
+; SSE4-NEXT:    addq %r9, %rsi
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    xorl %r9d, %r9d
+; SSE4-NEXT:    addq %r8, %rdi
+; SSE4-NEXT:    setb %r9b
+; SSE4-NEXT:    shldq $63, %rdi, %r9
+; SSE4-NEXT:    shldq $63, %rsi, %r10
+; SSE4-NEXT:    shldq $63, %rdx, %r11
+; SSE4-NEXT:    shldq $63, %rax, %rcx
+; SSE4-NEXT:    movq %rcx, %xmm1
+; SSE4-NEXT:    movq %r11, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    movq %r10, %xmm2
+; SSE4-NEXT:    movq %r9, %xmm1
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r10
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX1-NEXT:    vmovq %xmm1, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    addq %r11, %rax
+; AVX1-NEXT:    setb %cl
+; AVX1-NEXT:    xorl %r11d, %r11d
+; AVX1-NEXT:    addq %r10, %rdx
+; AVX1-NEXT:    setb %r11b
+; AVX1-NEXT:    xorl %r10d, %r10d
+; AVX1-NEXT:    addq %r9, %rsi
+; AVX1-NEXT:    setb %r10b
+; AVX1-NEXT:    xorl %r9d, %r9d
+; AVX1-NEXT:    addq %rdi, %r8
+; AVX1-NEXT:    setb %r9b
+; AVX1-NEXT:    shldq $63, %r8, %r9
+; AVX1-NEXT:    shldq $63, %rsi, %r10
+; AVX1-NEXT:    shldq $63, %rdx, %r11
+; AVX1-NEXT:    shldq $63, %rax, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovq %r11, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovq %r10, %xmm1
+; AVX1-NEXT:    vmovq %r9, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r10
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX2-NEXT:    vmovq %xmm1, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    addq %r11, %rax
+; AVX2-NEXT:    setb %cl
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    addq %r10, %rdx
+; AVX2-NEXT:    setb %r11b
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    addq %r9, %rsi
+; AVX2-NEXT:    setb %r10b
+; AVX2-NEXT:    xorl %r9d, %r9d
+; AVX2-NEXT:    addq %rdi, %r8
+; AVX2-NEXT:    setb %r9b
+; AVX2-NEXT:    shldq $63, %r8, %r9
+; AVX2-NEXT:    shldq $63, %rsi, %r10
+; AVX2-NEXT:    shldq $63, %rdx, %r11
+; AVX2-NEXT:    shldq $63, %rax, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vmovq %r11, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vmovq %r10, %xmm1
+; AVX2-NEXT:    vmovq %r9, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq %xmm0, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r10
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX512-NEXT:    vmovq %xmm1, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vmovq %xmm0, %r8
+; AVX512-NEXT:    xorl %edx, %edx
+; AVX512-NEXT:    addq %r11, %rax
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    xorl %r11d, %r11d
+; AVX512-NEXT:    addq %r10, %r8
+; AVX512-NEXT:    setb %r11b
+; AVX512-NEXT:    xorl %r10d, %r10d
+; AVX512-NEXT:    addq %r9, %rcx
+; AVX512-NEXT:    setb %r10b
+; AVX512-NEXT:    xorl %r9d, %r9d
+; AVX512-NEXT:    addq %rsi, %rdi
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    shldq $63, %rdi, %r9
+; AVX512-NEXT:    shldq $63, %rcx, %r10
+; AVX512-NEXT:    shldq $63, %r8, %r11
+; AVX512-NEXT:    shldq $63, %rax, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm0
+; AVX512-NEXT:    vmovq %r11, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vmovq %r10, %xmm1
+; AVX512-NEXT:    vmovq %r9, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <4 x i64> %a0 to <4 x i128>
+  %x1 = zext <4 x i64> %a1 to <4 x i128>
+  %sum = add <4 x i128> %x0, %x1
+  %shift = lshr <4 x i128> %sum, <i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <4 x i128> %shift to <4 x i64>
+  ret <4 x i64> %res
+}
+
+;
+; 512-bit vectors
+;
+
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_fixed_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm9
+; SSE-NEXT:    pand %xmm7, %xmm9
+; SSE-NEXT:    movdqa %xmm2, %xmm10
+; SSE-NEXT:    pand %xmm6, %xmm10
+; SSE-NEXT:    movdqa %xmm1, %xmm11
+; SSE-NEXT:    pand %xmm5, %xmm11
+; SSE-NEXT:    movdqa %xmm0, %xmm8
+; SSE-NEXT:    pand %xmm4, %xmm8
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    paddb %xmm9, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    paddb %xmm10, %xmm2
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    paddb %xmm11, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    paddb %xmm8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <64 x i8> %a0, %a1
+  %xor = xor <64 x i8> %a0, %a1
+  %shift = lshr <64 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <64 x i8> %and, %shift
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE2-LABEL: test_ext_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    movdqa %xmm3, %xmm10
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm11
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm12
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm13
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSE2-NEXT:    movdqa %xmm7, %xmm9
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
+; SSE2-NEXT:    paddw %xmm10, %xmm9
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; SSE2-NEXT:    paddw %xmm7, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
+; SSE2-NEXT:    paddw %xmm11, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSE2-NEXT:    paddw %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
+; SSE2-NEXT:    paddw %xmm12, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; SSE2-NEXT:    paddw %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT:    paddw %xmm13, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; SSE2-NEXT:    paddw %xmm4, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm9
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    packuswb %xmm9, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm7
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    packuswb %xmm7, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm6
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    packuswb %xmm6, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    packuswb %xmm5, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v64i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm9
+; SSE4-NEXT:    movdqa %xmm2, %xmm10
+; SSE4-NEXT:    movdqa %xmm1, %xmm11
+; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    pxor %xmm13, %xmm13
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm14 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm13[8],xmm10[9],xmm13[9],xmm10[10],xmm13[10],xmm10[11],xmm13[11],xmm10[12],xmm13[12],xmm10[13],xmm13[13],xmm10[14],xmm13[14],xmm10[15],xmm13[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm15 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm12 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15]
+; SSE4-NEXT:    paddw %xmm9, %xmm7
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15]
+; SSE4-NEXT:    paddw %xmm10, %xmm6
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
+; SSE4-NEXT:    paddw %xmm11, %xmm5
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15]
+; SSE4-NEXT:    paddw %xmm8, %xmm4
+; SSE4-NEXT:    paddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE4-NEXT:    paddw %xmm14, %xmm2
+; SSE4-NEXT:    paddw %xmm15, %xmm1
+; SSE4-NEXT:    paddw %xmm12, %xmm0
+; SSE4-NEXT:    psrlw $1, %xmm7
+; SSE4-NEXT:    psrlw $1, %xmm6
+; SSE4-NEXT:    psrlw $1, %xmm5
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    packuswb %xmm7, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    packuswb %xmm6, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    packuswb %xmm5, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    packuswb %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; AVX1-NEXT:    vpaddw %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15]
+; AVX1-NEXT:    vpaddw %xmm7, %xmm12, %xmm7
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; AVX1-NEXT:    vpaddw %xmm12, %xmm8, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15]
+; AVX1-NEXT:    vpaddw %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero
+; AVX1-NEXT:    vpaddw %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero,xmm12[4],zero,xmm12[5],zero,xmm12[6],zero,xmm12[7],zero
+; AVX1-NEXT:    vpaddw %xmm2, %xmm9, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrlw $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
+; AVX2-NEXT:    vpaddw %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT:    vpaddw %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm4, %ymm2
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <64 x i8> %a0 to <64 x i16>
+  %x1 = zext <64 x i8> %a1 to <64 x i16>
+  %sum = add <64 x i16> %x0, %x1
+  %shift = lshr <64 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <64 x i16> %shift to <64 x i8>
+  ret <64 x i8> %res
+}
+
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_fixed_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm3
+; SSE-NEXT:    paddw %xmm8, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm2
+; SSE-NEXT:    paddw %xmm9, %xmm2
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    paddw %xmm10, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddw %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <32 x i16> %a0, %a1
+  %xor = xor <32 x i16> %a1, %a0
+  %shift = lshr <32 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <32 x i16> %and, %shift
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE2-LABEL: test_ext_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    movdqa %xmm0, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm11
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm12
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm13
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; SSE2-NEXT:    paddd %xmm9, %xmm10
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; SSE2-NEXT:    paddd %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; SSE2-NEXT:    paddd %xmm11, %xmm9
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
+; SSE2-NEXT:    paddd %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; SSE2-NEXT:    paddd %xmm12, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm7, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; SSE2-NEXT:    paddd %xmm13, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; SSE2-NEXT:    paddd %xmm7, %xmm3
+; SSE2-NEXT:    pslld $15, %xmm10
+; SSE2-NEXT:    psrad $16, %xmm10
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm10, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm9
+; SSE2-NEXT:    psrad $16, %xmm9
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm9, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm5, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    packssdw %xmm4, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm9
+; SSE4-NEXT:    movdqa %xmm2, %xmm10
+; SSE4-NEXT:    movdqa %xmm1, %xmm11
+; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    pxor %xmm13, %xmm13
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm14 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm15 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm12 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; SSE4-NEXT:    paddd %xmm9, %xmm7
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSE4-NEXT:    paddd %xmm10, %xmm6
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
+; SSE4-NEXT:    paddd %xmm11, %xmm5
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
+; SSE4-NEXT:    paddd %xmm8, %xmm4
+; SSE4-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE4-NEXT:    paddd %xmm14, %xmm2
+; SSE4-NEXT:    paddd %xmm15, %xmm1
+; SSE4-NEXT:    paddd %xmm12, %xmm0
+; SSE4-NEXT:    psrld $1, %xmm7
+; SSE4-NEXT:    psrld $1, %xmm6
+; SSE4-NEXT:    psrld $1, %xmm5
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    packusdw %xmm7, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    packusdw %xmm6, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    packusdw %xmm5, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    packusdw %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX1-NEXT:    vpaddd %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; AVX1-NEXT:    vpaddd %xmm7, %xmm12, %xmm7
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX1-NEXT:    vpaddd %xmm12, %xmm8, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; AVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
+; AVX1-NEXT:    vpaddd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
+; AVX1-NEXT:    vpaddd %xmm2, %xmm9, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrld $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX2-NEXT:    vpaddd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vpaddd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm2
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <32 x i16> %a0 to <32 x i32>
+  %x1 = zext <32 x i16> %a1 to <32 x i32>
+  %sum = add <32 x i32> %x0, %x1
+  %shift = lshr <32 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <32 x i32> %shift to <32 x i16>
+  ret <32 x i16> %res
+}
+
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE-LABEL: test_fixed_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrld $1, %xmm3
+; SSE-NEXT:    paddd %xmm8, %xmm3
+; SSE-NEXT:    psrld $1, %xmm2
+; SSE-NEXT:    paddd %xmm9, %xmm2
+; SSE-NEXT:    psrld $1, %xmm1
+; SSE-NEXT:    paddd %xmm10, %xmm1
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i32> %a0, %a1
+  %xor = xor <16 x i32> %a1, %a0
+  %shift = lshr <16 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = add <16 x i32> %and, %shift
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE2-LABEL: test_ext_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    movdqa %xmm3, %xmm10
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm11
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm12
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm8[2],xmm12[3],xmm8[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm13
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSE2-NEXT:    movdqa %xmm7, %xmm9
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; SSE2-NEXT:    paddq %xmm10, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; SSE2-NEXT:    paddq %xmm11, %xmm7
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; SSE2-NEXT:    paddq %xmm12, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
+; SSE2-NEXT:    paddq %xmm13, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm9
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm9
+; SSE4-NEXT:    movdqa %xmm2, %xmm10
+; SSE4-NEXT:    movdqa %xmm1, %xmm11
+; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    pxor %xmm13, %xmm13
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm14 = xmm2[0],zero,xmm2[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm13[2],xmm10[3],xmm13[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm15 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm12 = xmm8[0],zero,xmm8[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm13[2],xmm7[3],xmm13[3]
+; SSE4-NEXT:    paddq %xmm9, %xmm7
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm13[2],xmm6[3],xmm13[3]
+; SSE4-NEXT:    paddq %xmm10, %xmm6
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; SSE4-NEXT:    paddq %xmm11, %xmm5
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
+; SSE4-NEXT:    paddq %xmm8, %xmm4
+; SSE4-NEXT:    paddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE4-NEXT:    paddq %xmm14, %xmm2
+; SSE4-NEXT:    paddq %xmm15, %xmm1
+; SSE4-NEXT:    paddq %xmm12, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm7
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
+; AVX1-NEXT:    vpaddq %xmm6, %xmm12, %xmm6
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT:    vpaddq %xmm7, %xmm12, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm12[2],xmm5[2],xmm12[3],xmm5[3]
+; AVX1-NEXT:    vpaddq %xmm13, %xmm9, %xmm9
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX1-NEXT:    vpaddq %xmm5, %xmm10, %xmm5
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm10 = xmm11[0],zero,xmm11[1],zero
+; AVX1-NEXT:    vpaddq %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero
+; AVX1-NEXT:    vpaddq %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm6, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm9, %xmm7
+; AVX1-NEXT:    vpsrlq $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX2-NEXT:    vpaddq %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX2-NEXT:    vpaddq %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm5[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],ymm4[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i32> %a0 to <16 x i64>
+  %x1 = zext <16 x i32> %a1 to <16 x i64>
+  %sum = add <16 x i64> %x0, %x1
+  %shift = lshr <16 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <16 x i64> %shift to <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE-LABEL: test_fixed_v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrlq $1, %xmm3
+; SSE-NEXT:    paddq %xmm8, %xmm3
+; SSE-NEXT:    psrlq $1, %xmm2
+; SSE-NEXT:    paddq %xmm9, %xmm2
+; SSE-NEXT:    psrlq $1, %xmm1
+; SSE-NEXT:    paddq %xmm10, %xmm1
+; SSE-NEXT:    psrlq $1, %xmm0
+; SSE-NEXT:    paddq %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <8 x i64> %a0, %a1
+  %xor = xor <8 x i64> %a1, %a0
+  %shift = lshr <8 x i64> %xor, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = add <8 x i64> %and, %shift
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_ext_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %rbx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %r12
+; SSE2-NEXT:    movq %xmm2, %rbp
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %r13
+; SSE2-NEXT:    movq %xmm1, %r15
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r14
+; SSE2-NEXT:    movq %xmm0, %r11
+; SSE2-NEXT:    movq %xmm7, %r10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r9
+; SSE2-NEXT:    movq %xmm6, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    movq %xmm5, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %xmm4, %rax
+; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    addq %r11, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    xorl %r11d, %r11d
+; SSE2-NEXT:    addq %r14, %rdx
+; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    xorl %r14d, %r14d
+; SSE2-NEXT:    addq %r15, %rsi
+; SSE2-NEXT:    setb %r14b
+; SSE2-NEXT:    xorl %r15d, %r15d
+; SSE2-NEXT:    addq %r13, %rdi
+; SSE2-NEXT:    setb %r15b
+; SSE2-NEXT:    xorl %r13d, %r13d
+; SSE2-NEXT:    addq %rbp, %r8
+; SSE2-NEXT:    setb %r13b
+; SSE2-NEXT:    xorl %ebp, %ebp
+; SSE2-NEXT:    addq %r12, %r9
+; SSE2-NEXT:    setb %bpl
+; SSE2-NEXT:    xorl %r12d, %r12d
+; SSE2-NEXT:    addq %rbx, %r10
+; SSE2-NEXT:    movq %xmm8, %rdx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    setb %r12b
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorl %ebx, %ebx
+; SSE2-NEXT:    addq %rdx, %rax
+; SSE2-NEXT:    setb %bl
+; SSE2-NEXT:    shldq $63, %rax, %rbx
+; SSE2-NEXT:    shldq $63, %r10, %r12
+; SSE2-NEXT:    shldq $63, %r9, %rbp
+; SSE2-NEXT:    shldq $63, %r8, %r13
+; SSE2-NEXT:    shldq $63, %rdi, %r15
+; SSE2-NEXT:    shldq $63, %rsi, %r14
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rax, %r11
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rax, %rcx
+; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    movq %r11, %xmm4
+; SSE2-NEXT:    movq %r14, %xmm1
+; SSE2-NEXT:    movq %r15, %xmm5
+; SSE2-NEXT:    movq %r13, %xmm2
+; SSE2-NEXT:    movq %rbp, %xmm6
+; SSE2-NEXT:    movq %r12, %xmm3
+; SSE2-NEXT:    movq %rbx, %xmm7
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    pextrq $1, %xmm3, %r14
+; SSE4-NEXT:    movq %xmm2, %r13
+; SSE4-NEXT:    pextrq $1, %xmm2, %rbp
+; SSE4-NEXT:    movq %xmm1, %r12
+; SSE4-NEXT:    pextrq $1, %xmm1, %r15
+; SSE4-NEXT:    movq %xmm0, %rbx
+; SSE4-NEXT:    pextrq $1, %xmm0, %r11
+; SSE4-NEXT:    pextrq $1, %xmm7, %r10
+; SSE4-NEXT:    movq %xmm6, %r9
+; SSE4-NEXT:    pextrq $1, %xmm6, %r8
+; SSE4-NEXT:    movq %xmm5, %rdi
+; SSE4-NEXT:    pextrq $1, %xmm5, %rsi
+; SSE4-NEXT:    movq %xmm4, %rdx
+; SSE4-NEXT:    pextrq $1, %xmm4, %rax
+; SSE4-NEXT:    xorl %ecx, %ecx
+; SSE4-NEXT:    addq %r11, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    setb %cl
+; SSE4-NEXT:    xorl %r11d, %r11d
+; SSE4-NEXT:    addq %rbx, %rdx
+; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    xorl %ebx, %ebx
+; SSE4-NEXT:    addq %r15, %rsi
+; SSE4-NEXT:    setb %bl
+; SSE4-NEXT:    xorl %r15d, %r15d
+; SSE4-NEXT:    addq %r12, %rdi
+; SSE4-NEXT:    setb %r15b
+; SSE4-NEXT:    xorl %r12d, %r12d
+; SSE4-NEXT:    addq %rbp, %r8
+; SSE4-NEXT:    setb %r12b
+; SSE4-NEXT:    xorl %ebp, %ebp
+; SSE4-NEXT:    addq %r13, %r9
+; SSE4-NEXT:    setb %bpl
+; SSE4-NEXT:    xorl %r13d, %r13d
+; SSE4-NEXT:    addq %r14, %r10
+; SSE4-NEXT:    movq %xmm3, %rdx
+; SSE4-NEXT:    setb %r13b
+; SSE4-NEXT:    movq %xmm7, %rax
+; SSE4-NEXT:    xorl %r14d, %r14d
+; SSE4-NEXT:    addq %rdx, %rax
+; SSE4-NEXT:    setb %r14b
+; SSE4-NEXT:    shldq $63, %rax, %r14
+; SSE4-NEXT:    shldq $63, %r10, %r13
+; SSE4-NEXT:    shldq $63, %r9, %rbp
+; SSE4-NEXT:    shldq $63, %r8, %r12
+; SSE4-NEXT:    shldq $63, %rdi, %r15
+; SSE4-NEXT:    shldq $63, %rsi, %rbx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rax, %r11
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rax, %rcx
+; SSE4-NEXT:    movq %rcx, %xmm4
+; SSE4-NEXT:    movq %r11, %xmm0
+; SSE4-NEXT:    movq %rbx, %xmm5
+; SSE4-NEXT:    movq %r15, %xmm1
+; SSE4-NEXT:    movq %r12, %xmm6
+; SSE4-NEXT:    movq %rbp, %xmm2
+; SSE4-NEXT:    movq %r13, %xmm7
+; SSE4-NEXT:    movq %r14, %xmm3
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovq %xmm4, %r15
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rbp
+; AVX1-NEXT:    vmovq %xmm0, %r13
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r14
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX1-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r9
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r8
+; AVX1-NEXT:    vmovq %xmm2, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm2, %rsi
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    addq %r11, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    setb %cl
+; AVX1-NEXT:    xorl %r11d, %r11d
+; AVX1-NEXT:    addq %r14, %rdx
+; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    setb %r11b
+; AVX1-NEXT:    xorl %r14d, %r14d
+; AVX1-NEXT:    addq %r12, %rsi
+; AVX1-NEXT:    setb %r14b
+; AVX1-NEXT:    xorl %r12d, %r12d
+; AVX1-NEXT:    addq %r13, %rdi
+; AVX1-NEXT:    setb %r12b
+; AVX1-NEXT:    xorl %r13d, %r13d
+; AVX1-NEXT:    addq %rbp, %r8
+; AVX1-NEXT:    setb %r13b
+; AVX1-NEXT:    xorl %ebp, %ebp
+; AVX1-NEXT:    addq %r15, %r9
+; AVX1-NEXT:    setb %bpl
+; AVX1-NEXT:    xorl %r15d, %r15d
+; AVX1-NEXT:    addq %rbx, %r10
+; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    setb %r15b
+; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    xorl %ebx, %ebx
+; AVX1-NEXT:    addq %rdx, %rax
+; AVX1-NEXT:    setb %bl
+; AVX1-NEXT:    shldq $63, %rax, %rbx
+; AVX1-NEXT:    shldq $63, %r10, %r15
+; AVX1-NEXT:    shldq $63, %r9, %rbp
+; AVX1-NEXT:    shldq $63, %r8, %r13
+; AVX1-NEXT:    shldq $63, %rdi, %r12
+; AVX1-NEXT:    shldq $63, %rsi, %r14
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %r11
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovq %r11, %xmm1
+; AVX1-NEXT:    vmovq %r14, %xmm2
+; AVX1-NEXT:    vmovq %r12, %xmm3
+; AVX1-NEXT:    vmovq %r13, %xmm4
+; AVX1-NEXT:    vmovq %rbp, %xmm5
+; AVX1-NEXT:    vmovq %r15, %xmm6
+; AVX1-NEXT:    vmovq %rbx, %xmm7
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vmovq %xmm4, %r15
+; AVX2-NEXT:    vpextrq $1, %xmm4, %rbp
+; AVX2-NEXT:    vmovq %xmm0, %r13
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r14
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r9
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r8
+; AVX2-NEXT:    vmovq %xmm2, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rsi
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    addq %r11, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    setb %cl
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    addq %r14, %rdx
+; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    setb %r11b
+; AVX2-NEXT:    xorl %r14d, %r14d
+; AVX2-NEXT:    addq %r12, %rsi
+; AVX2-NEXT:    setb %r14b
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    addq %r13, %rdi
+; AVX2-NEXT:    setb %r12b
+; AVX2-NEXT:    xorl %r13d, %r13d
+; AVX2-NEXT:    addq %rbp, %r8
+; AVX2-NEXT:    setb %r13b
+; AVX2-NEXT:    xorl %ebp, %ebp
+; AVX2-NEXT:    addq %r15, %r9
+; AVX2-NEXT:    setb %bpl
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    addq %rbx, %r10
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    setb %r15b
+; AVX2-NEXT:    vmovq %xmm3, %rax
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    addq %rdx, %rax
+; AVX2-NEXT:    setb %bl
+; AVX2-NEXT:    shldq $63, %rax, %rbx
+; AVX2-NEXT:    shldq $63, %r10, %r15
+; AVX2-NEXT:    shldq $63, %r9, %rbp
+; AVX2-NEXT:    shldq $63, %r8, %r13
+; AVX2-NEXT:    shldq $63, %rdi, %r12
+; AVX2-NEXT:    shldq $63, %rsi, %r14
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rax, %r11
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rax, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vmovq %r11, %xmm1
+; AVX2-NEXT:    vmovq %r14, %xmm2
+; AVX2-NEXT:    vmovq %r12, %xmm3
+; AVX2-NEXT:    vmovq %r13, %xmm4
+; AVX2-NEXT:    vmovq %rbp, %xmm5
+; AVX2-NEXT:    vmovq %r15, %xmm6
+; AVX2-NEXT:    vmovq %rbx, %xmm7
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r13
+; AVX512-NEXT:    vmovq %xmm2, %r15
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vmovq %xmm2, %rbp
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r12
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %r14
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rbx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %r8
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX512-NEXT:    vmovq %xmm2, %r11
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    xorl %esi, %esi
+; AVX512-NEXT:    addq %rbx, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    setb %sil
+; AVX512-NEXT:    xorl %ebx, %ebx
+; AVX512-NEXT:    addq %r14, %rdi
+; AVX512-NEXT:    setb %bl
+; AVX512-NEXT:    xorl %r14d, %r14d
+; AVX512-NEXT:    addq %r12, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    setb %r14b
+; AVX512-NEXT:    xorl %r12d, %r12d
+; AVX512-NEXT:    addq %rbp, %r11
+; AVX512-NEXT:    setb %r12b
+; AVX512-NEXT:    xorl %ebp, %ebp
+; AVX512-NEXT:    addq %r13, %rdx
+; AVX512-NEXT:    setb %bpl
+; AVX512-NEXT:    xorl %r13d, %r13d
+; AVX512-NEXT:    addq %r15, %r8
+; AVX512-NEXT:    setb %r13b
+; AVX512-NEXT:    xorl %r15d, %r15d
+; AVX512-NEXT:    addq %r10, %r9
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    setb %r15b
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    xorl %r10d, %r10d
+; AVX512-NEXT:    addq %rcx, %rax
+; AVX512-NEXT:    setb %r10b
+; AVX512-NEXT:    shldq $63, %rax, %r10
+; AVX512-NEXT:    shldq $63, %r9, %r15
+; AVX512-NEXT:    shldq $63, %r8, %r13
+; AVX512-NEXT:    shldq $63, %rdx, %rbp
+; AVX512-NEXT:    shldq $63, %r11, %r12
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rax, %r14
+; AVX512-NEXT:    shldq $63, %rdi, %rbx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rax, %rsi
+; AVX512-NEXT:    vmovq %rsi, %xmm0
+; AVX512-NEXT:    vmovq %rbx, %xmm1
+; AVX512-NEXT:    vmovq %r14, %xmm2
+; AVX512-NEXT:    vmovq %r12, %xmm3
+; AVX512-NEXT:    vmovq %rbp, %xmm4
+; AVX512-NEXT:    vmovq %r13, %xmm5
+; AVX512-NEXT:    vmovq %r15, %xmm6
+; AVX512-NEXT:    vmovq %r10, %xmm7
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i64> %a0 to <8 x i128>
+  %x1 = zext <8 x i64> %a1 to <8 x i128>
+  %sum = add <8 x i128> %x0, %x1
+  %shift = lshr <8 x i128> %sum, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <8 x i128> %shift to <8 x i64>
+  ret <8 x i64> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
index 34ef23db34575..234c7a0a500d3 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -553,8 +553,8 @@ define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32
 ; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm2, %ymm3, %ymm1
-; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -571,8 +571,8 @@ define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:  .LBB7_3:
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm1
-; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovmskps %ymm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll
index 374e75967d52f..a8c068fc5b865 100644
--- a/llvm/test/CodeGen/X86/cmov.ll
+++ b/llvm/test/CodeGen/X86/cmov.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch -x86-cmov-converter=false | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch -x86-cmov-converter=false -mattr=+ndd --show-mc-encoding | FileCheck %s --check-prefix=NDD
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 define i32 @test1(i32 %x, i32 %n, i32 %w, ptr %vp) nounwind readnone {
@@ -9,6 +10,13 @@ define i32 @test1(i32 %x, i32 %n, i32 %w, ptr %vp) nounwind readnone {
 ; CHECK-NEXT:    movl $12, %eax
 ; CHECK-NEXT:    cmovael (%rcx), %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test1:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    btl %esi, %edi # encoding: [0x0f,0xa3,0xf7]
+; NDD-NEXT:    movl $12, %eax # encoding: [0xb8,0x0c,0x00,0x00,0x00]
+; NDD-NEXT:    cmovael (%rcx), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0x01]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
 	%0 = lshr i32 %x, %n
 	%1 = and i32 %0, 1
@@ -25,6 +33,13 @@ define i32 @test2(i32 %x, i32 %n, i32 %w, ptr %vp) nounwind readnone {
 ; CHECK-NEXT:    movl $12, %eax
 ; CHECK-NEXT:    cmovbl (%rcx), %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test2:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    btl %esi, %edi # encoding: [0x0f,0xa3,0xf7]
+; NDD-NEXT:    movl $12, %eax # encoding: [0xb8,0x0c,0x00,0x00,0x00]
+; NDD-NEXT:    cmovbl (%rcx), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0x01]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
 	%0 = lshr i32 %x, %n
 	%1 = and i32 %0, 1
@@ -50,6 +65,16 @@ define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
 ; CHECK-NEXT:    callq bar@PLT
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test3:
+; NDD:       # %bb.0:
+; NDD-NEXT:    pushq %rax # encoding: [0x50]
+; NDD-NEXT:    testb $1, %dl # encoding: [0xf6,0xc2,0x01]
+; NDD-NEXT:    cmovel %esi, %edi # EVEX TO LEGACY Compression encoding: [0x0f,0x44,0xfe]
+; NDD-NEXT:    callq bar@PLT # encoding: [0xe8,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: bar@PLT-4, kind: FK_PCRel_4
+; NDD-NEXT:    popq %rax # encoding: [0x58]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %c = trunc i64 %a to i32
   %d = trunc i64 %b to i32
   %e = select i1 %p, i32 %c, i32 %d
@@ -114,6 +139,54 @@ define i1 @test4() nounwind {
 ; CHECK-NEXT:    movl %ebx, %eax
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test4:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    movsbl g_3(%rip), %eax # encoding: [0x0f,0xbe,0x05,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 3, value: g_3-4, kind: reloc_riprel_4byte
+; NDD-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
+; NDD-NEXT:    shrl $7, %ecx # EVEX TO LEGACY Compression encoding: [0xc1,0xe9,0x07]
+; NDD-NEXT:    xorb $1, %cl # EVEX TO LEGACY Compression encoding: [0x80,0xf1,0x01]
+; NDD-NEXT:    sarl %cl, %eax, %ecx # encoding: [0x62,0xf4,0x74,0x18,0xd3,0xf8]
+; NDD-NEXT:    movzbl g_96(%rip), %eax # encoding: [0x0f,0xb6,0x05,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 3, value: g_96-4, kind: reloc_riprel_4byte
+; NDD-NEXT:    testb %al, %al # encoding: [0x84,0xc0]
+; NDD-NEXT:    je .LBB3_2 # encoding: [0x74,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: .LBB3_2-1, kind: FK_PCRel_1
+; NDD-NEXT:  # %bb.1: # %bb.i.i.i
+; NDD-NEXT:    movzbl g_100(%rip), %edx # encoding: [0x0f,0xb6,0x15,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 3, value: g_100-4, kind: reloc_riprel_4byte
+; NDD-NEXT:  .LBB3_2: # %func_4.exit.i
+; NDD-NEXT:    pushq %rbx # encoding: [0x53]
+; NDD-NEXT:    xorl %edx, %edx # encoding: [0x31,0xd2]
+; NDD-NEXT:    testb %cl, %cl # encoding: [0x84,0xc9]
+; NDD-NEXT:    setne %bl # encoding: [0x0f,0x95,0xc3]
+; NDD-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
+; NDD-NEXT:    cmovnel %edx, %ecx # EVEX TO LEGACY Compression encoding: [0x0f,0x45,0xca]
+; NDD-NEXT:    testb %al, %al # encoding: [0x84,0xc0]
+; NDD-NEXT:    je .LBB3_5 # encoding: [0x74,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: .LBB3_5-1, kind: FK_PCRel_1
+; NDD-NEXT:  # %bb.3: # %func_4.exit.i
+; NDD-NEXT:    testb %bl, %bl # encoding: [0x84,0xdb]
+; NDD-NEXT:    jne .LBB3_5 # encoding: [0x75,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: .LBB3_5-1, kind: FK_PCRel_1
+; NDD-NEXT:  # %bb.4: # %bb.i.i
+; NDD-NEXT:    movzbl g_100(%rip), %ecx # encoding: [0x0f,0xb6,0x0d,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 3, value: g_100-4, kind: reloc_riprel_4byte
+; NDD-NEXT:    xorl %ebx, %ebx # encoding: [0x31,0xdb]
+; NDD-NEXT:    movl %eax, %ecx # encoding: [0x89,0xc1]
+; NDD-NEXT:  .LBB3_5: # %func_1.exit
+; NDD-NEXT:    movb %cl, g_96(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 2, value: g_96-4, kind: reloc_riprel_4byte
+; NDD-NEXT:    movzbl %cl, %esi # encoding: [0x0f,0xb6,0xf1]
+; NDD-NEXT:    movl $_2E_str, %edi # encoding: [0xbf,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: _2E_str, kind: FK_Data_4
+; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NDD-NEXT:    callq printf@PLT # encoding: [0xe8,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: printf@PLT-4, kind: FK_PCRel_4
+; NDD-NEXT:    movl %ebx, %eax # encoding: [0x89,0xd8]
+; NDD-NEXT:    popq %rbx # encoding: [0x5b]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
   %0 = load i8, ptr @g_3, align 1
   %1 = sext i8 %0 to i32
@@ -163,6 +236,14 @@ define i32 @test5(ptr nocapture %P) nounwind readonly {
 ; CHECK-NEXT:    setge %al
 ; CHECK-NEXT:    orl $-2, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test5:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NDD-NEXT:    cmpl $42, (%rdi) # encoding: [0x83,0x3f,0x2a]
+; NDD-NEXT:    setge %al # encoding: [0x0f,0x9d,0xc0]
+; NDD-NEXT:    orl $-2, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0xfe]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
 	%0 = load i32, ptr %P, align 4
 	%1 = icmp sgt i32 %0, 41
@@ -178,6 +259,14 @@ define i32 @test6(ptr nocapture %P) nounwind readonly {
 ; CHECK-NEXT:    setl %al
 ; CHECK-NEXT:    leal 4(%rax,%rax,8), %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test6:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NDD-NEXT:    cmpl $42, (%rdi) # encoding: [0x83,0x3f,0x2a]
+; NDD-NEXT:    setl %al # encoding: [0x0f,0x9c,0xc0]
+; NDD-NEXT:    leal 4(%rax,%rax,8), %eax # encoding: [0x8d,0x44,0xc0,0x04]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
 	%0 = load i32, ptr %P, align 4
 	%1 = icmp sgt i32 %0, 41
@@ -194,6 +283,13 @@ define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind {
 ; CHECK-NEXT:    cmovel %edx, %eax
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test7:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testb $1, %dil # encoding: [0x40,0xf6,0xc7,0x01]
+; NDD-NEXT:    cmovnel %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x45,0xd6]
+; NDD-NEXT:    # kill: def $al killed $al killed $eax
+; NDD-NEXT:    retq # encoding: [0xc3]
   %d = select i1 %c, i8 %a, i8 %b
   ret i8 %d
 }
@@ -205,6 +301,13 @@ define i64 @test8(i64 %0, i64 %1, i64 %2) {
 ; CHECK-NEXT:    cmpq $-2147483648, %rdi # imm = 0x80000000
 ; CHECK-NEXT:    cmovlq %rdx, %rax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test8:
+; NDD:       # %bb.0:
+; NDD-NEXT:    cmpq $-2147483648, %rdi # encoding: [0x48,0x81,0xff,0x00,0x00,0x00,0x80]
+; NDD-NEXT:    # imm = 0x80000000
+; NDD-NEXT:    cmovgeq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x4d,0xd6]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %4 = icmp sgt i64 %0, -2147483649
   %5 = select i1 %4, i64 %1, i64 %2
   ret i64 %5
@@ -218,6 +321,14 @@ define i32 @smin(i32 %x) {
 ; CHECK-NEXT:    movl $-1, %eax
 ; CHECK-NEXT:    cmovnsl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: smin:
+; NDD:       # %bb.0:
+; NDD-NEXT:    notl %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd7]
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NDD-NEXT:    cmovsl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x48,0xc1]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %not_x = xor i32 %x, -1
   %1 = icmp slt i32 %not_x, -1
   %sel = select i1 %1, i32 %not_x, i32 -1
@@ -231,6 +342,13 @@ define i32 @pr47049_1(i32 %0) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    cmovlel %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: pr47049_1:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; NDD-NEXT:    cmovlel %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x4e,0xc7]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %2 = icmp slt i32 %0, 1
   %3 = select i1 %2, i32 %0, i32 1
   ret i32 %3
@@ -243,6 +361,13 @@ define i32 @pr47049_2(i32 %0) {
 ; CHECK-NEXT:    movl $-1, %eax
 ; CHECK-NEXT:    cmovnsl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: pr47049_2:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
+; NDD-NEXT:    cmovnsl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x49,0xc7]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %2 = icmp sgt i32 %0, -1
   %3 = select i1 %2, i32 %0, i32 -1
   ret i32 %3
@@ -255,6 +380,13 @@ define i32 @pr47049_3(i32 %0) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    cmovgl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: pr47049_3:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; NDD-NEXT:    cmovgl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x4f,0xc7]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %2 = icmp sgt i32 %0, 1
   %3 = select i1 %2, i32 %0, i32 1
   ret i32 %3
@@ -267,6 +399,13 @@ define i32 @pr47049_4(i32 %0) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    cmovnel %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: pr47049_4:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; NDD-NEXT:    cmovnel %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x45,0xc7]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %2 = icmp ugt i32 %0, 1
   %3 = select i1 %2, i32 %0, i32 1
   ret i32 %3
diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll
index cd1953bec774d..30e52f0630759 100644
--- a/llvm/test/CodeGen/X86/cmp.ll
+++ b/llvm/test/CodeGen/X86/cmp.ll
@@ -416,9 +416,8 @@ define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
 ;
 ; NDD-LABEL: test13:
 ; NDD:       # %bb.0:
-; NDD-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
 ; NDD-NEXT:    testb $8, %dil # encoding: [0x40,0xf6,0xc7,0x08]
-; NDD-NEXT:    cmovnel %edx, %eax # encoding: [0x0f,0x45,0xc2]
+; NDD-NEXT:    cmovnel %edx, %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x45,0xf2]
 ; NDD-NEXT:    retq # encoding: [0xc3]
   %and = and i32 %mask, 8
   %tobool = icmp ne i32 %and, 0
@@ -436,9 +435,8 @@ define i32 @test14(i32 %mask, i32 %base, i32 %intra) {
 ;
 ; NDD-LABEL: test14:
 ; NDD:       # %bb.0:
-; NDD-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
-; NDD-NEXT:    shrl $7, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0xc1,0xef,0x07]
-; NDD-NEXT:    cmovnsl %edx, %eax # encoding: [0x0f,0x49,0xc2]
+; NDD-NEXT:    shrl $7, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xef,0x07]
+; NDD-NEXT:    cmovnsl %edx, %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x49,0xf2]
 ; NDD-NEXT:    retq # encoding: [0xc3]
   %s = lshr i32 %mask, 7
   %tobool = icmp sgt i32 %s, -1
@@ -1100,9 +1098,8 @@ define { i64, i64 } @pr39968(i64, i64, i32) {
 ; NDD:       # %bb.0:
 ; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; NDD-NEXT:    testb $64, %dl # encoding: [0xf6,0xc2,0x40]
-; NDD-NEXT:    cmovneq %rdi, %rsi # encoding: [0x48,0x0f,0x45,0xf7]
-; NDD-NEXT:    cmovneq %rdi, %rax # encoding: [0x48,0x0f,0x45,0xc7]
-; NDD-NEXT:    movq %rsi, %rdx # encoding: [0x48,0x89,0xf2]
+; NDD-NEXT:    cmovneq %rdi, %rsi, %rdx # encoding: [0x62,0xf4,0xec,0x18,0x45,0xf7]
+; NDD-NEXT:    cmovneq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x45,0xc7]
 ; NDD-NEXT:    retq # encoding: [0xc3]
   %4 = and i32 %2, 64
   %5 = icmp ne i32 %4, 0
diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll
index 0743592f3ca11..9bb7fec7eeacb 100644
--- a/llvm/test/CodeGen/X86/combine-pavg.ll
+++ b/llvm/test/CodeGen/X86/combine-pavg.ll
@@ -18,6 +18,22 @@ define <16 x i8> @combine_pavgb_self(<16 x i8> %a0) {
   ret <16 x i8> %1
 }
 
+define <16 x i8> @combine_pavgb_zero(<16 x i8> %a0) {
+; SSE-LABEL: combine_pavgb_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pavgb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pavgb_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> zeroinitializer, <16 x i8> %a0)
+  ret <16 x i8> %1
+}
+
 define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
 ; SSE-LABEL: combine_pavgw_knownbits:
 ; SSE:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 0675ced68d7a7..7eee418742ddb 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -521,3 +521,276 @@ define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
   %2 = ashr <4 x i32> %1, <i32 10, i32 10, i32 10, i32 10>
   ret <4 x i32> %2
 }
+
+define <8 x i16> @combine_vec8i16_ashr_clamped(<8 x i16> %x, <8 x i16> %y) {
+; SSE2-LABEL: combine_vec8i16_ashr_clamped:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    psubw %xmm2, %xmm1
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psraw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec8i16_ashr_clamped:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $8, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $4, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $2, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $1, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: combine_vec8i16_ashr_clamped:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec8i16_ashr_clamped:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %1 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %y, <8 x i16> <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>)
+  %2 = ashr <8 x i16> %x, %1
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @combine_vec4i32_ashr_clamped(<4 x i32> %x, <4 x i32> %y) {
+; SSE2-LABEL: combine_vec4i32_ashr_clamped:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    psrld $27, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad %xmm1, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad %xmm4, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrad %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrad %xmm2, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec4i32_ashr_clamped:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrad %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_vec4i32_ashr_clamped:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %y, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
+  %2 = ashr <4 x i32> %x, %1
+  ret <4 x i32> %2
+}
+
+define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) {
+; SSE2-LABEL: combine_vec4i64_ashr_clamped:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483711,2147483711,2147483711,2147483711]
+; SSE2-NEXT:    movdqa %xmm7, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [63,63]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrlq %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    psrlq %xmm6, %xmm7
+; SSE2-NEXT:    movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrlq %xmm3, %xmm5
+; SSE2-NEXT:    psrlq %xmm6, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE2-NEXT:    xorpd %xmm7, %xmm0
+; SSE2-NEXT:    psubq %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT:    psrlq %xmm5, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm3
+; SSE2-NEXT:    psrlq %xmm5, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; SSE2-NEXT:    xorpd %xmm2, %xmm1
+; SSE2-NEXT:    psubq %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec4i64_ashr_clamped:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519]
+; SSE41-NEXT:    movdqa %xmm8, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711]
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm9 = [63,63]
+; SSE41-NEXT:    movapd %xmm9, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm2, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm8
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pand %xmm8, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm9, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrlq %xmm3, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm4, %xmm2
+; SSE41-NEXT:    psrlq %xmm9, %xmm2
+; SSE41-NEXT:    psrlq %xmm3, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    psubq %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm6, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE41-NEXT:    psrlq %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlq %xmm6, %xmm2
+; SSE41-NEXT:    psrlq %xmm3, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    psubq %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: combine_vec4i64_ashr_clamped:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [9223372036854775870,9223372036854775870,9223372036854775870,9223372036854775870]
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [63,63,63,63]
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec4i64_ashr_clamped:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsravq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %1 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %y, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
+  %2 = ashr <4 x i64> %x, %1
+  ret <4 x i64> %2
+}
diff --git a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
index cbb5bd09c2399..a332b3e890800 100644
--- a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -164,14 +164,13 @@ define <4 x float> @demandedbits_sitofp_blendvps(<4 x float> %a0, <4 x float> %a
 ; SSE-LABEL: demandedbits_sitofp_blendvps:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    cvtdq2ps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm2, %xmm0
 ; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
 ; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: demandedbits_sitofp_blendvps:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps %xmm2, %xmm2
 ; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %cvt = sitofp <4 x i32> %a2 to <4 x float>
diff --git a/llvm/test/CodeGen/X86/fast-regalloc-live-out-debug-values.mir b/llvm/test/CodeGen/X86/fast-regalloc-live-out-debug-values.mir
index 56cbe3f7b5638..37a90a2f16d67 100644
--- a/llvm/test/CodeGen/X86/fast-regalloc-live-out-debug-values.mir
+++ b/llvm/test/CodeGen/X86/fast-regalloc-live-out-debug-values.mir
@@ -119,6 +119,7 @@
 name:            foo
 tracksRegLiveness: true
 frameInfo:
+  adjustsStack:    true
   hasCalls:        true
 stack:
   - { id: 0, name: a.addr, size: 4, alignment: 4, debug-info-variable: '!11',
diff --git a/llvm/test/CodeGen/X86/heap-alloc-markers.mir b/llvm/test/CodeGen/X86/heap-alloc-markers.mir
index 0bf83657cb06c..6e0dc50bac0e1 100644
--- a/llvm/test/CodeGen/X86/heap-alloc-markers.mir
+++ b/llvm/test/CodeGen/X86/heap-alloc-markers.mir
@@ -34,6 +34,7 @@ name: test
 # CHECK-LABEL: {{^}}test:
 tracksRegLiveness: true
 frameInfo:
+  adjustsStack:    true
   hasCalls: true
 body: |
   bb.0.entry:
diff --git a/llvm/test/CodeGen/X86/instr-symbols.mir b/llvm/test/CodeGen/X86/instr-symbols.mir
index a900288d70869..7af6ca8181012 100644
--- a/llvm/test/CodeGen/X86/instr-symbols.mir
+++ b/llvm/test/CodeGen/X86/instr-symbols.mir
@@ -23,6 +23,7 @@ name: test
 # CHECK-LABEL: {{^}}test:
 tracksRegLiveness: true
 frameInfo:
+  adjustsStack:    true
   hasCalls: true
 body: |
   bb.0.entry:
diff --git a/llvm/test/CodeGen/X86/int-to-fp-demanded.ll b/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
new file mode 100644
index 0000000000000..8652136ae5cd9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
@@ -0,0 +1,375 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+declare void @use.i1(i1)
+declare void @use.i32(i32)
+define i32 @sitofp_signbit_only(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_signbit_only:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_signbit_only:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    movmskps %xmm0, %eax
+; X64-NEXT:    shll $31, %eax
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @sitofp_signbit_only_okay_width(i16 %i_in) nounwind {
+; X86-LABEL: sitofp_signbit_only_okay_width:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    filds {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_signbit_only_okay_width:
+; X64:       # %bb.0:
+; X64-NEXT:    shll $16, %edi
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    movmskps %xmm0, %eax
+; X64-NEXT:    shll $31, %eax
+; X64-NEXT:    retq
+  %f = sitofp i16 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @sitofp_signbit_only_fail_bad_width1(i64 %i_in) nounwind {
+; X86-LABEL: sitofp_signbit_only_fail_bad_width1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl (%esp), %eax
+; X86-NEXT:    popl %ecx
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_signbit_only_fail_bad_width1:
+; X64:       # %bb.0:
+; X64-NEXT:    cvtsi2ss %rdi, %xmm0
+; X64-NEXT:    movmskps %xmm0, %eax
+; X64-NEXT:    shll $31, %eax
+; X64-NEXT:    retq
+  %f = sitofp i64 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define <2 x i16> @sitofp_signbit_only_fail_bad_width2(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_signbit_only_fail_bad_width2:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, (%esp)
+; X86-NEXT:    fildl (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    shrl $16, %edx
+; X86-NEXT:    andl $32768, %edx # imm = 0x8000
+; X86-NEXT:    movl $32768, %eax # imm = 0x8000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    # kill: def $dx killed $dx killed $edx
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_signbit_only_fail_bad_width2:
+; X64:       # %bb.0:
+; X64-NEXT:    cvtsi2ss %edi, %xmm0
+; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i2xi16 = bitcast float %f to <2 x i16>
+  %r = and <2 x i16> %i2xi16, <i16 32768, i16 32768>
+  ret <2 x i16> %r
+}
+
+define i32 @sitofp_many_bits_fail(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_many_bits_fail:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fildl (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $-2147483647, %eax # imm = 0x80000001
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_many_bits_fail:
+; X64:       # %bb.0:
+; X64-NEXT:    cvtsi2ss %edi, %xmm0
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $-2147483647, %eax # imm = 0x80000001
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483649
+  ret i32 %r
+}
+
+define i32 @sitofp_multiuse_fail(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_multiuse_fail:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildl {{[0-9]+}}(%esp)
+; X86-NEXT:    fsts {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll use.i32@PLT
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_multiuse_fail:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    cvtsi2ss %edi, %xmm0
+; X64-NEXT:    movd %xmm0, %ebx
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    callq use.i32@PLT
+; X64-NEXT:    andl $-2147483648, %ebx # imm = 0x80000000
+; X64-NEXT:    movl %ebx, %eax
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  call void @use.i32(i32 %i)
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @sitofp_multiuse_okay(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_multiuse_okay:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildl {{[0-9]+}}(%esp)
+; X86-NEXT:    fsts {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll use.i1@PLT
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_multiuse_okay:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    cvtsi2ss %edi, %xmm0
+; X64-NEXT:    movd %xmm0, %ebx
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    callq use.i1@PLT
+; X64-NEXT:    andl $-2147483648, %ebx # imm = 0x80000000
+; X64-NEXT:    movl %ebx, %eax
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %cmp = icmp slt i32 %i, 0
+  call void @use.i1(i32 %i)
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @uitofp_signbit_only(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_signbit_only:
+; X86:       # %bb.0:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_signbit_only:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @uitofp_signbit_only_okay_width(i16 %i_in) nounwind {
+; X86-LABEL: uitofp_signbit_only_okay_width:
+; X86:       # %bb.0:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_signbit_only_okay_width:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %f = uitofp i16 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @uitofp_signbit_only_okay_width1(i64 %i_in) nounwind {
+; X86-LABEL: uitofp_signbit_only_okay_width1:
+; X86:       # %bb.0:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_signbit_only_okay_width1:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %f = uitofp i64 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define <2 x i16> @uitofp_signbit_only_fail_bad_width2(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_signbit_only_fail_bad_width2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $32768, %eax # imm = 0x8000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_signbit_only_fail_bad_width2:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ss %rax, %xmm0
+; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i2xi16 = bitcast float %f to <2 x i16>
+  %r = and <2 x i16> %i2xi16, <i16 32768, i16 32768>
+  ret <2 x i16> %r
+}
+
+define i32 @uitofp_many_bits_fail(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_many_bits_fail:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_many_bits_fail:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ss %rax, %xmm0
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483649
+  ret i32 %r
+}
+
+define i32 @uitofp_multiuse_fail(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_multiuse_fail:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll use.i32@PLT
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_multiuse_fail:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ss %rax, %xmm0
+; X64-NEXT:    movd %xmm0, %edi
+; X64-NEXT:    callq use.i32@PLT
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  call void @use.i32(i32 %i)
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @uitofp_multiuse_okay(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_multiuse_okay:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll use.i1@PLT
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_multiuse_okay:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ss %rax, %xmm0
+; X64-NEXT:    movd %xmm0, %edi
+; X64-NEXT:    callq use.i1@PLT
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %cmp = icmp slt i32 %i, 0
+  call void @use.i1(i32 %i)
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/X86/isel-select-cmov.ll b/llvm/test/CodeGen/X86/isel-select-cmov.ll
index 0e5293c9000f0..39a20bf6637bb 100644
--- a/llvm/test/CodeGen/X86/isel-select-cmov.ll
+++ b/llvm/test/CodeGen/X86/isel-select-cmov.ll
@@ -13,6 +13,8 @@
 ; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs              | FileCheck %s --check-prefix=GISEL-X86
 ; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs -mattr=+cmov | FileCheck %s --check-prefix=GISEL-X86-CMOV
 
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs -mattr=+ndd    | FileCheck %s --check-prefix=NDD
+
 ; Test conditional move for the supported types (i16, i32, and i32) and
 ; conditon input (argument or cmp).
 ; When cmov is not available (i8 type or X86), the branch is expected.
@@ -114,6 +116,16 @@ define zeroext i8 @select_cmov_i8(i1 zeroext %cond, i8 zeroext %a, i8 zeroext %b
 ; GISEL-X86-CMOV-NEXT:    cmovnew %dx, %ax
 ; GISEL-X86-CMOV-NEXT:    ## kill: def $al killed $al killed $eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmov_i8:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    testb $1, %dil
+; NDD-NEXT:    jne LBB0_2
+; NDD-NEXT:  ## %bb.1:
+; NDD-NEXT:    movl %edx, %esi
+; NDD-NEXT:  LBB0_2:
+; NDD-NEXT:    movzbl %sil, %eax
+; NDD-NEXT:    retq
   %1 = select i1 %cond, i8 %a, i8 %b
   ret i8 %1
 }
@@ -207,6 +219,13 @@ define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroex
 ; GISEL-X86-CMOV-NEXT:    cmovnew %dx, %ax
 ; GISEL-X86-CMOV-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmov_i16:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    testb $1, %dil
+; NDD-NEXT:    cmovnew %si, %dx, %ax
+; NDD-NEXT:    movzwl %ax, %eax
+; NDD-NEXT:    retq
   %1 = select i1 %cond, i16 %a, i16 %b
   ret i16 %1
 }
@@ -305,6 +324,13 @@ define zeroext i16 @select_cmp_cmov_i16(i16 zeroext %a, i16 zeroext %b) {
 ; GISEL-X86-CMOV-NEXT:    cmovew %cx, %ax
 ; GISEL-X86-CMOV-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmp_cmov_i16:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    cmpw %si, %di
+; NDD-NEXT:    cmovbw %di, %si, %ax
+; NDD-NEXT:    movzwl %ax, %eax
+; NDD-NEXT:    retq
   %1 = icmp ult i16 %a, %b
   %2 = select i1 %1, i16 %a, i16 %b
   ret i16 %2
@@ -391,6 +417,12 @@ define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) {
 ; GISEL-X86-CMOV-NEXT:    testl %ecx, %ecx
 ; GISEL-X86-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmov_i32:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    testb $1, %dil
+; NDD-NEXT:    cmovnel %esi, %edx, %eax
+; NDD-NEXT:    retq
   %1 = select i1 %cond, i32 %a, i32 %b
   ret i32 %1
 }
@@ -482,6 +514,12 @@ define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
 ; GISEL-X86-CMOV-NEXT:    andl $1, %edx
 ; GISEL-X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmp_cmov_i32:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    cmpl %esi, %edi
+; NDD-NEXT:    cmovbl %edi, %esi, %eax
+; NDD-NEXT:    retq
   %1 = icmp ult i32 %a, %b
   %2 = select i1 %1, i32 %a, i32 %b
   ret i32 %2
@@ -584,6 +622,12 @@ define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) {
 ; GISEL-X86-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %edx
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmov_i64:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    testb $1, %dil
+; NDD-NEXT:    cmovneq %rsi, %rdx, %rax
+; NDD-NEXT:    retq
   %1 = select i1 %cond, i64 %a, i64 %b
   ret i64 %1
 }
@@ -754,6 +798,12 @@ define i64 @select_cmp_cmov_i64(i64 %a, i64 %b) nounwind {
 ; GISEL-X86-CMOV-NEXT:    popl %ebx
 ; GISEL-X86-CMOV-NEXT:    popl %ebp
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmp_cmov_i64:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    cmpq %rsi, %rdi
+; NDD-NEXT:    cmovbq %rdi, %rsi, %rax
+; NDD-NEXT:    retq
   %1 = icmp ult i64 %a, %b
   %2 = select i1 %1, i64 %a, i64 %b
   ret i64 %2
diff --git a/llvm/test/CodeGen/X86/pr85681.ll b/llvm/test/CodeGen/X86/pr85681.ll
new file mode 100644
index 0000000000000..3b27a0257ab43
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr85681.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=emeraldrapids | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s
+
+; PR85681 - shift i1/vXi1 X, Y -> X as only Y==0 is defined
+
+define i32 @shl(i32 %a0) {
+; CHECK-LABEL: shl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    retq
+  %v0 = bitcast i32 %a0 to <32 x i1>
+  %s = shl <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, %v0
+  %r = bitcast <32 x i1> %s to i32
+  ret i32 %r
+}
+
+define i32 @lshr(i32 %a0) {
+; CHECK-LABEL: lshr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    retq
+  %v0 = bitcast i32 %a0 to <32 x i1>
+  %s = lshr <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, %v0
+  %r = bitcast <32 x i1> %s to i32
+  ret i32 %r
+}
+
+define i32 @ashr(i32 %a0) {
+; CHECK-LABEL: ashr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    retq
+  %v0 = bitcast i32 %a0 to <32 x i1>
+  %s = ashr <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, %v0
+  %r = bitcast <32 x i1> %s to i32
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir b/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir
index 30a68e6c2efd2..4a18351bde493 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir
@@ -61,7 +61,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg.mir b/llvm/test/CodeGen/X86/statepoint-vreg.mir
index bfeadfc93da8f..a0c596f249931 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg.mir
+++ b/llvm/test/CodeGen/X86/statepoint-vreg.mir
@@ -134,6 +134,8 @@ registers:
 liveins:
   - { reg: '$rdi', virtual-reg: '%0' }
   - { reg: '$rsi', virtual-reg: '%1' }
+frameInfo:
+  adjustsStack:    true
 fixedStack:      []
 stack:           []
 callSites:       []
diff --git a/llvm/test/CodeGen/X86/tls-desc.ll b/llvm/test/CodeGen/X86/tls-desc.ll
new file mode 100644
index 0000000000000..c73986e69e791
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tls-desc.ll
@@ -0,0 +1,199 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i686 --relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 --relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64 --relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=X64
+
+@x = thread_local global i32 0, align 4
+@y = internal thread_local global i32 1, align 4
+@z = external hidden thread_local global i32, align 4
+
+define ptr @f1() nounwind {
+; X86-LABEL: f1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll .L0$pb
+; X86-NEXT:  .L0$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp0:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ebx
+; X86-NEXT:    #APP
+; X86-NEXT:    #NO_APP
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    leal x@tlsdesc(%ebx), %eax
+; X86-NEXT:    calll *x@tlscall(%eax)
+; X86-NEXT:    addl %gs:0, %eax
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    #APP
+; X86-NEXT:    #NO_APP
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X32-LABEL: f1:
+; X32:       # %bb.0:
+; X32-NEXT:    pushq %rax
+; X32-NEXT:    #APP
+; X32-NEXT:    #NO_APP
+; X32-NEXT:    leal x@tlsdesc(%rip), %eax
+; X32-NEXT:    callq *x@tlscall(%eax)
+; X32-NEXT:    # kill: def $eax killed $eax def $rax
+; X32-NEXT:    addl %fs:0, %eax
+; X32-NEXT:    #APP
+; X32-NEXT:    #NO_APP
+; X32-NEXT:    popq %rcx
+; X32-NEXT:    retq
+;
+; X64-LABEL: f1:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    #APP
+; X64-NEXT:    #NO_APP
+; X64-NEXT:    leaq x@tlsdesc(%rip), %rax
+; X64-NEXT:    callq *x@tlscall(%rax)
+; X64-NEXT:    addq %fs:0, %rax
+; X64-NEXT:    #APP
+; X64-NEXT:    #NO_APP
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %a = call { i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=r,=r,=r,=r,=r,=r,~{dirflag},~{fpsr},~{flags}"()
+  %b = call ptr @llvm.threadlocal.address.p0(ptr @x)
+  %a.0 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 0
+  %a.1 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 1
+  %a.2 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 2
+  %a.3 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 3
+  %a.4 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 4
+  %a.5 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 5
+  call void asm sideeffect "", "r,r,r,r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %a.0, i32 %a.1, i32 %a.2, i32 %a.3, i32 %a.4, i32 %a.5)
+  ret ptr %b
+}
+
+define i32 @f2() nounwind {
+; X86-LABEL: f2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    calll .L1$pb
+; X86-NEXT:  .L1$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp1:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L1$pb), %ebx
+; X86-NEXT:    movl %gs:0, %ecx
+; X86-NEXT:    leal x@tlsdesc(%ebx), %eax
+; X86-NEXT:    calll *x@tlscall(%eax)
+; X86-NEXT:    movl (%eax,%ecx), %eax
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X32-LABEL: f2:
+; X32:       # %bb.0:
+; X32-NEXT:    pushq %rax
+; X32-NEXT:    movl %fs:0, %ecx
+; X32-NEXT:    leal x@tlsdesc(%rip), %eax
+; X32-NEXT:    callq *x@tlscall(%eax)
+; X32-NEXT:    movl (%eax,%ecx), %eax
+; X32-NEXT:    popq %rcx
+; X32-NEXT:    retq
+;
+; X64-LABEL: f2:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %fs:0, %rcx
+; X64-NEXT:    leaq x@tlsdesc(%rip), %rax
+; X64-NEXT:    callq *x@tlscall(%rax)
+; X64-NEXT:    movl (%rax,%rcx), %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %1 = tail call ptr @llvm.threadlocal.address.p0(ptr @x)
+  %2 = load i32, ptr %1
+  ret i32 %2
+}
+
+define ptr @f3() nounwind {
+; X86-LABEL: f3:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    calll .L2$pb
+; X86-NEXT:  .L2$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp2:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.L2$pb), %ebx
+; X86-NEXT:    leal x@tlsdesc(%ebx), %eax
+; X86-NEXT:    calll *x@tlscall(%eax)
+; X86-NEXT:    addl %gs:0, %eax
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X32-LABEL: f3:
+; X32:       # %bb.0:
+; X32-NEXT:    pushq %rax
+; X32-NEXT:    leal x@tlsdesc(%rip), %eax
+; X32-NEXT:    callq *x@tlscall(%eax)
+; X32-NEXT:    # kill: def $eax killed $eax def $rax
+; X32-NEXT:    addl %fs:0, %eax
+; X32-NEXT:    popq %rcx
+; X32-NEXT:    retq
+;
+; X64-LABEL: f3:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    leaq x@tlsdesc(%rip), %rax
+; X64-NEXT:    callq *x@tlscall(%rax)
+; X64-NEXT:    addq %fs:0, %rax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %1 = tail call ptr @llvm.threadlocal.address.p0(ptr @x)
+  ret ptr %1
+}
+
+define i32 @f4() nounwind {
+; X86-LABEL: f4:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    calll .L3$pb
+; X86-NEXT:  .L3$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp3:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp3-.L3$pb), %ebx
+; X86-NEXT:    movl %gs:0, %edx
+; X86-NEXT:    leal _TLS_MODULE_BASE_@tlsdesc(%ebx), %eax
+; X86-NEXT:    calll *_TLS_MODULE_BASE_@tlscall(%eax)
+; X86-NEXT:    movl y@DTPOFF(%eax,%edx), %ecx
+; X86-NEXT:    addl z@DTPOFF(%eax,%edx), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X32-LABEL: f4:
+; X32:       # %bb.0:
+; X32-NEXT:    pushq %rax
+; X32-NEXT:    movl %fs:0, %edx
+; X32-NEXT:    leal _TLS_MODULE_BASE_@tlsdesc(%rip), %eax
+; X32-NEXT:    callq *_TLS_MODULE_BASE_@tlscall(%eax)
+; X32-NEXT:    movl y@DTPOFF(%eax,%edx), %ecx
+; X32-NEXT:    addl z@DTPOFF(%eax,%edx), %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    popq %rcx
+; X32-NEXT:    retq
+;
+; X64-LABEL: f4:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %fs:0, %rdx
+; X64-NEXT:    leaq _TLS_MODULE_BASE_@tlsdesc(%rip), %rax
+; X64-NEXT:    callq *_TLS_MODULE_BASE_@tlscall(%rax)
+; X64-NEXT:    movl y@DTPOFF(%rax,%rdx), %ecx
+; X64-NEXT:    addl z@DTPOFF(%rax,%rdx), %ecx
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %1 = load i32, ptr @y, align 4
+  %2 = load i32, ptr @z, align 4
+  %3 = add nsw i32 %1, %2
+  ret i32 %3
+}
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
new file mode 100644
index 0000000000000..e6a07b4aeb271
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s
+
+define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
+; CHECK-LABEL: vpdpwssd_test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
+  ret <16 x i32> %4
+}
diff --git a/llvm/test/DebugInfo/AArch64/ptrauth.ll b/llvm/test/DebugInfo/AArch64/ptrauth.ll
new file mode 100644
index 0000000000000..5d9099f05704b
--- /dev/null
+++ b/llvm/test/DebugInfo/AArch64/ptrauth.ll
@@ -0,0 +1,70 @@
+; RUN: llc %s -filetype=obj -mtriple arm64e-apple-darwin -o - \
+; RUN:   | llvm-dwarfdump - | FileCheck %s
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 0, 0x04d2)")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d2)
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 1, 0x04d3)")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_address_discriminated (true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d3)
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 1, 0x04d4, "isa-pointer")")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_address_discriminated (true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d4)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_isa_pointer	(true)
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 1, 0x04d5, "authenticates-null-values")")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_address_discriminated (true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d5)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_authenticates_null_values	(true)
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 1, 0x04d6, "isa-pointer,authenticates-null-values")")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_address_discriminated (true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d6)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_isa_pointer	(true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_authenticates_null_values	(true)
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+@p = global ptr null, align 8, !dbg !0
+
+!llvm.dbg.cu = !{!10}
+!llvm.module.flags = !{!19, !20}
+
+!0 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+!1 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
+!2 = !DIGlobalVariableExpression(var: !7, expr: !DIExpression())
+!3 = !DIGlobalVariableExpression(var: !8, expr: !DIExpression())
+!4 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression())
+!5 = distinct !DIGlobalVariable(name: "p1", scope: !10, file: !11, line: 1, type: !14, isLocal: false, isDefinition: true)
+!6 = distinct !DIGlobalVariable(name: "p2", scope: !10, file: !11, line: 1, type: !15, isLocal: false, isDefinition: true)
+!7 = distinct !DIGlobalVariable(name: "p3", scope: !10, file: !11, line: 1, type: !16, isLocal: false, isDefinition: true)
+!8 = distinct !DIGlobalVariable(name: "p4", scope: !10, file: !11, line: 1, type: !17, isLocal: false, isDefinition: true)
+!9 = distinct !DIGlobalVariable(name: "p5", scope: !10, file: !11, line: 1, type: !18, isLocal: false, isDefinition: true)
+!10 = distinct !DICompileUnit(language: DW_LANG_C99, file: !11, emissionKind: FullDebug, globals: !13)
+!11 = !DIFile(filename: "/tmp/p.c", directory: "/")
+!12 = !{}
+!13 = !{!0,!1,!2,!3,!4}
+!14 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: false, ptrAuthExtraDiscriminator: 1234)
+!15 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1235)
+!16 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1236, ptrAuthIsaPointer: true)
+!17 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1237, ptrAuthAuthenticatesNullValues: true)
+!18 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1238, ptrAuthIsaPointer: true, ptrAuthAuthenticatesNullValues: true)
+!19 = !{i32 2, !"Dwarf Version", i32 4}
+!20 = !{i32 2, !"Debug Info Version", i32 3}
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null)
diff --git a/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll b/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll
new file mode 100644
index 0000000000000..9240bf25b6f63
--- /dev/null
+++ b/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll
@@ -0,0 +1,84 @@
+; RUN: llc --stop-after=hardware-loops < %s | FileCheck %s
+
+;; Tests that Hardware Loop Insertion does not insert new phi nodes after debug
+;; records when they appear immediately after the last existing phi node.
+
+; CHECK-LABEL: for.body:
+; CHECK-NEXT: = phi i32
+; CHECK-NEXT: = phi i32
+; CHECK-NEXT: call void @llvm.dbg.value
+
+source_filename = "repro.c"
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+@z = dso_local local_unnamed_addr global i32 42, align 4, !dbg !0
+@arr = dso_local local_unnamed_addr global [10 x i32] zeroinitializer, align 4, !dbg !5
+
+define dso_local void @func1() local_unnamed_addr #0 !dbg !18 {
+entry:
+  %0 = load i32, ptr @z, align 4, !tbaa !26
+  br label %for.body, !dbg !30
+
+for.body:                                         ; preds = %entry, %for.body
+  %p1.04 = phi ptr [ @arr, %entry ], [ %incdec.ptr, %for.body ]
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  tail call void @llvm.dbg.value(metadata ptr %p1.04, metadata !23, metadata !DIExpression()), !dbg !25
+  store i32 %0, ptr %p1.04, align 4, !dbg !32, !tbaa !26
+  %inc = add nuw nsw i32 %i.03, 1, !dbg !34
+  %incdec.ptr = getelementptr inbounds i8, ptr %p1.04, i32 4, !dbg !35
+  %exitcond.not = icmp eq i32 %inc, 10, !dbg !36
+  br i1 %exitcond.not, label %for.end, label %for.body, !dbg !30, !llvm.loop !37
+
+for.end:                                          ; preds = %for.body
+  ret void, !dbg !41
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!11, !12, !13, !14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "z", scope: !2, file: !3, line: 2, type: !8, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 19.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "repro.c", directory: "/home/gbtozers/dev/upstream-llvm")
+!4 = !{!0, !5}
+!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
+!6 = distinct !DIGlobalVariable(name: "arr", scope: !2, file: !3, line: 1, type: !7, isLocal: false, isDefinition: true)
+!7 = !DICompositeType(tag: DW_TAG_array_type, baseType: !8, size: 320, elements: !9)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !{!10}
+!10 = !DISubrange(count: 10)
+!11 = !{i32 7, !"Dwarf Version", i32 5}
+!12 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = !{i32 1, !"wchar_size", i32 4}
+!14 = !{i32 1, !"min_enum_size", i32 4}
+!15 = !{i32 7, !"frame-pointer", i32 2}
+!16 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!17 = !{!"clang version 19.0.0git"}
+!18 = distinct !DISubprogram(name: "func1", scope: !3, file: !3, line: 4, type: !19, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null}
+!21 = !{!23}
+!22 = !DILocalVariable(name: "i", scope: !18, file: !3, line: 6, type: !8)
+!23 = !DILocalVariable(name: "p1", scope: !18, file: !3, line: 7, type: !24)
+!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 32)
+!25 = !DILocation(line: 0, scope: !18)
+!26 = !{!27, !27, i64 0}
+!27 = !{!"int", !28, i64 0}
+!28 = !{!"omnipotent char", !29, i64 0}
+!29 = !{!"Simple C/C++ TBAA"}
+!30 = !DILocation(line: 8, column: 3, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !18, file: !3, line: 8, column: 3)
+!32 = !DILocation(line: 9, column: 10, scope: !33)
+!33 = distinct !DILexicalBlock(scope: !31, file: !3, line: 8, column: 3)
+!34 = !DILocation(line: 8, column: 27, scope: !33)
+!35 = !DILocation(line: 8, column: 32, scope: !33)
+!36 = !DILocation(line: 8, column: 21, scope: !33)
+!37 = distinct !{!37, !30, !38, !39, !40}
+!38 = !DILocation(line: 9, column: 12, scope: !31)
+!39 = !{!"llvm.loop.mustprogress"}
+!40 = !{!"llvm.loop.unroll.disable"}
+!41 = !DILocation(line: 10, column: 1, scope: !18)
diff --git a/llvm/test/DebugInfo/MIR/X86/debug-loc-0.mir b/llvm/test/DebugInfo/MIR/X86/debug-loc-0.mir
index 56a4d835aaa59..0b007456be1e6 100644
--- a/llvm/test/DebugInfo/MIR/X86/debug-loc-0.mir
+++ b/llvm/test/DebugInfo/MIR/X86/debug-loc-0.mir
@@ -75,7 +75,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/DebugInfo/MIR/X86/prolog-epilog-indirection.mir b/llvm/test/DebugInfo/MIR/X86/prolog-epilog-indirection.mir
index 6941467fe0e4a..4df967ce03493 100644
--- a/llvm/test/DebugInfo/MIR/X86/prolog-epilog-indirection.mir
+++ b/llvm/test/DebugInfo/MIR/X86/prolog-epilog-indirection.mir
@@ -104,6 +104,7 @@ alignment:       16
 tracksRegLiveness: true
 frameInfo:       
   maxAlignment:    4
+  adjustsStack:    true
   hasCalls:        true
 stack:           
   - { id: 0, name: l_1081, type: default, offset: 0, size: 4, alignment: 4, 
diff --git a/llvm/test/DebugInfo/NVPTX/no-extra-loc.ll b/llvm/test/DebugInfo/NVPTX/no-extra-loc.ll
new file mode 100644
index 0000000000000..6d3a69beffe87
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/no-extra-loc.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
+
+
+define i32 @foo(i32 %a, i32 %b) !dbg !3 {
+
+; CHECK:     .loc    [[FILE:[0-9]+]] 26 0          // extra-lineinfo.cu:26:0
+; CHECK-NOT: .loc    [[FILE]]        26 0          // extra-lineinfo.cu:26:0
+; CHECK:     .file   [[FILE]] "/test/directory/extra-lineinfo.cu"
+
+  %add = add i32 %b, %a, !dbg !6
+  ret i32 %add, !dbg !6
+}
+
+!llvm.dbg.cu = !{!0}
+!nvvm.annotations = !{}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "extra-lineinfo.cu", directory: "/test/directory/")
+!2 = !{i32 1, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "kernel", linkageName: "foo", scope: !1, file: !1, line: 123, type: !4, scopeLine: 26, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 40, column: 22, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !3, file: !1, line: 3, column: 17)
diff --git a/llvm/test/DebugInfo/X86/live-debug-vars-dse.mir b/llvm/test/DebugInfo/X86/live-debug-vars-dse.mir
index 9443ed5e332c1..908889063584c 100644
--- a/llvm/test/DebugInfo/X86/live-debug-vars-dse.mir
+++ b/llvm/test/DebugInfo/X86/live-debug-vars-dse.mir
@@ -107,7 +107,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/DebugInfo/X86/prolog-params.mir b/llvm/test/DebugInfo/X86/prolog-params.mir
index af21bc85ccd74..6629dca810f95 100644
--- a/llvm/test/DebugInfo/X86/prolog-params.mir
+++ b/llvm/test/DebugInfo/X86/prolog-params.mir
@@ -98,6 +98,8 @@ fixedStack:
       isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true }
   - { id: 2, type: default, offset: 0, size: 4, alignment: 16, stack-id: default,
       isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true }
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, name: arr, type: default, offset: 0, size: 8, alignment: 4,
       stack-id: default, callee-saved-register: '', callee-saved-restored: true }
diff --git a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll
index 0a618c6780d1d..d8cb542285c89 100755
--- a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll
+++ b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll
@@ -1,4 +1,4 @@
-;; Tests that we can debug-print DPValues that have no markers attached.
+;; Tests that we can debug-print DbgVariableRecords that have no markers attached.
 ; RUN: opt -passes="instcombine" -debug %s -o /dev/null 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
diff --git a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll
index 2e765619fcb89..490f24ff76ff5 100644
--- a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll
+++ b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll
@@ -26,6 +26,8 @@
 ; CHECK-NEXT: {{^}}  store i32 %[[VAL_ADD]]{{.+}}, !DIAssignID ![[ASSIGNID:[0-9]+]]
 ; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata i32 %[[VAL_ADD]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ptr %[[VAL_B]], metadata !DIExpression()), !dbg ![[LOC_4:[0-9]+]]
 ; NEWDBG-NEXT: {{^}}    #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]])
+; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata ![[EMPTY:[0-9]+]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ![[EMPTY]], metadata !DIExpression()), !dbg ![[LOC_4]]
+; NEWDBG-NEXT: {{^}}    #dbg_assign(![[EMPTY:[0-9]+]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ![[EMPTY]], !DIExpression(), ![[LOC_4]])
 ; CHECK-NEXT: {{^}}  ret i32
 
 ; OLDDBG-DAG: declare void @llvm.dbg.value
@@ -40,6 +42,7 @@
 ; CHECK-DAG: ![[LOC_3]] = !DILocation(line: 3, column: 25
 ; CHECK-DAG: ![[LOC_4]] = !DILocation(line: 3, column: 30
 ; CHECK-DAG: ![[LABEL_ID]] = !DILabel(
+; CHECK-DAG: ![[EMPTY]] = !{}
 
 define dso_local i32 @f(i32 %a) !dbg !7 {
 entry:
@@ -51,6 +54,7 @@ entry:
   call void @llvm.dbg.label(metadata !50), !dbg !32
   store i32 %add, ptr %b, !dbg !32, !DIAssignID !40
   call void @llvm.dbg.assign(metadata i32 %add, metadata !21, metadata !DIExpression(), metadata !40, metadata ptr %b, metadata !DIExpression()), !dbg !33
+  call void @llvm.dbg.assign(metadata !2, metadata !21, metadata !DIExpression(), metadata !40, metadata !2, metadata !DIExpression()), !dbg !33
   ret i32 %add, !dbg !33
 
 }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll
index 5d068872fcace..4359d5305d4d9 100644
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll
@@ -85,7 +85,6 @@ exit:
   ret void
 }
 
-; FIXME: The fakeresume1 here should be marked as musttail.
 ; Verify that in the resume part resume call is marked with musttail.
 ; CHECK-LABEL: @f.resume(
 ; CHECK:      musttail call fastcc void @fakeresume1(ptr align 8 null)
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll
index 6ea81c6ff0b09..2a14be0f92180 100644
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll
@@ -88,7 +88,6 @@ exit:
   ret void
 }
 
-; FIXME: The fakeresume1 here should be marked as musttail.
 ; Verify that in the resume part resume call is marked with musttail.
 ; CHECK-LABEL: @f.resume(
 ; CHECK:         musttail call fastcc void @fakeresume1(ptr align 8 null)
diff --git a/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll b/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll
index 76afc4bf007c2..8b387cd496297 100644
--- a/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll
+++ b/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll
@@ -78,6 +78,26 @@ entry:
 ; CHECK-LABEL: atomic8_xchg_monotonic
 ; CHECK: call i8 @__tsan_atomic8_exchange(ptr %a, i8 0, i32 0), !dbg
 
+define void @atomic8_xchg_monotonic_ptr(ptr %a, ptr %b) nounwind uwtable {
+entry:
+  atomicrmw xchg ptr %a, ptr %b monotonic, !dbg !7
+  ret void, !dbg !7
+}
+; CHECK-LABEL: atomic8_xchg_monotonic_ptr
+; CHECK: [[ARG:%.*]] = ptrtoint ptr %b to i64, !dbg
+; CHECK: [[RES:%.*]] = call i64 @__tsan_atomic64_exchange(ptr %a, i64 [[ARG]], i32 0), !dbg
+; CHECK: [[CAST:%.*]] = inttoptr i64 [[RES]] to ptr, !dbg
+
+define void @atomic8_xchg_monotonic_float(ptr %a, float %b) nounwind uwtable {
+entry:
+  atomicrmw xchg ptr %a, float %b monotonic, !dbg !7
+  ret void, !dbg !7
+}
+; CHECK-LABEL: atomic8_xchg_monotonic_float
+; CHECK: [[ARG:%.*]] = bitcast float %b to i32, !dbg
+; CHECK: [[RES:%.*]] = call i32 @__tsan_atomic32_exchange(ptr %a, i32 [[ARG]], i32 0), !dbg
+; CHECK: [[CAST:%.*]] = bitcast i32 [[RES]] to float, !dbg
+
 define void @atomic8_add_monotonic(ptr %a) nounwind uwtable {
 entry:
   atomicrmw add ptr %a, i8 0 monotonic, !dbg !7
diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
index 74d9c86881d52..ccf8cf67ede6d 100644
--- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
+++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
@@ -32,7 +32,6 @@ entry:
 ; CHECK-DUMP: <main>:
 ; CHECK-DUMP:      bl      0x8 <main+0x8>
 ; CHECK-DUMP: <foo>:
-; CHECK-DUMP:     paciasp
 
 ; `main` doesn't support BTI while `foo` does, so in the binary
 ; we should see only PAC which is supported by both.
diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll
deleted file mode 100644
index c25857ceed7b4..0000000000000
--- a/llvm/test/LTO/AArch64/link-sign-return-address.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; Testcase to check that module with different branch-target-enforcement can
-; be mixed.
-;
-; RUN: llvm-as %s -o %t1.bc
-; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
-; RUN: llvm-lto -exported-symbol main \
-; RUN:          -exported-symbol foo \
-; RUN:          -filetype=obj \
-; RUN:           %t2.bc %t1.bc \
-; RUN:           -o %t1.exe 2>&1
-; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s
-; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-gnu"
-
-declare i32 @foo();
-
-define i32 @main() {
-entry:
-  %add = call i32 @foo()
-  ret i32 %add
-}
-
-!llvm.module.flags = !{!0, !1, !2, !3 }
-!0 = !{i32 8, !"branch-target-enforcement", i32 0}
-!1 = !{i32 8, !"sign-return-address", i32 0}
-!2 = !{i32 8, !"sign-return-address-all", i32 0}
-!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0}
-
-; CHECK-DUMP: <foo>:
-; CHECK-DUMP:     paciasp
-; CHECK-DUMP:     mov     w0, #0x2a
-; CHECK-DUMP:     autiasp
-; CHECK-DUMP:     ret
-; CHECK-DUMP: <main>:
-; CHECK-DUMP-NOT:  paciasp
-; CHECK-DUMP:      str     x30,
-; CHECK-DUMP:      bl      0x14 <main+0x4>
-
-; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary
-; we should not see anything.
-; CHECK-PROP-NOT:   Properties: aarch64 feature: PAC
\ No newline at end of file
diff --git a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll
index 37bd8c37f8b5e..a90f2128e4430 100644
--- a/llvm/test/Linker/link-arm-and-thumb.ll
+++ b/llvm/test/Linker/link-arm-and-thumb.ll
@@ -13,12 +13,11 @@ entry:
   ret i32 %add
 }
 
-; CHECK: define i32 @main() [[MAIN_ATTRS:#[0-9]+]]
+; CHECK: define i32 @main() {
 ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]]
 ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]]
 
-; CHECK: attributes [[MAIN_ATTRS]] = { {{.*}} }
-; CHECK: attributes [[ARM_ATTRS]] = { {{.*}} "target-features"="-thumb-mode" }
-; CHECK: attributes [[THUMB_ATTRS]] = { {{.*}} "target-features"="+thumb-mode" }
+; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" }
+; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" }
 
 ; STDERR-NOT: warning: Linking two modules of different target triples:
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_sop1.s
index 8a7f64331317e..c4029b0658b2e 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_sop1.s
@@ -2964,6 +2964,9 @@ s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
 s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA)
 // GFX11: encoding: [0x85,0x4c,0x80,0xbe]
 
+s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA_TO_PC)
+// GFX11: encoding: [0x86,0x4c,0x80,0xbe]
+
 s_ctz_i32_b32 s5, s1
 // GFX11: encoding: [0x01,0x08,0x85,0xbe]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
index 4fd355f10f341..939320e9ef2dc 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
@@ -3708,9 +3708,12 @@ s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
 s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA)
 // GFX12: encoding: [0x85,0x4c,0x80,0xbe]
 
-s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_SE_AID_ID)
+s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA_TO_PC)
 // GFX12: encoding: [0x86,0x4c,0x80,0xbe]
 
+s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_SE_AID_ID)
+// GFX12: encoding: [0x87,0x4c,0x80,0xbe]
+
 s_ctz_i32_b32 s5, s1
 // GFX12: encoding: [0x01,0x08,0x85,0xbe]
 
diff --git a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
index 8b90e20bb87d1..7b591904e877f 100644
--- a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
@@ -29,7 +29,7 @@
 // OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 0030 00000c60 80000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 00000c60 80000000 00040000 00000000
 // complete
 // OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
@@ -39,12 +39,12 @@
 // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 00b0 00000060 80000000 00000000 00000000
+// OBJDUMP-NEXT: 00b0 00000060 80000000 00040000 00000000
 // disabled_user_sgpr
 // OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 00f0 00000c60 80000000 00000000 00000000
+// OBJDUMP-NEXT: 00f0 00000c60 80000000 00040000 00000000
 
 .text
 // ASM: .text
diff --git a/llvm/test/MC/AMDGPU/mcexpr_amd.s b/llvm/test/MC/AMDGPU/mcexpr_amd.s
new file mode 100644
index 0000000000000..a9639c3acc305
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mcexpr_amd.s
@@ -0,0 +1,130 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -filetype=obj < %s > %t
+// RUN: llvm-objdump --syms %t | FileCheck --check-prefix=OBJDUMP %s
+
+// OBJDUMP: SYMBOL TABLE:
+// OBJDUMP-NEXT: 0000000000000000 l       *ABS*  0000000000000000 zero
+// OBJDUMP-NEXT: 0000000000000001 l       *ABS*  0000000000000000 one
+// OBJDUMP-NEXT: 0000000000000002 l       *ABS*  0000000000000000 two
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 three
+// OBJDUMP-NEXT: 7fffffffffffffff l       *ABS*  0000000000000000 i64_max
+// OBJDUMP-NEXT: 8000000000000000 l       *ABS*  0000000000000000 i64_min
+// OBJDUMP-NEXT: 0000000000000005 l       *ABS*  0000000000000000 max_expression_all
+// OBJDUMP-NEXT: 0000000000000005 l       *ABS*  0000000000000000 five
+// OBJDUMP-NEXT: 0000000000000004 l       *ABS*  0000000000000000 four
+// OBJDUMP-NEXT: 0000000000000002 l       *ABS*  0000000000000000 max_expression_two
+// OBJDUMP-NEXT: 0000000000000001 l       *ABS*  0000000000000000 max_expression_one
+// OBJDUMP-NEXT: 000000000000000a l       *ABS*  0000000000000000 max_literals
+// OBJDUMP-NEXT: 000000000000000f l       *ABS*  0000000000000000 max_with_max_sym
+// OBJDUMP-NEXT: 000000000000000f l       *ABS*  0000000000000000 max
+// OBJDUMP-NEXT: ffffffffffffffff l       *ABS*  0000000000000000 neg_one
+// OBJDUMP-NEXT: ffffffffffffffff l       *ABS*  0000000000000000 max_neg_numbers
+// OBJDUMP-NEXT: ffffffffffffffff l       *ABS*  0000000000000000 max_neg_number
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 max_with_subexpr
+// OBJDUMP-NEXT: 0000000000000006 l       *ABS*  0000000000000000 max_as_subexpr
+// OBJDUMP-NEXT: 0000000000000005 l       *ABS*  0000000000000000 max_recursive_subexpr
+// OBJDUMP-NEXT: 7fffffffffffffff l       *ABS*  0000000000000000 max_expr_one_max
+// OBJDUMP-NEXT: 7fffffffffffffff l       *ABS*  0000000000000000 max_expr_two_max
+// OBJDUMP-NEXT: 7fffffffffffffff l       *ABS*  0000000000000000 max_expr_three_max
+// OBJDUMP-NEXT: 8000000000000000 l       *ABS*  0000000000000000 max_expr_one_min
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 max_expr_two_min
+// OBJDUMP-NEXT: 0000000000989680 l       *ABS*  0000000000000000 max_expr_three_min
+// OBJDUMP-NEXT: 0000000000000007 l       *ABS*  0000000000000000 or_expression_all
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 or_expression_two
+// OBJDUMP-NEXT: 0000000000000001 l       *ABS*  0000000000000000 or_expression_one
+// OBJDUMP-NEXT: 000000000000000f l       *ABS*  0000000000000000 or_literals
+// OBJDUMP-NEXT: 0000000000000000 l       *ABS*  0000000000000000 or_false
+// OBJDUMP-NEXT: 00000000000000ff l       *ABS*  0000000000000000 or_with_or_sym
+// OBJDUMP-NEXT: 00000000000000ff l       *ABS*  0000000000000000 or
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 or_with_subexpr
+// OBJDUMP-NEXT: 0000000000000008 l       *ABS*  0000000000000000 or_as_subexpr
+// OBJDUMP-NEXT: 0000000000000007 l       *ABS*  0000000000000000 or_recursive_subexpr
+
+// ASM: .set zero, 0
+// ASM: .set one, 1
+// ASM: .set two, 2
+// ASM: .set three, 3
+// ASM: .set i64_max, 9223372036854775807
+// ASM: .set i64_min, -9223372036854775808
+
+.set zero, 0
+.set one, 1
+.set two, 2
+.set three, 3
+.set i64_max, 0x7FFFFFFFFFFFFFFF
+.set i64_min, 0x8000000000000000
+
+// ASM: .set max_expression_all, max(1, 2, five, 3, four)
+// ASM: .set max_expression_two, 2
+// ASM: .set max_expression_one, 1
+// ASM: .set max_literals, 10
+// ASM: .set max_with_max_sym, max(max, 4, 3, 1, 2)
+
+.set max_expression_all, max(one, two, five, three, four)
+.set max_expression_two, max(one, two)
+.set max_expression_one, max(one)
+.set max_literals, max(1,2,3,4,5,6,7,8,9,10)
+.set max_with_max_sym, max(max, 4, 3, one, two)
+
+// ASM: .set max_neg_numbers, -1
+// ASM: .set max_neg_number, -1
+
+.set neg_one, -1
+.set max_neg_numbers, max(-5, -4, -3, -2, neg_one)
+.set max_neg_number, max(neg_one)
+
+// ASM: .set max_with_subexpr, 3
+// ASM: .set max_as_subexpr, 1+(max(4, 3, five))
+// ASM: .set max_recursive_subexpr, max(max(1, four), 3, max_expression_all)
+
+.set max_with_subexpr, max(((one | 3) << 3) / 8)
+.set max_as_subexpr, 1 + max(4, 3, five)
+.set max_recursive_subexpr, max(max(one, four), three, max_expression_all)
+
+// ASM: .set max_expr_one_max, 9223372036854775807
+// ASM: .set max_expr_two_max, max(9223372036854775807, five)
+// ASM: .set max_expr_three_max, max(9223372036854775807, five, 10000000)
+
+.set max_expr_one_max, max(i64_max)
+.set max_expr_two_max, max(i64_max, five)
+.set max_expr_three_max, max(i64_max, five, 10000000)
+
+// ASM: .set max_expr_one_min, -9223372036854775808
+// ASM: .set max_expr_two_min, 3
+// ASM: .set max_expr_three_min, 10000000
+
+.set max_expr_one_min, max(i64_min)
+.set max_expr_two_min, max(i64_min, three)
+.set max_expr_three_min, max(i64_min, three, 10000000)
+
+// ASM: .set or_expression_all, or(1, 2, five, 3, four)
+// ASM: .set or_expression_two, 3
+// ASM: .set or_expression_one, 1
+// ASM: .set or_literals, 15
+// ASM: .set or_false, 0
+// ASM: .set or_with_or_sym, or(or, 4, 3, 1, 2)
+
+.set or_expression_all, or(one, two, five, three, four)
+.set or_expression_two, or(one, two)
+.set or_expression_one, or(one)
+.set or_literals, or(1,2,3,4,5,6,7,8,9,10)
+.set or_false, or(zero, 0, (2-2), 5 > 6)
+.set or_with_or_sym, or(or, 4, 3, one, two)
+
+// ASM: .set or_with_subexpr, 3
+// ASM: .set or_as_subexpr, 1+(or(4, 3, five))
+// ASM: .set or_recursive_subexpr, or(or(1, four), 3, or_expression_all)
+
+.set or_with_subexpr, or(((one | 3) << 3) / 8)
+.set or_as_subexpr, 1 + or(4, 3, five)
+.set or_recursive_subexpr, or(or(one, four), three, or_expression_all)
+
+// ASM: .set four, 4
+// ASM: .set five, 5
+// ASM: .set max, 15
+// ASM: .set or, 255
+
+.set four, 4
+.set five, 5
+.set max, 0xF
+.set or, 0xFF
diff --git a/llvm/test/MC/AMDGPU/mcexpr_amd_err.s b/llvm/test/MC/AMDGPU/mcexpr_amd_err.s
new file mode 100644
index 0000000000000..ea02e01362721
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mcexpr_amd_err.s
@@ -0,0 +1,53 @@
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa %s 2>&1 | FileCheck --check-prefix=ASM %s
+
+.set one, 1
+.set two, 2
+.set three, 3
+
+.set max_empty, max()
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: empty max expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set or_empty, or()
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: empty or expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_post_aux_comma, max(one,)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: mismatch of commas in max expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_pre_aux_comma, max(,one)
+// asm: :[[@line-1]]:{{[0-9]+}}: error: unknown token in expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_double_comma, max(one,, two)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unknown token in expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_no_comma, max(one two)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unexpected token in max expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_missing_paren, max(two
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unexpected token in max expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_expression_one, max(three, four,
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unknown token in expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set or_expression_one, or(four, five
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unexpected token in or expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_no_lparen, max four, five)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: expected newline
+
+.set max_no_paren, max one, two, three
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: expected newline
+
+.set max_rparen_only, max)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: expected newline
+
+.set four, 4
+.set five, 5
diff --git a/llvm/test/MC/ARM/arm-branch-errors.s b/llvm/test/MC/ARM/arm-branch-errors.s
index bbf6445f5c18a..5d7ae12fb3bc0 100644
--- a/llvm/test/MC/ARM/arm-branch-errors.s
+++ b/llvm/test/MC/ARM/arm-branch-errors.s
@@ -10,13 +10,13 @@
 
 @ CHECK: error: invalid instruction, any one of the following would fix this:
 @ CHECK:        b #2
-@ CHECK: note: instruction requires: thumb
 @ CHECK: note: invalid operand for instruction
+@ CHECK: note: instruction requires: thumb
 @ CHECK: error: invalid instruction, any one of the following would fix this:
 @ CHECK:        bl #2
 @ CHECK: note: instruction requires: thumb
 @ CHECK: note: invalid operand for instruction
 @ CHECK: error: invalid instruction, any one of the following would fix this:
 @ CHECK:        beq #2
-@ CHECK: note: instruction requires: thumb
 @ CHECK: note: invalid operand for instruction
+@ CHECK: note: instruction requires: thumb
diff --git a/llvm/test/MC/ARM/arm11-hint-instr.s b/llvm/test/MC/ARM/arm11-hint-instr.s
index 4193a686870ab..d9eaa5a89ab23 100644
--- a/llvm/test/MC/ARM/arm11-hint-instr.s
+++ b/llvm/test/MC/ARM/arm11-hint-instr.s
@@ -65,7 +65,13 @@
 @ CHECK-THUMB: wfe                             @ encoding: [0x20,0xbf]
 @ CHECK-THUMB: wfi                             @ encoding: [0x30,0xbf]
 @ CHECK-THUMB: sev                             @ encoding: [0x40,0xbf]
-@ CHECK-ERROR-THUMB: error: instruction requires: v7 clrex
+@ CHECK-ERROR-THUMB: error: invalid instruction, any one of the following would fix this:
+@ CHECK-ERROR-THUMB: clrex
+@ CHECK-ERROR-THUMB: ^
+@ CHECK-ERROR-THUMB: note: instruction requires: v7 clrex
+@ CHECK-ERROR-THUMB: clrex
+@ CHECK-ERROR-THUMB: ^
+@ CHECK-ERROR-THUMB: note: instruction requires: arm-mode
 @ CHECK-ERROR-THUMB: clrex
 @ CHECK-ERROR-THUMB: ^
 
diff --git a/llvm/test/MC/ARM/basic-arm-instructions.s b/llvm/test/MC/ARM/basic-arm-instructions.s
index 055f3ce316153..84a7cf52fa30e 100644
--- a/llvm/test/MC/ARM/basic-arm-instructions.s
+++ b/llvm/test/MC/ARM/basic-arm-instructions.s
@@ -1774,6 +1774,9 @@ Lforward:
         pkhtb r2, r2, r3, asr #31
         pkhtb r2, r2, r3, asr #15
 
+        it ne
+        pkhtbne r2, r2, r3, asr #15
+
 @ CHECK: pkhbt	r2, r2, r3              @ encoding: [0x13,0x20,0x82,0xe6]
 @ CHECK: pkhbt	r2, r2, r3, lsl #31     @ encoding: [0x93,0x2f,0x82,0xe6]
 @ CHECK: pkhbt	r2, r2, r3              @ encoding: [0x13,0x20,0x82,0xe6]
@@ -1782,6 +1785,7 @@ Lforward:
 @ CHECK: pkhbt	r2, r3, r2              @ encoding: [0x12,0x20,0x83,0xe6]
 @ CHECK: pkhtb	r2, r2, r3, asr #31     @ encoding: [0xd3,0x2f,0x82,0xe6]
 @ CHECK: pkhtb	r2, r2, r3, asr #15     @ encoding: [0xd3,0x27,0x82,0xe6]
+@ CHECK: pkhtbne r2, r2, r3, asr #15    @ encoding: [0xd3,0x27,0x82,0x16]
 
 @------------------------------------------------------------------------------
 @ FIXME: PLD
diff --git a/llvm/test/MC/ARM/cde-fp-vec.s b/llvm/test/MC/ARM/cde-fp-vec.s
index 4b139579b719b..fa18ffa766e2e 100644
--- a/llvm/test/MC/ARM/cde-fp-vec.s
+++ b/llvm/test/MC/ARM/cde-fp-vec.s
@@ -12,7 +12,7 @@ ittt eq
 vcx1a   p1, s7, #2047
 // ERROR: [[@LINE+1]]:{{[0-9]+}}: error: instructions in IT block must be predicable
 vcx2    p0, d0, d15, #0
-// ERROR-FP: [[@LINE+2]]:{{[0-9]+}}: error: invalid instruction
+// ERROR-FP: [[@LINE+2]]:{{[0-9]+}}: error: instruction requires: mve
 // ERROR-MVE: [[@LINE+1]]:{{[0-9]+}}: error: instructions in IT block must be predicable
 vcx3    p0, q0, q7, q0, #12
 nop
@@ -33,12 +33,15 @@ vcx1a   p1, d3, #2047
 // ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
 vcx1    p0, q1, #1234
 // CHECK-MVE-NEXT: vcx1a p1, q5, #4095 @ encoding: [0x2f,0xfd,0xff,0xa1]
-// ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: invalid instruction
+// ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: instruction requires: mve
 vcx1a   p1, q5, #4095
 
 // ERROR: [[@LINE+1]]:{{[0-9]+}}: error: invalid instruction
 vcx1a   p1, s7, s7, #2047
-// ERROR: [[@LINE+1]]:{{[0-9]+}}: error: operand must be an immediate in the range [0,2047]
+// ERROR-FP: [[@LINE+4]]:{{[0-9]+}}: error: operand must be an immediate in the range [0,2047]
+// ERROR-MVE: [[@LINE+3]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this
+// ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: note: operand must be a register in range [q0, q7]
+// ERROR-MVE: [[@LINE+1]]:{{[0-9]+}}: note: operand must be an immediate in the range [0,2047]
 vcx1    p0, d0, #2048
 // ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: operand must be an immediate in the range [0,2047]
 vcx1a   p1, s0, #2048
@@ -51,10 +54,13 @@ vcx1    p8, d0, #1234
 vcx1    p0, d16, #1234
 // ERROR: [[@LINE+1]]:{{[0-9]+}}: error: invalid instruction
 vcx1    p0, s32, #1234
-// ERROR-FP: [[@LINE+4]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
-// ERROR-FP: [[@LINE+3]]:{{[0-9]+}}: note: operand must be a register in range [s0, s31]
-// ERROR-FP: [[@LINE+2]]:{{[0-9]+}}: note: operand must be a register in range [d0, d15]
-// ERROR-MVE: [[@LINE+1]]:{{[0-9]+}}: error: operand must be a register in range [q0, q7]
+// ERROR-FP: [[@LINE+7]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
+// ERROR-FP: [[@LINE+6]]:{{[0-9]+}}: note: operand must be a register in range [s0, s31]
+// ERROR-FP: [[@LINE+5]]:{{[0-9]+}}: note: operand must be a register in range [d0, d15]
+// ERROR-MVE: [[@LINE+4]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
+// ERROR-MVE: [[@LINE+3]]:{{[0-9]+}}: note: operand must be a register in range [q0, q7]
+// ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: note: operand must be a register in range [s0, s31]
+// ERROR-MVE: [[@LINE+1]]:{{[0-9]+}}: note: operand must be a register in range [d0, d15]
 vcx1    p0, q8, #1234
 // ERROR: [[@LINE+3]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
 // ERROR: [[@LINE+2]]:{{[0-9]+}}: note: operand must be a register in range [s0, s31]
@@ -116,7 +122,7 @@ vcx3a   p1, d1, d11, d12, #8
 // ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: error: operand must be an immediate in the range [0,15]
 // ERROR-FP: error: invalid instruction
 vcx3a   p1, q1, q2, q3, #16
-// ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: error: invalid instruction
+// ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: error: operand must be a register in range [d0, d15]
 // ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: operand must be a register in range [d0, d15]
 vcx3    p0, d0, q0, d7, #1
 // ERROR: [[@LINE+1]]:{{[0-9]+}}: error: operand must be a register in range [s0, s31]
diff --git a/llvm/test/MC/ARM/cde-vec-pred.s b/llvm/test/MC/ARM/cde-vec-pred.s
index 6274fafa1224a..9932f8d000337 100644
--- a/llvm/test/MC/ARM/cde-vec-pred.s
+++ b/llvm/test/MC/ARM/cde-vec-pred.s
@@ -19,7 +19,7 @@ vcx3at   p1, q3, q7, q6, #15
 vcx3e    p0, q0, q2, q0, #12
 
 vpt.i8 eq, q0, q0
-// ERROR: [[@LINE+1]]:{{[0-9]+}}: error: incorrect predication in VPT block; got 'none', but expected 't'
+// ERROR: error: incorrect predication in VPT block; got 'none', but expected 't'
 vcx1    p0, q1, #1234
 
 vpt.i8 eq, q0, q0
diff --git a/llvm/test/MC/ARM/cps.s b/llvm/test/MC/ARM/cps.s
index bafdfdea537b8..1034ed93810dd 100644
--- a/llvm/test/MC/ARM/cps.s
+++ b/llvm/test/MC/ARM/cps.s
@@ -26,6 +26,6 @@
 @ V6-ERRORS: note: too many operands for instruction
 @ V6-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ V6-ERRORS: cps #0
-@ V6-ERRORS: note: too few operands for instruction
 @ V6-ERRORS: note: instruction requires: arm-mode
 @ V6-ERRORS: note: instruction requires: thumb2
+@ V6-ERRORS: note: too few operands for instruction
diff --git a/llvm/test/MC/ARM/diagnostics.s b/llvm/test/MC/ARM/diagnostics.s
index e6d80ea7a6280..fa23a7da1e404 100644
--- a/llvm/test/MC/ARM/diagnostics.s
+++ b/llvm/test/MC/ARM/diagnostics.s
@@ -288,7 +288,7 @@
 @ CHECK-ERRORS: error: 'asr' shift amount must be in range [1,32]
 @ CHECK-ERRORS: 	ssat	r8, #1, r10, asr #33
 @ CHECK-ERRORS: 	    	                  ^
-@ CHECK-ERRORS: error: shift operator 'asr' or 'lsl' expected
+@ CHECK-ERRORS: error: operand must be a register in range [r0, r14]
 @ CHECK-ERRORS:         ssat    r8, #1, r10, lsr #5
 @ CHECK-ERRORS:                              ^
 @ CHECK-ERRORS: error: '#' expected
diff --git a/llvm/test/MC/ARM/directive-arch_extension-crypto.s b/llvm/test/MC/ARM/directive-arch_extension-crypto.s
index 8d3cd9e5e1d0d..05b6d9e040188 100644
--- a/llvm/test/MC/ARM/directive-arch_extension-crypto.s
+++ b/llvm/test/MC/ARM/directive-arch_extension-crypto.s
@@ -10,15 +10,16 @@
 	.syntax unified
 
 	.arch_extension crypto
-@ CHECK-V7: error: architectural extension 'crypto' is not allowed for the current base architecture
+@ CHECK-V7: architectural extension 'crypto' is not allowed for the current base architecture
 @ CHECK-V7-NEXT: 	.arch_extension crypto
 @ CHECK-V7-NEXT:                     ^
 
 	.type crypto,%function
 crypto:
 	vmull.p64 q0, d0, d1
-@ CHECK-V7: error: instruction requires: aes armv8
-
+@ CHECK-V7: error: invalid instruction, any one of the following would fix this:
+@ CHECK-V7: note: invalid operand for instruction
+@ CHECK-V7: note: instruction requires: aes armv8
 	aesd.8 q0, q1
 @ CHECK-V7: error: instruction requires: aes armv8
 	aese.8 q0, q1
@@ -51,14 +52,18 @@ crypto:
 @ CHECK-V7: error: instruction requires: sha2 armv8
 
 	.arch_extension nocrypto
+@ CHECK-V7: error: architectural extension 'sha2' is not allowed for the current base architecture
+@ CHECK-V7: error: architectural extension 'aes' is not allowed for the current base architecture
 @ CHECK-V7: error: architectural extension 'crypto' is not allowed for the current base architecture
-@ CHECK-V7-NEXT: 	.arch_extension nocrypto
+@ CHECK-V7-NEXT:     .arch_extension nocrypto
 @ CHECK-V7-NEXT:                     ^
 
 	.type nocrypto,%function
 nocrypto:
 	vmull.p64 q0, d0, d1
-@ CHECK-V7: error: instruction requires: aes armv8
+@ CHECK-V7: error: invalid instruction, any one of the following
+@ CHECK-V7: note: invalid operand for instruction
+@ CHECK-V7: note: instruction requires: aes armv8
 @ CHECK-V8: error: instruction requires: aes
 
 	aesd.8 q0, q1
diff --git a/llvm/test/MC/ARM/invalid-fp-armv8.s b/llvm/test/MC/ARM/invalid-fp-armv8.s
index dca0e448d1140..c8ce261791cfd 100644
--- a/llvm/test/MC/ARM/invalid-fp-armv8.s
+++ b/llvm/test/MC/ARM/invalid-fp-armv8.s
@@ -88,6 +88,8 @@ vrinta.f64.f64 s3, q0
 vrintn.f32.f32 d3, d0
 @ V8: error: instruction requires: NEON
 vrintp.f32 q3, q0
-@ V8: error: instruction requires: NEON
+@ V8: error: invalid instruction, any one of the following would fix this:
+@ V8: note: instruction requires: mve.fp
+@ V8: note: instruction requires: NEON
 vrintmlt.f32 q3, q0
 @ V8: error: instruction 'vrintm' is not predicable, but condition code specified
diff --git a/llvm/test/MC/ARM/lsl-zero-errors.s b/llvm/test/MC/ARM/lsl-zero-errors.s
index e021aa9eb986d..1e51c587211d1 100644
--- a/llvm/test/MC/ARM/lsl-zero-errors.s
+++ b/llvm/test/MC/ARM/lsl-zero-errors.s
@@ -55,22 +55,22 @@
 
 // CHECK-NONARM: error: invalid instruction, any one of the following would fix this:
 // CHECK-NONARM-NEXT: mov pc, r0, lsl #0
-// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 // CHECK-THUMBV7: note: operand must be a register in range [r0, r12] or r14
 // CHECK-THUMBV8: note: operand must be a register in range [r0, r14]
+// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 
 // CHECK-NONARM: error: invalid instruction, any one of the following would fix this:
 // CHECK-NONARM-NEXT: mov r0, pc, lsl #0
-// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 // CHECK-NONARM: note: invalid operand for instruction
 // CHECK-NONARM: note: invalid operand for instruction
 // CHECK-NONARM: note: operand must be an immediate in the range [256,65535]
+// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 
 // CHECK-NONARM: error: invalid instruction, any one of the following would fix this:
 // CHECK-NONARM-NEXT: mov pc, pc, lsl #0
-// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 // CHECK-THUMBV7: note: operand must be a register in range [r0, r12] or r14
 // CHECK-THUMBV8: note: operand must be a register in range [r0, r14]
+// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 
 // CHECK-NONARM: error: invalid instruction, any one of the following would fix this:
 // CHECK-NONARM-NEXT: movs pc, r0, lsl #0
@@ -134,8 +134,8 @@
 // FIXME: We should consistently have the "requires ARMv8" error here
 // CHECK-THUMBV7: error: invalid instruction, any one of the following would fix this:
 // CHECK-THUMBV7-NEXT: mov sp, sp, lsl #0
-// CHECK-THUMBV7: note: operand must be a register in range [r0, r15]
 // CHECK-THUMBV7: note: operand must be a register in range [r0, r12] or r14
+// CHECK-THUMBV7: note: operand must be a register in range [r0, r15]
 
 // CHECK-THUMBV7: error: invalid instruction, any one of the following would fix this:
 // CHECK-THUMBV7-NEXT: movs sp, sp, lsl #0
diff --git a/llvm/test/MC/ARM/mve-load-store.s b/llvm/test/MC/ARM/mve-load-store.s
index 5c6d2a172b252..797ab1e22dd01 100644
--- a/llvm/test/MC/ARM/mve-load-store.s
+++ b/llvm/test/MC/ARM/mve-load-store.s
@@ -5,55 +5,55 @@
 # RUN:     FileCheck --check-prefix=ERROR-NOMVE < %t %s
 
 # CHECK: vldrb.u8 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r0]
 
 # CHECK: vldrb.u8 q1, [r0] @ encoding: [0x90,0xed,0x00,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q1, [r0]
 
 # CHECK: vldrb.u8 q0, [r11] @ encoding: [0x9b,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r11]
 
 # CHECK: vldrb.u8 q3, [r11] @ encoding: [0x9b,0xed,0x00,0x7e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q3, [r11]
 
 # CHECK: vldrb.u8 q0, [r4, #56] @ encoding: [0x94,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r4, #56]
 
 # CHECK: vldrb.u8 q4, [r4, #56] @ encoding: [0x94,0xed,0x38,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q4, [r4, #56]
 
 # CHECK: vldrb.u8 q0, [r8, #56] @ encoding: [0x98,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r8, #56]
 
 # CHECK: vldrb.u8 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [r4, #56]!
 
 # CHECK: vldrb.u8 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [r4, #56]!
 
 # CHECK: vldrb.u8 q5, [r4], #-25 @ encoding: [0x34,0xec,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [r4], #-25
 
 # CHECK: vldrb.u8 q5, [r10], #-25 @ encoding: [0x3a,0xec,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [r10], #-25
 
 # CHECK: vldrb.u8 q5, [sp, #-25] @ encoding: [0x1d,0xed,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [sp, #-25]
 
 # CHECK: vldrb.u8 q5, [sp, #-127] @ encoding: [0x1d,0xed,0x7f,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [sp, #-127]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: invalid operand for instruction
@@ -69,55 +69,55 @@ vldrb.u8 q0, [r0, #-128]!
 vldrb.u8 q0, [r0], #128
 
 # CHECK: vstrb.8 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r0]
 
 # CHECK: vstrb.8 q1, [r0] @ encoding: [0x80,0xed,0x00,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q1, [r0]
 
 # CHECK: vstrb.8 q0, [r11] @ encoding: [0x8b,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r11]
 
 # CHECK: vstrb.8 q3, [r11] @ encoding: [0x8b,0xed,0x00,0x7e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q3, [r11]
 
 # CHECK: vstrb.8 q0, [r4, #56] @ encoding: [0x84,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r4, #56]
 
 # CHECK: vstrb.8 q4, [r4, #56] @ encoding: [0x84,0xed,0x38,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q4, [r4, #56]
 
 # CHECK: vstrb.8 q0, [r8, #56] @ encoding: [0x88,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r8, #56]
 
 # CHECK: vstrb.8 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [r4, #56]!
 
 # CHECK: vstrb.8 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [r4, #56]!
 
 # CHECK: vstrb.8 q5, [r4], #-25 @ encoding: [0x24,0xec,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [r4], #-25
 
 # CHECK: vstrb.8 q5, [r10], #-25 @ encoding: [0x2a,0xec,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [r10], #-25
 
 # CHECK: vstrb.8 q5, [sp, #-25] @ encoding: [0x0d,0xed,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [sp, #-25]
 
 # CHECK: vstrb.8 q5, [sp, #127] @ encoding: [0x8d,0xed,0x7f,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [sp, #127]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: invalid operand for instruction
@@ -133,735 +133,735 @@ vstrb.u8 q0, [r0, #-128]!
 vstrb.u8 q0, [r0], #128
 
 # CHECK: vldrb.u16 q0, [r0] @ encoding: [0x90,0xfd,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r0]
 
 # CHECK: vldrb.u16 q1, [r0] @ encoding: [0x90,0xfd,0x80,0x2e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q1, [r0]
 
 # CHECK: vldrb.u16 q0, [r7] @ encoding: [0x97,0xfd,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r7]
 
 # CHECK: vldrb.u16 q3, [r7] @ encoding: [0x97,0xfd,0x80,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q3, [r7]
 
 # CHECK: vldrb.u16 q0, [r4, #56] @ encoding: [0x94,0xfd,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r4, #56]
 
 # CHECK: vldrb.u16 q4, [r4, #56] @ encoding: [0x94,0xfd,0xb8,0x8e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q4, [r4, #56]
 
 # CHECK: vldrb.u16 q0, [r2, #56] @ encoding: [0x92,0xfd,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r2, #56]
 
 # CHECK: vldrb.u16 q5, [r4, #56]! @ encoding: [0xb4,0xfd,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r4, #56]!
 
 # CHECK: vldrb.u16 q5, [r4, #56]! @ encoding: [0xb4,0xfd,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r4, #56]!
 
 # CHECK: vldrb.u16 q5, [r4], #-1 @ encoding: [0x34,0xfc,0x81,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r4], #-1
 
 # CHECK: vldrb.u16 q5, [r3], #-25 @ encoding: [0x33,0xfc,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r3], #-25
 
 # CHECK: vldrb.u16 q5, [r6, #-25] @ encoding: [0x16,0xfd,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r6, #-25]
 
 # CHECK: vldrb.u16 q5, [r6, #-64] @ encoding: [0x16,0xfd,0xc0,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r6, #-64]
 
 # CHECK: vldrb.s16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r0]
 
 # CHECK: vldrb.s16 q1, [r0] @ encoding: [0x90,0xed,0x80,0x2e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q1, [r0]
 
 # CHECK: vldrb.s16 q0, [r7] @ encoding: [0x97,0xed,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r7]
 
 # CHECK: vldrb.s16 q3, [r7] @ encoding: [0x97,0xed,0x80,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q3, [r7]
 
 # CHECK: vldrb.s16 q0, [r4, #56] @ encoding: [0x94,0xed,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r4, #56]
 
 # CHECK: vldrb.s16 q4, [r4, #56] @ encoding: [0x94,0xed,0xb8,0x8e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q4, [r4, #56]
 
 # CHECK: vldrb.s16 q0, [r2, #56] @ encoding: [0x92,0xed,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r2, #56]
 
 # CHECK: vldrb.s16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r4, #56]!
 
 # CHECK: vldrb.s16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r4, #56]!
 
 # CHECK: vldrb.s16 q5, [r4], #-25 @ encoding: [0x34,0xec,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r4], #-25
 
 # CHECK: vldrb.s16 q5, [r3], #-25 @ encoding: [0x33,0xec,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r3], #-25
 
 # CHECK: vldrb.s16 q5, [r6, #-25] @ encoding: [0x16,0xed,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r6, #-25]
 
 # CHECK: vldrb.s16 q5, [r6, #-64] @ encoding: [0x16,0xed,0xc0,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r6, #-64]
 
 # CHECK: vstrb.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r0]
 
 # CHECK: vstrb.16 q1, [r0] @ encoding: [0x80,0xed,0x80,0x2e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q1, [r0]
 
 # CHECK: vstrb.16 q0, [r7] @ encoding: [0x87,0xed,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r7]
 
 # CHECK: vstrb.16 q3, [r7] @ encoding: [0x87,0xed,0x80,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q3, [r7]
 
 # CHECK: vstrb.16 q0, [r4, #56] @ encoding: [0x84,0xed,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r4, #56]
 
 # CHECK: vstrb.16 q4, [r4, #56] @ encoding: [0x84,0xed,0xb8,0x8e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q4, [r4, #56]
 
 # CHECK: vstrb.16 q0, [r5, #56] @ encoding: [0x85,0xed,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r5, #56]
 
 # CHECK: vstrb.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r4, #56]!
 
 # CHECK: vstrb.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r4, #56]!
 
 # CHECK: vstrb.16 q5, [r4], #-25 @ encoding: [0x24,0xec,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r4], #-25
 
 # CHECK: vstrb.16 q5, [r3], #-25 @ encoding: [0x23,0xec,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r3], #-25
 
 # CHECK: vstrb.16 q5, [r2, #-25] @ encoding: [0x02,0xed,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r2, #-25]
 
 # CHECK: vstrb.16 q5, [r2, #-64] @ encoding: [0x02,0xed,0xc0,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r2, #-64]
 
 # CHECK: vldrb.u32 q0, [r0] @ encoding: [0x90,0xfd,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r0]
 
 # CHECK: vldrb.u32 q1, [r0] @ encoding: [0x90,0xfd,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q1, [r0]
 
 # CHECK: vldrb.u32 q0, [r7] @ encoding: [0x97,0xfd,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r7]
 
 # CHECK: vldrb.u32 q3, [r7] @ encoding: [0x97,0xfd,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q3, [r7]
 
 # CHECK: vldrb.u32 q0, [r4, #56] @ encoding: [0x94,0xfd,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r4, #56]
 
 # CHECK: vldrb.u32 q4, [r4, #56] @ encoding: [0x94,0xfd,0x38,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q4, [r4, #56]
 
 # CHECK: vldrb.u32 q0, [r2, #56] @ encoding: [0x92,0xfd,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r2, #56]
 
 # CHECK: vldrb.u32 q5, [r4, #56]! @ encoding: [0xb4,0xfd,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r4, #56]!
 
 # CHECK: vldrb.u32 q5, [r4, #56]! @ encoding: [0xb4,0xfd,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r4, #56]!
 
 # CHECK: vldrb.u32 q5, [r4], #-25 @ encoding: [0x34,0xfc,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r4], #-25
 
 # CHECK: vldrb.u32 q5, [r3], #-25 @ encoding: [0x33,0xfc,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r3], #-25
 
 # CHECK: vldrb.u32 q5, [r6, #-25] @ encoding: [0x16,0xfd,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r6, #-25]
 
 # CHECK: vldrb.u32 q5, [r6, #-64] @ encoding: [0x16,0xfd,0x40,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r6, #-64]
 
 # CHECK: vldrb.s32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r0]
 
 # CHECK: vldrb.s32 q1, [r0] @ encoding: [0x90,0xed,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q1, [r0]
 
 # CHECK: vldrb.s32 q0, [r7] @ encoding: [0x97,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r7]
 
 # CHECK: vldrb.s32 q3, [r7] @ encoding: [0x97,0xed,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q3, [r7]
 
 # CHECK: vldrb.s32 q0, [r4, #56] @ encoding: [0x94,0xed,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r4, #56]
 
 # CHECK: vldrb.s32 q4, [r4, #56] @ encoding: [0x94,0xed,0x38,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q4, [r4, #56]
 
 # CHECK: vldrb.s32 q0, [r2, #56] @ encoding: [0x92,0xed,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r2, #56]
 
 # CHECK: vldrb.s32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r4, #56]!
 
 # CHECK: vldrb.s32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r4, #56]!
 
 # CHECK: vldrb.s32 q5, [r4], #-25 @ encoding: [0x34,0xec,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r4], #-25
 
 # CHECK: vldrb.s32 q5, [r3], #-25 @ encoding: [0x33,0xec,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r3], #-25
 
 # CHECK: vldrb.s32 q5, [r6, #-25] @ encoding: [0x16,0xed,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r6, #-25]
 
 # CHECK: vldrb.s32 q5, [r6, #-64] @ encoding: [0x16,0xed,0x40,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r6, #-64]
 
 # CHECK: vstrb.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r0]
 
 # CHECK: vstrb.32 q1, [r0] @ encoding: [0x80,0xed,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q1, [r0]
 
 # CHECK: vstrb.32 q0, [r7] @ encoding: [0x87,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r7]
 
 # CHECK: vstrb.32 q3, [r7] @ encoding: [0x87,0xed,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q3, [r7]
 
 # CHECK: vstrb.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r4, #56]
 
 # CHECK: vstrb.32 q4, [r4, #56] @ encoding: [0x84,0xed,0x38,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q4, [r4, #56]
 
 # CHECK: vstrb.32 q0, [r5, #56] @ encoding: [0x85,0xed,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r5, #56]
 
 # CHECK: vstrb.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r4, #56]!
 
 # CHECK: vstrb.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r4, #56]!
 
 # CHECK: vstrb.32 q5, [r4], #-25 @ encoding: [0x24,0xec,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r4], #-25
 
 # CHECK: vstrb.32 q5, [r3], #-25 @ encoding: [0x23,0xec,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r3], #-25
 
 # CHECK: vstrb.32 q5, [r2, #-25] @ encoding: [0x02,0xed,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r2, #-25]
 
 # CHECK: vstrb.32 q5, [r2, #-64] @ encoding: [0x02,0xed,0x40,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r2, #-64]
 
 # CHECK: vldrh.u16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0]
 
 # CHECK: vldrh.u16 q1, [r0] @ encoding: [0x90,0xed,0x80,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q1, [r0]
 
 # CHECK: vldrh.u16 q0, [r11] @ encoding: [0x9b,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r11]
 
 # CHECK: vldrh.u16 q3, [r11] @ encoding: [0x9b,0xed,0x80,0x7e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q3, [r11]
 
 # CHECK: vldrh.u16 q0, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r4, #56]
 
 # CHECK: vldrh.u16 q4, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q4, [r4, #56]
 
 # CHECK: vldrh.u16 q0, [r8, #56] @ encoding: [0x98,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r8, #56]
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q5, [r4], #-26 @ encoding: [0x34,0xec,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r4], #-26
 
 # CHECK: vldrh.u16 q5, [r10], #-26 @ encoding: [0x3a,0xec,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r10], #-26
 
 # CHECK: vldrh.u16 q5, [sp, #-26] @ encoding: [0x1d,0xed,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [sp, #-26]
 
 # CHECK: vldrh.u16 q5, [sp, #-64] @ encoding: [0x1d,0xed,0xa0,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [sp, #-64]
 
 # CHECK: vldrh.u16 q5, [sp, #-254] @ encoding: [0x1d,0xed,0xff,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [sp, #-254]
 
 # CHECK: vldrh.u16 q5, [r10], #254 @ encoding: [0xba,0xec,0xff,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r10], #254
 
 # CHECK: vstrh.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r0]
 
 # CHECK: vstrh.16 q1, [r0] @ encoding: [0x80,0xed,0x80,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q1, [r0]
 
 # CHECK: vstrh.16 q0, [r11] @ encoding: [0x8b,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r11]
 
 # CHECK: vstrh.16 q3, [r11] @ encoding: [0x8b,0xed,0x80,0x7e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q3, [r11]
 
 # CHECK: vstrh.16 q0, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r4, #56]
 
 # CHECK: vstrh.16 q4, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q4, [r4, #56]
 
 # CHECK: vstrh.16 q0, [r8, #56] @ encoding: [0x88,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r8, #56]
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q5, [r4], #-26 @ encoding: [0x24,0xec,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r4], #-26
 
 # CHECK: vstrh.16 q5, [r10], #-26 @ encoding: [0x2a,0xec,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r10], #-26
 
 # CHECK: vstrh.16 q5, [sp, #-26] @ encoding: [0x0d,0xed,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [sp, #-26]
 
 # CHECK: vstrh.16 q5, [sp, #-64] @ encoding: [0x0d,0xed,0xa0,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [sp, #-64]
 
 # CHECK: vstrh.16 q5, [sp, #-254] @ encoding: [0x0d,0xed,0xff,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [sp, #-254]
 
 # CHECK: vstrh.16 q5, [r10], #254 @ encoding: [0xaa,0xec,0xff,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r10], #254
 
 # CHECK: vldrh.u32 q0, [r0] @ encoding: [0x98,0xfd,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r0]
 
 # CHECK: vldrh.u32 q1, [r0] @ encoding: [0x98,0xfd,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q1, [r0]
 
 # CHECK: vldrh.u32 q0, [r7] @ encoding: [0x9f,0xfd,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r7]
 
 # CHECK: vldrh.u32 q3, [r7] @ encoding: [0x9f,0xfd,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q3, [r7]
 
 # CHECK: vldrh.u32 q0, [r4, #56] @ encoding: [0x9c,0xfd,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r4, #56]
 
 # CHECK: vldrh.u32 q4, [r4, #56] @ encoding: [0x9c,0xfd,0x1c,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q4, [r4, #56]
 
 # CHECK: vldrh.u32 q0, [r2, #56] @ encoding: [0x9a,0xfd,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r2, #56]
 
 # CHECK: vldrh.u32 q5, [r4, #56]! @ encoding: [0xbc,0xfd,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r4, #56]!
 
 # CHECK: vldrh.u32 q5, [r4, #56]! @ encoding: [0xbc,0xfd,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r4, #56]!
 
 # CHECK: vldrh.u32 q5, [r4], #-26 @ encoding: [0x3c,0xfc,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r4], #-26
 
 # CHECK: vldrh.u32 q5, [r3], #-26 @ encoding: [0x3b,0xfc,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r3], #-26
 
 # CHECK: vldrh.u32 q5, [r6, #-26] @ encoding: [0x1e,0xfd,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r6, #-26]
 
 # CHECK: vldrh.u32 q5, [r6, #-64] @ encoding: [0x1e,0xfd,0x20,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r6, #-64]
 
 # CHECK: vldrh.u32 q5, [r6, #-254] @ encoding: [0x1e,0xfd,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r6, #-254]
 
 # CHECK: vldrh.u32 q5, [r4, #254]! @ encoding: [0xbc,0xfd,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r4, #254]!
 
 # CHECK: vldrh.s32 q0, [r0] @ encoding: [0x98,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r0]
 
 # CHECK: vldrh.s32 q1, [r0] @ encoding: [0x98,0xed,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q1, [r0]
 
 # CHECK: vldrh.s32 q0, [r7] @ encoding: [0x9f,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r7]
 
 # CHECK: vldrh.s32 q3, [r7] @ encoding: [0x9f,0xed,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q3, [r7]
 
 # CHECK: vldrh.s32 q0, [r4, #56] @ encoding: [0x9c,0xed,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r4, #56]
 
 # CHECK: vldrh.s32 q4, [r4, #56] @ encoding: [0x9c,0xed,0x1c,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q4, [r4, #56]
 
 # CHECK: vldrh.s32 q0, [r2, #56] @ encoding: [0x9a,0xed,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r2, #56]
 
 # CHECK: vldrh.s32 q5, [r4, #56]! @ encoding: [0xbc,0xed,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r4, #56]!
 
 # CHECK: vldrh.s32 q5, [r4, #56]! @ encoding: [0xbc,0xed,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r4, #56]!
 
 # CHECK: vldrh.s32 q5, [r4], #-26 @ encoding: [0x3c,0xec,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r4], #-26
 
 # CHECK: vldrh.s32 q5, [r3], #-26 @ encoding: [0x3b,0xec,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r3], #-26
 
 # CHECK: vldrh.s32 q5, [r6, #-26] @ encoding: [0x1e,0xed,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r6, #-26]
 
 # CHECK: vldrh.s32 q5, [r6, #-64] @ encoding: [0x1e,0xed,0x20,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r6, #-64]
 
 # CHECK: vldrh.s32 q5, [r6, #-254] @ encoding: [0x1e,0xed,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r6, #-254]
 
 # CHECK: vldrh.s32 q5, [r4, #254]! @ encoding: [0xbc,0xed,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r4, #254]!
 
 # CHECK: vstrh.32 q0, [r0] @ encoding: [0x88,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r0]
 
 # CHECK: vstrh.32 q1, [r0] @ encoding: [0x88,0xed,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q1, [r0]
 
 # CHECK: vstrh.32 q0, [r7] @ encoding: [0x8f,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r7]
 
 # CHECK: vstrh.32 q3, [r7] @ encoding: [0x8f,0xed,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q3, [r7]
 
 # CHECK: vstrh.32 q0, [r4, #56] @ encoding: [0x8c,0xed,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r4, #56]
 
 # CHECK: vstrh.32 q4, [r4, #56] @ encoding: [0x8c,0xed,0x1c,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q4, [r4, #56]
 
 # CHECK: vstrh.32 q0, [r5, #56] @ encoding: [0x8d,0xed,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r5, #56]
 
 # CHECK: vstrh.32 q5, [r4, #56]! @ encoding: [0xac,0xed,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r4, #56]!
 
 # CHECK: vstrh.32 q5, [r4, #56]! @ encoding: [0xac,0xed,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r4, #56]!
 
 # CHECK: vstrh.32 q5, [r4], #-26 @ encoding: [0x2c,0xec,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r4], #-26
 
 # CHECK: vstrh.32 q5, [r3], #-26 @ encoding: [0x2b,0xec,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r3], #-26
 
 # CHECK: vstrh.32 q5, [r2, #-26] @ encoding: [0x0a,0xed,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r2, #-26]
 
 # CHECK: vstrh.32 q5, [r2, #-64] @ encoding: [0x0a,0xed,0x20,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r2, #-64]
 
 # CHECK: vstrh.32 q5, [r2, #-254] @ encoding: [0x0a,0xed,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r2, #-254]
 
 # CHECK: vstrh.32 q5, [r4, #254]! @ encoding: [0xac,0xed,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r4, #254]!
 
 # CHECK: vldrw.u32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0]
 
 # CHECK: vldrw.u32 q1, [r0] @ encoding: [0x90,0xed,0x00,0x3f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q1, [r0]
 
 # CHECK: vldrw.u32 q0, [r11] @ encoding: [0x9b,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r11]
 
 # CHECK: vldrw.u32 q3, [r11] @ encoding: [0x9b,0xed,0x00,0x7f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q3, [r11]
 
 # CHECK: vldrw.u32 q0, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r4, #56]
 
 # CHECK: vldrw.u32 q4, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x9f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q4, [r4, #56]
 
 # CHECK: vldrw.u32 q0, [r8, #56] @ encoding: [0x98,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r8, #56]
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q5, [r4], #-28 @ encoding: [0x34,0xec,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r4], #-28
 
 # CHECK: vldrw.u32 q5, [r10], #-28 @ encoding: [0x3a,0xec,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r10], #-28
 
 # CHECK: vldrw.u32 q5, [sp, #-28] @ encoding: [0x1d,0xed,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [sp, #-28]
 
 # CHECK: vldrw.u32 q5, [sp, #-64] @ encoding: [0x1d,0xed,0x10,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [sp, #-64]
 
 # CHECK: vldrw.u32 q5, [sp, #-508] @ encoding: [0x1d,0xed,0x7f,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [sp, #-508]
 
 # CHECK: vldrw.u32 q5, [r4, #508]! @ encoding: [0xb4,0xed,0x7f,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r4, #508]!
 
 # CHECK: vstrw.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r0]
 
 # CHECK: vstrw.32 q1, [r0] @ encoding: [0x80,0xed,0x00,0x3f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q1, [r0]
 
 # CHECK: vstrw.32 q0, [r11] @ encoding: [0x8b,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r11]
 
 # CHECK: vstrw.32 q3, [r11] @ encoding: [0x8b,0xed,0x00,0x7f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q3, [r11]
 
 # CHECK: vstrw.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r4, #56]
 
 # CHECK: vstrw.32 q4, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x9f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q4, [r4, #56]
 
 # CHECK: vstrw.32 q0, [r8, #56] @ encoding: [0x88,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r8, #56]
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q5, [r4], #-28 @ encoding: [0x24,0xec,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r4], #-28
 
 # CHECK: vstrw.32 q5, [r10], #-28 @ encoding: [0x2a,0xec,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r10], #-28
 
 # CHECK: vstrw.32 q5, [sp, #-28] @ encoding: [0x0d,0xed,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [sp, #-28]
 
 # CHECK: vstrw.32 q5, [sp, #-64] @ encoding: [0x0d,0xed,0x10,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [sp, #-64]
 
 # CHECK: vstrw.32 q5, [sp, #-508] @ encoding: [0x0d,0xed,0x7f,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [sp, #-508]
 
 # CHECK: vstrw.32 q5, [r4, #508]! @ encoding: [0xa4,0xed,0x7f,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r4, #508]!
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: invalid operand for instruction
@@ -885,271 +885,271 @@ vstrw.32 q5, [sp, #-3]
 vstrw.32 q5, [sp, #512]
 
 # CHECK: vldrb.u8 q0, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r0, q1]
 
 # CHECK: vldrb.u8 q3, [r10, q1] @ encoding: [0x9a,0xfc,0x02,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q3, [r10, q1]
 
 # CHECK: vldrb.u16 q0, [r0, q1] @ encoding: [0x90,0xfc,0x82,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r0, q1]
 
 # CHECK: vldrb.u16 q3, [r9, q1] @ encoding: [0x99,0xfc,0x82,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q3, [r9, q1]
 
 # CHECK: vldrb.s16 q0, [r0, q1] @ encoding: [0x90,0xec,0x82,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r0, q1]
 
 # CHECK: vldrb.s16 q3, [sp, q1] @ encoding: [0x9d,0xec,0x82,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q3, [sp, q1]
 
 # CHECK: vldrb.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r0, q1]
 
 # CHECK: vldrb.u32 q3, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q3, [r0, q1]
 
 # CHECK: vldrb.s32 q0, [r0, q1] @ encoding: [0x90,0xec,0x02,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r0, q1]
 
 # CHECK: vldrb.s32 q3, [r0, q1] @ encoding: [0x90,0xec,0x02,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q3, [r0, q1]
 
 # CHECK: vldrh.u16 q0, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0, q1]
 
 # CHECK: vldrh.u16 q3, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q3, [r0, q1]
 
 # CHECK: vldrh.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x12,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r0, q1]
 
 # CHECK: vldrh.u32 q3, [r0, q1] @ encoding: [0x90,0xfc,0x12,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q3, [r0, q1]
 
 # CHECK: vldrh.s32 q0, [r0, q1] @ encoding: [0x90,0xec,0x12,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r0, q1]
 
 # CHECK: vldrh.s32 q3, [r0, q1] @ encoding: [0x90,0xec,0x12,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q3, [r0, q1]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0, q0, uxtw #1]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r0, q0, uxtw #1]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r0, q0, uxtw #1]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0, q0, uxtw #2]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [r0, q0, uxtw #3]
 
 # CHECK: vldrh.u16 q0, [r0, q1, uxtw #1] @ encoding: [0x90,0xfc,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vldrw.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0, q1]
 
 # CHECK: vldrw.u32 q3, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q3, [r0, q1]
 
 # CHECK: vldrw.u32 q0, [r0, q1, uxtw #2] @ encoding: [0x90,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vldrw.u32 q0, [sp, q1, uxtw #2] @ encoding: [0x9d,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [sp, q1, uxtw #2]
 
 # CHECK: vldrd.u64 q0, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [r0, q1]
 
 # CHECK: vldrd.u64 q3, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q3, [r0, q1]
 
 # CHECK: vldrd.u64 q0, [r0, q1, uxtw #3] @ encoding: [0x90,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vldrd.u64 q0, [sp, q1, uxtw #3] @ encoding: [0x9d,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [sp, q1, uxtw #3]
 
 # CHECK: vstrb.8 q0, [r0, q1] @ encoding: [0x80,0xec,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r0, q1]
 
 # CHECK: vstrb.8 q3, [r10, q1] @ encoding: [0x8a,0xec,0x02,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q3, [r10, q1]
 
 # CHECK: vstrb.8 q3, [r0, q3] @ encoding: [0x80,0xec,0x06,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q3, [r0, q3]
 
 # CHECK: vstrb.16 q0, [r0, q1] @ encoding: [0x80,0xec,0x82,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r0, q1]
 
 # CHECK: vstrb.16 q3, [sp, q1] @ encoding: [0x8d,0xec,0x82,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q3, [sp, q1]
 
 # CHECK: vstrb.16 q3, [r0, q3] @ encoding: [0x80,0xec,0x86,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q3, [r0, q3]
 
 # CHECK: vstrb.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x02,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r0, q1]
 
 # CHECK: vstrb.32 q3, [r0, q1] @ encoding: [0x80,0xec,0x02,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q3, [r0, q1]
 
 # CHECK: vstrb.32 q3, [r0, q3] @ encoding: [0x80,0xec,0x06,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q3, [r0, q3]
 
 # CHECK: vstrh.16 q0, [r0, q1] @ encoding: [0x80,0xec,0x92,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q1] @ encoding: [0x80,0xec,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q3, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q3] @ encoding: [0x80,0xec,0x96,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q3, [r0, q3]
 
 # CHECK: vstrh.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x12,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r0, q1]
 
 # CHECK: vstrh.32 q3, [r0, q1] @ encoding: [0x80,0xec,0x12,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q3, [r0, q1]
 
 # CHECK: vstrh.32 q3, [r0, q3] @ encoding: [0x80,0xec,0x16,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q3, [r0, q3]
 
 # CHECK: vstrh.16 q0, [r0, q1, uxtw #1] @ encoding: [0x80,0xec,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vstrh.32 q3, [r8, q3, uxtw #1] @ encoding: [0x88,0xec,0x17,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q3, [r8, q3, uxtw #1]
 
 # CHECK: vstrw.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r0, q1]
 
 # CHECK: vstrw.32 q3, [r0, q1] @ encoding: [0x80,0xec,0x42,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q3, [r0, q1]
 
 # CHECK: vstrw.32 q3, [r0, q3] @ encoding: [0x80,0xec,0x46,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q3, [r0, q3]
 
 # CHECK: vstrw.32 q0, [r0, q1, uxtw #2] @ encoding: [0x80,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vstrw.32 q0, [sp, q1, uxtw #2] @ encoding: [0x8d,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [sp, q1, uxtw #2]
 
 # CHECK: vstrd.64 q0, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q0, [r0, q1]
 
 # CHECK: vstrd.64 q3, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q3, [r0, q1]
 
 # CHECK: vstrd.64 q3, [r0, q3] @ encoding: [0x80,0xec,0xd6,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q3, [r0, q3]
 
 # CHECK: vstrd.64 q0, [r0, q1, uxtw #3] @ encoding: [0x80,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vstrd.64 q0, [sp, q1, uxtw #3] @ encoding: [0x8d,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q0, [sp, q1, uxtw #3]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: operand must be a register in range [q0, q7]
@@ -1189,79 +1189,79 @@ vstrh.16 q0, [r0, q1, uxtw #2]
 vstrb.32 q0, [r11, q1, uxtw #1]
 
 # CHECK: vldrw.u32 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [q1]
 
 # CHECK: vldrw.u32 q7, [q1] @ encoding: [0x92,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1]
 
 # CHECK: vldrw.u32 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1]!
 
 # CHECK: vldrw.u32 q7, [q1, #4] @ encoding: [0x92,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #4]
 
 # CHECK: vldrw.u32 q7, [q1, #-4] @ encoding: [0x12,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #-4]
 
 # CHECK: vldrw.u32 q7, [q1, #508] @ encoding: [0x92,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #508]
 
 # CHECK: vldrw.u32 q7, [q1, #-508] @ encoding: [0x12,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #-508]
 
 # CHECK: vldrw.u32 q7, [q1, #264] @ encoding: [0x92,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #264]
 
 # CHECK: vldrw.u32 q7, [q1, #4]! @ encoding: [0xb2,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #4]!
 
 # CHECK: vstrw.32 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [q1]
 
 # CHECK: vstrw.32 q1, [q1] @ encoding: [0x82,0xfd,0x00,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q1, [q1]
 
 # CHECK: vstrw.32 q7, [q1] @ encoding: [0x82,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1]
 
 # CHECK: vstrw.32 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1]!
 
 # CHECK: vstrw.32 q7, [q7] @ encoding: [0x8e,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q7]
 
 # CHECK: vstrw.32 q7, [q1, #4] @ encoding: [0x82,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #4]
 
 # CHECK: vstrw.32 q7, [q1, #-4] @ encoding: [0x02,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #-4]
 
 # CHECK: vstrw.32 q7, [q1, #508] @ encoding: [0x82,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #508]
 
 # CHECK: vstrw.32 q7, [q1, #-508] @ encoding: [0x02,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #-508]
 
 # CHECK: vstrw.32 q7, [q1, #264]! @ encoding: [0xa2,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #264]!
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: operand must be a register in range [q0, q7]
@@ -1277,103 +1277,103 @@ vstrw.32 q4, [q1, #3]!
 vldrw.u32 q7, [q1, #512]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector pointer register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q1, [q1, #264]
 
 # CHECK: vldrd.u64 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [q1]
 
 # CHECK: vldrd.u64 q7, [q1] @ encoding: [0x92,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1]
 
 # CHECK: vldrd.u64 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1]!
 
 # CHECK: vldrd.u64 q7, [q1, #8] @ encoding: [0x92,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #8]
 
 # CHECK: vldrd.u64 q7, [q1, #-8] @ encoding: [0x12,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #-8]
 
 # CHECK: vldrd.u64 q7, [q1, #1016] @ encoding: [0x92,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #1016]
 
 # CHECK: vldrd.u64 q7, [q1, #-1016] @ encoding: [0x12,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #-1016]
 
 # CHECK: vldrd.u64 q7, [q1, #264] @ encoding: [0x92,0xfd,0x21,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #264]
 
 # CHECK: vldrd.u64 q7, [q1, #624] @ encoding: [0x92,0xfd,0x4e,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #624]
 
 # CHECK: vldrd.u64 q7, [q1, #264] @ encoding: [0x92,0xfd,0x21,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #264]
 
 # CHECK: vldrd.u64 q7, [q1, #-1016]! @ encoding: [0x32,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #-1016]!
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector pointer register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q6, [q6]
 
 # CHECK: vstrd.64 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q0, [q1]
 
 # CHECK: vstrd.64 q1, [q1] @ encoding: [0x82,0xfd,0x00,0x3f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q1, [q1]
 
 # CHECK: vstrd.64 q7, [q1] @ encoding: [0x82,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1]
 
 # CHECK: vstrd.64 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1]!
 
 # CHECK: vstrd.64 q7, [q7] @ encoding: [0x8e,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q7]
 
 # CHECK: vstrd.64 q7, [q1, #8] @ encoding: [0x82,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #8]
 
 # CHECK: vstrd.64 q7, [q1, #-8]! @ encoding: [0x22,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #-8]!
 
 # CHECK: vstrd.64 q7, [q1, #1016] @ encoding: [0x82,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #1016]
 
 # CHECK: vstrd.64 q7, [q1, #-1016] @ encoding: [0x02,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #-1016]
 
 # CHECK: vstrd.64 q7, [q1, #264] @ encoding: [0x82,0xfd,0x21,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #264]
 
 # CHECK: vstrd.64 q7, [q1, #624] @ encoding: [0x82,0xfd,0x4e,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #624]
 
 # CHECK: vstrd.64 q7, [q1, #264] @ encoding: [0x82,0xfd,0x21,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #264]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: operand must be a register in range [q0, q7]
@@ -1393,547 +1393,547 @@ vstrd.64 q4, [q1, #3]
 vstrd.64 q4, [q1, #4]
 
 # CHECK: vldrb.u8 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s8 q0, [r0]
 
 # CHECK: vldrb.u8 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.8 q0, [r0]
 
 # CHECK: vldrb.u8 q0, [r8, #56] @ encoding: [0x98,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s8 q0, [r8, #56]
 
 # CHECK: vldrb.u8 q0, [r8, #56] @ encoding: [0x98,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.8 q0, [r8, #56]
 
 # CHECK: vldrb.u8 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s8 q5, [r4, #56]!
 
 # CHECK: vldrb.u8 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.8 q5, [r4, #56]!
 
 # CHECK: vstrb.8 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.u8 q0, [r0]
 
 # CHECK: vstrb.8 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.s8 q0, [r0]
 
 # CHECK: vstrb.8 q4, [r4, #56] @ encoding: [0x84,0xed,0x38,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.u8 q4, [r4, #56]
 
 # CHECK: vstrb.8 q4, [r4, #56] @ encoding: [0x84,0xed,0x38,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.s8 q4, [r4, #56]
 
 # CHECK: vstrb.8 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.u8 q5, [r4, #56]!
 
 # CHECK: vstrb.8 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.s8 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q0, [r0]
 
 # CHECK: vldrh.u16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q0, [r0]
 
 # CHECK: vldrh.u16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q0, [r0]
 
 # CHECK: vldrh.u16 q0, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q0, [r4, #56]
 
 # CHECK: vldrh.u16 q0, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q0, [r4, #56]
 
 # CHECK: vldrh.u16 q0, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q0, [r4, #56]
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q0, [r0]
 
 # CHECK: vstrh.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q0, [r0]
 
 # CHECK: vstrh.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q0, [r0]
 
 # CHECK: vstrh.16 q0, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q0, [r4, #56]
 
 # CHECK: vstrh.16 q0, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q0, [r4, #56]
 
 # CHECK: vstrh.16 q0, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q0, [r4, #56]
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [r0]
 
 # CHECK: vldrw.u32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [r0]
 
 # CHECK: vldrw.u32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [r0]
 
 # CHECK: vldrw.u32 q0, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [r4, #56]
 
 # CHECK: vldrw.u32 q0, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [r4, #56]
 
 # CHECK: vldrw.u32 q0, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [r4, #56]
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [r0]
 
 # CHECK: vstrw.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [r0]
 
 # CHECK: vstrw.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [r0]
 
 # CHECK: vstrw.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [r4, #56]
 
 # CHECK: vstrw.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [r4, #56]
 
 # CHECK: vstrw.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [r4, #56]
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q5, [r4, #56]!
 
 # CHECK: vldrb.u8 q0, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s8 q0, [r0, q1]
 
 # CHECK: vldrb.u8 q0, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.8 q0, [r0, q1]
 
 # CHECK: vldrh.u16 q3, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q3, [r0, q1]
 
 # CHECK: vldrh.u16 q3, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q3, [r0, q1]
 
 # CHECK: vldrh.u16 q3, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q3, [r0, q1]
 
 # CHECK: vldrh.u16 q0, [r0, q1, uxtw #1] @ encoding: [0x90,0xfc,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vldrh.u16 q0, [r0, q1, uxtw #1] @ encoding: [0x90,0xfc,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vldrh.u16 q0, [r0, q1, uxtw #1] @ encoding: [0x90,0xfc,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vldrw.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [r0, q1]
 
 # CHECK: vldrw.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [r0, q1]
 
 # CHECK: vldrw.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [r0, q1]
 
 # CHECK: vldrw.u32 q0, [r0, q1, uxtw #2] @ encoding: [0x90,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vldrw.u32 q0, [r0, q1, uxtw #2] @ encoding: [0x90,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vldrw.u32 q0, [r0, q1, uxtw #2] @ encoding: [0x90,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vldrd.u64 q0, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q0, [r0, q1]
 
 # CHECK: vldrd.u64 q0, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q0, [r0, q1]
 
 # CHECK: vldrd.u64 q0, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q0, [r0, q1]
 
 # CHECK: vldrd.u64 q0, [r0, q1, uxtw #3] @ encoding: [0x90,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vldrd.u64 q0, [r0, q1, uxtw #3] @ encoding: [0x90,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vldrd.u64 q0, [r0, q1, uxtw #3] @ encoding: [0x90,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vstrb.8 q0, [r0, q1] @ encoding: [0x80,0xec,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.u8 q0, [r0, q1]
 
 # CHECK: vstrb.8 q0, [r0, q1] @ encoding: [0x80,0xec,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.s8 q0, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q1] @ encoding: [0x80,0xec,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q3, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q1] @ encoding: [0x80,0xec,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q3, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q1] @ encoding: [0x80,0xec,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q3, [r0, q1]
 
 # CHECK: vstrh.16 q0, [r0, q1, uxtw #1] @ encoding: [0x80,0xec,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vstrh.16 q0, [r0, q1, uxtw #1] @ encoding: [0x80,0xec,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vstrh.16 q0, [r0, q1, uxtw #1] @ encoding: [0x80,0xec,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vstrw.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [r0, q1]
 
 # CHECK: vstrw.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [r0, q1]
 
 # CHECK: vstrw.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [r0, q1]
 
 # CHECK: vstrw.32 q0, [r0, q1, uxtw #2] @ encoding: [0x80,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vstrw.32 q0, [r0, q1, uxtw #2] @ encoding: [0x80,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vstrw.32 q0, [r0, q1, uxtw #2] @ encoding: [0x80,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vstrd.64 q3, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q3, [r0, q1]
 
 # CHECK: vstrd.64 q3, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q3, [r0, q1]
 
 # CHECK: vstrd.64 q3, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q3, [r0, q1]
 
 # CHECK: vstrd.64 q0, [r0, q1, uxtw #3] @ encoding: [0x80,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vstrd.64 q0, [r0, q1, uxtw #3] @ encoding: [0x80,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vstrd.64 q0, [r0, q1, uxtw #3] @ encoding: [0x80,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vldrw.u32 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [q1]
 
 # CHECK: vldrw.u32 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [q1]
 
 # CHECK: vldrw.u32 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [q1]
 
 # CHECK: vldrw.u32 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q7, [q1]!
 
 # CHECK: vldrw.u32 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q7, [q1]!
 
 # CHECK: vldrw.u32 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q7, [q1]!
 
 # CHECK: vldrw.u32 q7, [q1, #4] @ encoding: [0x92,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q7, [q1, #4]
 
 # CHECK: vldrw.u32 q7, [q1, #4] @ encoding: [0x92,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q7, [q1, #4]
 
 # CHECK: vldrw.u32 q7, [q1, #4] @ encoding: [0x92,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q7, [q1, #4]
 
 # CHECK: vldrw.u32 q7, [q1, #4]! @ encoding: [0xb2,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q7, [q1, #4]!
 
 # CHECK: vldrw.u32 q7, [q1, #4]! @ encoding: [0xb2,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q7, [q1, #4]!
 
 # CHECK: vldrw.u32 q7, [q1, #4]! @ encoding: [0xb2,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #4]!
 
 # CHECK: vstrw.32 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [q1]
 
 # CHECK: vstrw.32 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [q1]
 
 # CHECK: vstrw.32 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [q1]
 
 # CHECK: vstrw.32 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q7, [q1]!
 
 # CHECK: vstrw.32 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q7, [q1]!
 
 # CHECK: vstrw.32 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q7, [q1]!
 
 # CHECK: vstrw.32 q7, [q1, #508] @ encoding: [0x82,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q7, [q1, #508]
 
 # CHECK: vstrw.32 q7, [q1, #508] @ encoding: [0x82,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q7, [q1, #508]
 
 # CHECK: vstrw.32 q7, [q1, #508] @ encoding: [0x82,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q7, [q1, #508]
 
 # CHECK: vstrw.32 q7, [q1, #264]! @ encoding: [0xa2,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q7, [q1, #264]!
 
 # CHECK: vstrw.32 q7, [q1, #264]! @ encoding: [0xa2,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q7, [q1, #264]!
 
 # CHECK: vstrw.32 q7, [q1, #264]! @ encoding: [0xa2,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q7, [q1, #264]!
 
 # CHECK: vldrd.u64 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q0, [q1]
 
 # CHECK: vldrd.u64 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q0, [q1]
 
 # CHECK: vldrd.u64 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q0, [q1]
 
 # CHECK: vldrd.u64 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q7, [q1]!
 
 # CHECK: vldrd.u64 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q7, [q1]!
 
 # CHECK: vldrd.u64 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q7, [q1]!
 
 # CHECK: vldrd.u64 q7, [q1, #8] @ encoding: [0x92,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q7, [q1, #8]
 
 # CHECK: vldrd.u64 q7, [q1, #8] @ encoding: [0x92,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q7, [q1, #8]
 
 # CHECK: vldrd.u64 q7, [q1, #8] @ encoding: [0x92,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q7, [q1, #8]
 
 # CHECK: vldrd.u64 q7, [q1, #-1016]! @ encoding: [0x32,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q7, [q1, #-1016]!
 
 # CHECK: vldrd.u64 q7, [q1, #-1016]! @ encoding: [0x32,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q7, [q1, #-1016]!
 
 # CHECK: vldrd.u64 q7, [q1, #-1016]! @ encoding: [0x32,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q7, [q1, #-1016]!
 
 # CHECK: vstrd.64 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q0, [q1]
 
 # CHECK: vstrd.64 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q0, [q1]
 
 # CHECK: vstrd.64 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q0, [q1]
 
 # CHECK: vstrd.64 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q7, [q1]!
 
 # CHECK: vstrd.64 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q7, [q1]!
 
 # CHECK: vstrd.64 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q7, [q1]!
 
 # CHECK: vstrd.64 q7, [q1, #1016] @ encoding: [0x82,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q7, [q1, #1016]
 
 # CHECK: vstrd.64 q7, [q1, #1016] @ encoding: [0x82,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q7, [q1, #1016]
 
 # CHECK: vstrd.64 q7, [q1, #1016] @ encoding: [0x82,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q7, [q1, #1016]
 
 # CHECK: vstrd.64 q7, [q1, #-8]! @ encoding: [0x22,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q7, [q1, #-8]!
 
 # CHECK: vstrd.64 q7, [q1, #-8]! @ encoding: [0x22,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q7, [q1, #-8]!
 
 # CHECK: vstrd.64 q7, [q1, #-8]! @ encoding: [0x22,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q7, [q1, #-8]!
 
 vpste
diff --git a/llvm/test/MC/ARM/mve-misc.s b/llvm/test/MC/ARM/mve-misc.s
index f3af9e0afe64f..251a77b4710dc 100644
--- a/llvm/test/MC/ARM/mve-misc.s
+++ b/llvm/test/MC/ARM/mve-misc.s
@@ -7,63 +7,63 @@
 # RUN:     FileCheck --check-prefix=ERROR-NOMVE < %t %s
 
 # CHECK: vpsel   q0, q5, q2  @ encoding: [0x3b,0xfe,0x05,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vpsel   q0, q5, q2
 
 # CHECK: vpnot  @ encoding: [0x31,0xfe,0x4d,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vpnot
 
 # CHECK: wlstp.8     lr, r0, #1668  @ encoding: [0x00,0xf0,0x43,0xc3]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r0, #1668
 
 # CHECK: wlstp.16     lr, r0, #1668  @ encoding: [0x10,0xf0,0x43,0xc3]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.16     lr, r0, #1668
 
 # CHECK: wlstp.32     lr, r4, #2706  @ encoding: [0x24,0xf0,0x49,0xcd]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.32     lr, r4, #2706
 
 # CHECK: wlstp.64     lr, lr, #3026  @ encoding: [0x3e,0xf0,0xe9,0xcd]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.64     lr, lr, #3026
 
 # CHECK: wlstp.8     lr, r5, #3436  @ encoding: [0x05,0xf0,0xb7,0xc6]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r5, #3436
 
 # CHECK: wlstp.16     lr, r1, #1060  @ encoding: [0x11,0xf0,0x13,0xc2]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.16     lr, r1, #1060
 
 # CHECK: wlstp.32     lr, r7, #4036  @ encoding: [0x27,0xf0,0xe3,0xc7]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.32     lr, r7, #4036
 
 # CHECK: wlstp.8     lr, r1, #538  @ encoding: [0x01,0xf0,0x0d,0xc9]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r1, #538
 
 # CHECK: wlstp.8     lr, r10, #1404  @ encoding: [0x0a,0xf0,0xbf,0xc2]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r10, #1404
 
 # CHECK: wlstp.8     lr, r10, #1408  @ encoding: [0x0a,0xf0,0xc1,0xc2]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r10, #1408
 
 # CHECK: wlstp.8     lr, r10, #2358  @ encoding: [0x0a,0xf0,0x9b,0xcc]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r10, #2358
 
 # CHECK: wlstp.8     lr, r10, #4086  @ encoding: [0x0a,0xf0,0xfb,0xcf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r10, #4086
 
 # CHECK: wlstp.8     lr, r11, #1442  @ encoding: [0x0b,0xf0,0xd1,0xca]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r11, #1442
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: loop end is out of range or not a positive multiple of 2
@@ -87,39 +87,39 @@ wlstp.16     lr, sp, #1442
 wlstp.32     r10, r11, #1442
 
 # CHECK: wlstp.8     lr, r1, .Lendloop  @ encoding: [0x01'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r1, .Lendloop
 
 # CHECK: wlstp.16     lr, r2, .Lendloop  @ encoding: [0x12'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.16     lr, r2, .Lendloop
 
 # CHECK: wlstp.32     lr, r3, .Lendloop  @ encoding: [0x23'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.32     lr, r3, .Lendloop
 
 # CHECK: wlstp.64     lr, r5, .Lendloop  @ encoding: [0x35'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.64     lr, r5, .Lendloop
 
 # CHECK: wlstp.64     lr, r5, #0  @ encoding: [0x35,0xf0,0x01,0xc0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.64     lr, r5, #0
 
 # CHECK: dlstp.8     lr, r5  @ encoding: [0x05,0xf0,0x01,0xe0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 dlstp.8     lr, r5
 
 # CHECK: dlstp.16     lr, r5  @ encoding: [0x15,0xf0,0x01,0xe0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 dlstp.16     lr, r5
 
 # CHECK: dlstp.32     lr, r7  @ encoding: [0x27,0xf0,0x01,0xe0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 dlstp.32     lr, r7
 
 # CHECK: dlstp.64     lr, r2  @ encoding: [0x32,0xf0,0x01,0xe0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 dlstp.64     lr, r2
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: operand must be a register in range [r0, r12] or r14
@@ -135,15 +135,15 @@ dlstp.64     r10, r0
 dlstp.64     lr, pc
 
 # CHECK: letp lr, #-2 @ encoding: [0x1f,0xf0,0x01,0xc8]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 letp lr, #-2
 
 # CHECK: letp lr, #-8 @ encoding: [0x1f,0xf0,0x05,0xc0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 letp lr, #-8
 
 # CHECK: letp lr, #-4094 @ encoding: [0x1f,0xf0,0xff,0xcf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 letp lr, #-4094
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: invalid operand for instruction
@@ -159,7 +159,7 @@ letp lr, #8
 letp lr, #-4096
 
 # CHECK: letp lr, .Lstartloop @ encoding: [0x1f'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 letp lr, .Lstartloop
 
 # CHECK: lctp @ encoding: [0x0f,0xf0,0x01,0xe0]
@@ -172,8 +172,11 @@ it eq
 # ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 lctpeq
 
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vpste
+# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
 vpselt.s16 q0, q1, q2
+# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
 vpsele.i32 q0, q1, q2
 # CHECK: vpste @ encoding: [0x71,0xfe,0x4d,0x8f]
 # CHECK: vpselt q0, q1, q2 @ encoding: [0x33,0xfe,0x05,0x0f]
diff --git a/llvm/test/MC/ARM/neon-complex.s b/llvm/test/MC/ARM/neon-complex.s
index 0d428b596c94c..6054a08dc82e8 100644
--- a/llvm/test/MC/ARM/neon-complex.s
+++ b/llvm/test/MC/ARM/neon-complex.s
@@ -29,8 +29,10 @@
 // FP16-ARM: vcmla.f16       q0, q1, q2, #0  @ encoding: [0x44,0x08,0x22,0xfc]
 // FP16-THUMB: vcmla.f16       q0, q1, q2, #0  @ encoding: [0x22,0xfc,0x44,0x08]
 // NO-FP16-STDERR: :[[@LINE-3]]:{{[0-9]*}}: note: instruction requires: full half-float
-// V82A: :[[@LINE-4]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-// NO-NEON_STDERR: :[[@LINE-5]]:{{[0-9]*}}: error: instruction requires: NEON
+// V82A: :[[@LINE-4]]:{{[0-9]*}}: error: invalid instruction, any one of the following would fix this:
+// V82A: :[[@LINE-5]]:{{[0-9]*}}: note: instruction requires: mve.fp
+// V82A: :[[@LINE-6]]:{{[0-9]*}}: note: instruction requires: armv8.3a
+// NO-NEON_STDERR: :[[@LINE-7]]:{{[0-9]*}}: error: instruction requires: NEON
   vcmla.f32 d0, d1, d2, #0
 // ARM: vcmla.f32       d0, d1, d2, #0  @ encoding: [0x02,0x08,0x31,0xfc]
 // THUMB: vcmla.f32       d0, d1, d2, #0  @ encoding: [0x31,0xfc,0x02,0x08]
@@ -39,8 +41,10 @@
   vcmla.f32 q0, q1, q2, #0
 // ARM: vcmla.f32       q0, q1, q2, #0  @ encoding: [0x44,0x08,0x32,0xfc]
 // THUMB: vcmla.f32       q0, q1, q2, #0  @ encoding: [0x32,0xfc,0x44,0x08]
-// V82A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-// NO-NEON_STDERR: :[[@LINE-4]]:{{[0-9]*}}: error: instruction requires: NEON
+// V82A: :[[@LINE-3]]:{{[0-9]*}}: error: invalid instruction, any one of the following would fix this:
+// V82A: :[[@LINE-4]]:{{[0-9]*}}: note: instruction requires: mve.fp
+// V82A: :[[@LINE-5]]:{{[0-9]*}}: note: instruction requires: armv8.3a
+// NO-NEON_STDERR: :[[@LINE-6]]:{{[0-9]*}}: error: instruction requires: NEON
 
 // Valid rotations
   vcmla.f32 d0, d1, d2, #90
@@ -83,8 +87,10 @@
 // FP16-ARM: vcadd.f16       q0, q1, q2, #90 @ encoding: [0x44,0x08,0x82,0xfc]
 // FP16-THUMB: vcadd.f16       q0, q1, q2, #90 @ encoding: [0x82,0xfc,0x44,0x08]
 // NO-FP16-STDERR: :[[@LINE-3]]:{{[0-9]*}}: note: instruction requires: full half-float
-// V82A: :[[@LINE-4]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-// NO-NEON_STDERR: :[[@LINE-5]]:{{[0-9]*}}: error: instruction requires: NEON
+// V82A: :[[@LINE-4]]:{{[0-9]*}}: error: invalid instruction, any one of the following would fix this:
+// V82A: :[[@LINE-5]]:{{[0-9]*}}: note: instruction requires: mve.fp
+// V82A: :[[@LINE-6]]:{{[0-9]*}}: note: instruction requires: armv8.3a
+// NO-NEON_STDERR: :[[@LINE-7]]:{{[0-9]*}}: error: instruction requires: NEON
   vcadd.f32 d0, d1, d2, #90
 // ARM: vcadd.f32       d0, d1, d2, #90 @ encoding: [0x02,0x08,0x91,0xfc]
 // THUMB: vcadd.f32       d0, d1, d2, #90 @ encoding: [0x91,0xfc,0x02,0x08]
@@ -93,8 +99,10 @@
   vcadd.f32 q0, q1, q2, #90
 // ARM: vcadd.f32       q0, q1, q2, #90 @ encoding: [0x44,0x08,0x92,0xfc]
 // THUMB: vcadd.f32       q0, q1, q2, #90 @ encoding: [0x92,0xfc,0x44,0x08]
-// V82A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-// NO-NEON_STDERR: :[[@LINE-4]]:{{[0-9]*}}: error: instruction requires: NEON
+// V82A: :[[@LINE-3]]:{{[0-9]*}}: error: invalid instruction, any one of the following would fix this:
+// V82A: :[[@LINE-4]]:{{[0-9]*}}: note: instruction requires: mve.fp
+// V82A: :[[@LINE-5]]:{{[0-9]*}}: note: instruction requires: armv8.3a
+// NO-NEON_STDERR: :[[@LINE-6]]:{{[0-9]*}}: error: instruction requires: NEON
 
 // Valid rotations
   vcadd.f32 d0, d1, d2, #270
diff --git a/llvm/test/MC/ARM/no-mve.s b/llvm/test/MC/ARM/no-mve.s
index 668db402dc66d..2435ced45a370 100644
--- a/llvm/test/MC/ARM/no-mve.s
+++ b/llvm/test/MC/ARM/no-mve.s
@@ -4,13 +4,13 @@
 # RUN: FileCheck --check-prefix=CHECK-MVE < %t %s
 
 # CHECK-MVE: instruction requires: mve.fp
-# CHECK: invalid instruction
+# CHECK: instruction requires: mve.fp
 vcadd.f32 q1, q2, q3, #270
 
 # CHECK-MVE: instruction requires: mve.fp
-# CHECK: invalid instruction
+# CHECK: instruction requires: mve.fp
 vadd.f32 q1, q2, q3
 
 # CHECK-MVE: vadd.i16 q1, q2, q3 @ encoding: [0x14,0xef,0x46,0x28]
-# CHECK: invalid instruction
+# CHECK: instruction requires: mve
 vadd.i16 q1, q2, q3
diff --git a/llvm/test/MC/ARM/not-armv4.s b/llvm/test/MC/ARM/not-armv4.s
index c62c50c26c31d..b65b12976f78a 100644
--- a/llvm/test/MC/ARM/not-armv4.s
+++ b/llvm/test/MC/ARM/not-armv4.s
@@ -1,13 +1,21 @@
 @ RUN: not llvm-mc < %s -triple armv4-unknown-unknown -show-encoding 2>&1 | FileCheck %s
 
 @ PR18524
-@ CHECK: instruction requires: armv5t
+@ CHECK: error: invalid instruction, any one of the following would fix this:
+@ CHECK: note: instruction requires: armv5t
+@ CHECK: note: instruction requires: thumb2
 clz r4,r9
 
-@ CHECK: instruction requires: armv6t2
+@ CHECK: error: invalid instruction, any one of the following would fix this:
+@ CHECK: note: instruction requires: armv6t2
+@ CHECK: note: instruction requires: thumb2
 rbit r4,r9
 
 @ CHECK: error: instruction requires: armv6t2
 movw r4,#0x1234
-@ CHECK: error: instruction requires: armv6t2
+
+@ CHECK: error: invalid instruction, any one of the following would fix this:
+@ CHECK: note: invalid operand for instruction
+@ CHECK: note: operand must be a register in range [r0, r15]
+@ CHECK: note: instruction requires: armv6t2
 mov  r4,#0x1234
diff --git a/llvm/test/MC/ARM/register-token-source-loc.s b/llvm/test/MC/ARM/register-token-source-loc.s
index afb6ba0c61a39..7560f95999e71 100644
--- a/llvm/test/MC/ARM/register-token-source-loc.s
+++ b/llvm/test/MC/ARM/register-token-source-loc.s
@@ -3,6 +3,9 @@
 // CHECK:     error: invalid instruction, any one of the following would fix this:
 // CHECK-NEXT:  add sp, r0, #4
 // CHECK-NEXT:  ^
+// CHECK-NEXT: note: operand must be a register in range [r0, r7]
+// CHECK-NEXT:   add sp, r0, #4
+// CHECK-NEXT:       ^
 // CHECK-NEXT: note: operand must be a register sp
 // CHECK-NEXT:  add sp, r0, #4
 // CHECK-NEXT:          ^
diff --git a/llvm/test/MC/ARM/tMOVSr.s b/llvm/test/MC/ARM/tMOVSr.s
index 198c90aa5ceb4..09602fecd6851 100644
--- a/llvm/test/MC/ARM/tMOVSr.s
+++ b/llvm/test/MC/ARM/tMOVSr.s
@@ -1,6 +1,7 @@
 @ REQUIRES: asserts
-@ RUN: llvm-mc --triple=thumbv8 --debug %s 2>&1 | FileCheck %s --match-full-lines
+@ RUN: llvm-mc --triple=thumbv8 %s --show-encoding 2>&1 | FileCheck %s --match-full-lines
 
-@ CHECK: Changed to: <MCInst #{{[0-9]+}} tMOVSr <MCOperand Reg:{{[0-9]+}}> <MCOperand Reg:{{[0-9]+}}>>
+// Note this makes sure the narrow instruciton is selected
+@ CHECK: movs r2, r3 @ encoding: [0x1a,0x00]
 .text
   movs r2, r3
diff --git a/llvm/test/MC/ARM/thumb-diagnostics.s b/llvm/test/MC/ARM/thumb-diagnostics.s
index cacd7f21cb9d3..171d60ac13f9a 100644
--- a/llvm/test/MC/ARM/thumb-diagnostics.s
+++ b/llvm/test/MC/ARM/thumb-diagnostics.s
@@ -28,9 +28,12 @@
 @ CHECK-ERRORS:         ^
 @ CHECK-ERRORS: note: instruction variant requires Thumb2
 @ CHECK-ERRORS: note: operand must be a register sp
-@ CHECK-ERRORS-V5: error: instruction variant requires ARMv6 or later
+@ CHECK-ERRORS-V5: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS-V5:         mov r2, r3
 @ CHECK-ERRORS-V5:         ^
+@ CHECK-ERRORS-V5: note: instruction requires: arm-mode
+@ CHECK-ERRORS-V5: note: operand must be an immediate in the range [0,255] or a relocatable expression
+@ CHECK-ERRORS-V5: note: instruction variant requires ARMv6 or later
 
 @ Immediates where registers were expected
         adds #0, r1, r2
@@ -225,10 +228,11 @@
 
 @ Mismatched source/destination operands for MUL instruction.
         muls r1, r2, r3
-@ CHECK-ERRORS: error: destination register must match source register
+@ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS:         muls r1, r2, r3
-@ CHECK-ERRORS:              ^
-
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: note: destination register must match a source register
+@ CHECK-ERRORS: note: too many operands for instruction
 
 @ Out of range immediates for STR instruction.
         str r2, [r7, #-1]
@@ -274,30 +278,33 @@
 @ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS: add sp, #-1
 @ CHECK-ERRORS: ^
+@ CHECK-ERRORS: note: instruction requires: thumb2
+@ CHECK-ERRORS: add sp, #-1
+@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: note: operand must be a register in range [r0, r15]
 @ CHECK-ERRORS: add sp, #-1
 @ CHECK-ERRORS:         ^
 @ CHECK-ERRORS: note: invalid operand for instruction
 @ CHECK-ERRORS: add sp, #-1
 @ CHECK-ERRORS:         ^
-@ CHECK-ERRORS: note: instruction requires: thumb2
-@ CHECK-ERRORS: add sp, #-1
-@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS: add sp, #3
 @ CHECK-ERRORS: ^
+@ CHECK-ERRORS: note: instruction requires: thumb2
+@ CHECK-ERRORS: add sp, #3
+@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: note: operand must be a register in range [r0, r15]
 @ CHECK-ERRORS: add sp, #3
 @ CHECK-ERRORS:         ^
 @ CHECK-ERRORS: note: invalid operand for instruction
 @ CHECK-ERRORS: add sp, #3
 @ CHECK-ERRORS:         ^
-@ CHECK-ERRORS: note: instruction requires: thumb2
-@ CHECK-ERRORS: add sp, #3
-@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS: add sp, sp, #512
 @ CHECK-ERRORS: ^
+@ CHECK-ERRORS: note: instruction requires: thumb2
+@ CHECK-ERRORS: add sp, sp, #512
+@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: note: operand must be a register in range [r0, r15]
 @ CHECK-ERRORS: add sp, sp, #512
 @ CHECK-ERRORS:             ^
@@ -305,9 +312,6 @@
 @ CHECK-ERRORS: add sp, sp, #512
 @ CHECK-ERRORS:             ^
 @ CHECK-ERRORS: note: instruction requires: thumb2
-@ CHECK-ERRORS: add sp, sp, #512
-@ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires: thumb2
 @ CHECK-ERRORS: add r2, sp, #1024
 @ CHECK-ERRORS: ^
         add r2, sp, ip
@@ -407,7 +411,8 @@
         adds
         adds r0
 @ CHECK-ERRORS: error: too few operands for instruction
-@ CHECK-ERRORS: error: too few operands for instruction
+@ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
+@ CHECK-ERRORS: note: too few operands for instruction
 
 @------------------------------------------------------------------------------
 @ Out of range width for SBFX/UBFX
diff --git a/llvm/test/MC/ARM/thumb-mov.s b/llvm/test/MC/ARM/thumb-mov.s
index 6f662f3ee2c72..e910722f2edf4 100644
--- a/llvm/test/MC/ARM/thumb-mov.s
+++ b/llvm/test/MC/ARM/thumb-mov.s
@@ -58,10 +58,16 @@
         movs sp, r0
         movs r0, sp
         movs sp, sp
-// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7: error: invalid instruction, any one of the following would fix this:
 // CHECK-V7-NEXT: movs sp, r0
-// CHECK-V7: instruction variant requires ARMv8 or later
+// CHECK-V7: note: instruction variant requires ARMv8 or later
+// CHECK-V7: note: operand must be a register in range [r0, r7]
+// CHECK-V7: error: invalid instruction, any one of the following would fix this:
 // CHECK-V7-NEXT: movs r0, sp
+// CHECK-V7: note: instruction variant requires ARMv8 or later
+// CHECK-V7: note: invalid operand for instruction
+// CHECK-V7: note: operand must be an immediate in the range [0,255] or a relocatable expression
+// CHECK-V7: note: operand must be a register in range [r0, r7]
 // CHECK-V7: error: instruction variant requires ARMv8 or later
 // CHECK-V7-NEXT: movs sp, sp
 // CHECK-V8: movs.w sp, r0            @ encoding: [0x5f,0xea,0x00,0x0d]
@@ -69,8 +75,9 @@
 // CHECK-V8: movs.w sp, sp            @ encoding: [0x5f,0xea,0x0d,0x0d]
 
         mov.w sp, sp
-// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7: error: invalid instruction, any one of the following would fix this:
 // CHECK-V7-NEXT: mov.w sp, sp
+// CHECK-V7: note: instruction variant requires ARMv8 or later
 // CHECK-V8: mov.w sp, sp             @ encoding: [0x4f,0xea,0x0d,0x0d]
 
         movs.w sp, r0
@@ -78,8 +85,9 @@
         movs.w sp, sp
 // CHECK-V7: error: instruction variant requires ARMv8 or later
 // CHECK-V7-NEXT: movs.w sp, r0
-// CHECK-V7: instruction variant requires ARMv8 or later
+// CHECK-V7: error: invalid instruction, any one of the following would fix this:
 // CHECK-V7-NEXT: movs.w r0, sp
+// CHECK-V7: note: instruction variant requires ARMv8 or later
 // CHECK-V7: error: instruction variant requires ARMv8 or later
 // CHECK-V7-NEXT: movs.w sp, sp
 // CHECK-V8: movs.w sp, r0            @ encoding: [0x5f,0xea,0x00,0x0d]
diff --git a/llvm/test/MC/ARM/thumb2-diagnostics.s b/llvm/test/MC/ARM/thumb2-diagnostics.s
index 45efd3c62120c..afb12ce430981 100644
--- a/llvm/test/MC/ARM/thumb2-diagnostics.s
+++ b/llvm/test/MC/ARM/thumb2-diagnostics.s
@@ -156,7 +156,9 @@ foo2:
         adds
         adds r0
 @ CHECK-ERRORS: error: too few operands for instruction
-@ CHECK-ERRORS: error: too few operands for instruction
+@ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
+@ CHECK-ERRORS: note: too few operands for instruction
+@ CHECK-ERRORS: note: operand must be a register in range [r0, r15]
 
         tst sp, #3
         tst sp, r5
diff --git a/llvm/test/MC/ARM/vfp4.s b/llvm/test/MC/ARM/vfp4.s
index 37d03bb53de8e..f3bc5750f4312 100644
--- a/llvm/test/MC/ARM/vfp4.s
+++ b/llvm/test/MC/ARM/vfp4.s
@@ -23,7 +23,7 @@ vfma.f32 d16, d18, d17
 
 @ ARM: vfma.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x08,0xf2]
 @ THUMB: vfma.f32	q2, q4, q0 @ encoding: [0x08,0xef,0x50,0x4c]
-@ THUMB_V7EM-ERRORS: error: invalid instruction
+@ THUMB_V7EM-ERRORS: error: instruction requires: mve.fp
 @ THUMB_V7EM-ERRORS-NEXT: vfma.f32 q2, q4, q0
 vfma.f32 q2, q4, q0
 
@@ -57,7 +57,7 @@ vfms.f32 d16, d18, d17
 
 @ ARM: vfms.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x28,0xf2]
 @ THUMB: vfms.f32	q2, q4, q0 @ encoding: [0x28,0xef,0x50,0x4c]
-@ THUMB_V7EM-ERRORS: error: invalid instruction
+@ THUMB_V7EM-ERRORS: error: instruction requires: mve.fp
 @ THUMB_V7EM-ERRORS-NEXT: vfms.f32 q2, q4, q0
 vfms.f32 q2, q4, q0
 
diff --git a/llvm/test/MC/BPF/insn-unit.s b/llvm/test/MC/BPF/insn-unit.s
index 224eb7381aa23..84735d196030d 100644
--- a/llvm/test/MC/BPF/insn-unit.s
+++ b/llvm/test/MC/BPF/insn-unit.s
@@ -65,8 +65,10 @@
 // CHECK: 8d 02 00 00 00 00 00 00 	callx r2
 
 // ======== BPF_JMP Class ========
+  may_goto Llabel0 // BPF_JCOND | BPF_K
   if r1 & r2 goto Llabel0    // BPF_JSET  | BPF_X
   if r1 & 0xffff goto Llabel0    // BPF_JSET  | BPF_K
+// CHECK: e5 00 1e 00 00 00 00 00	may_goto +30
 // CHECK: 4d 21 1d 00 00 00 00 00 	if r1 & r2 goto +29
 // CHECK: 45 01 1c 00 ff ff 00 00 	if r1 & 65535 goto +28
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt
index fbb95450dbbe2..929f3d2129be9 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt
@@ -2536,6 +2536,9 @@
 # GFX11: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA) ; encoding: [0x85,0x4c,0x80,0xbe]
 0x85,0x4c,0x80,0xbe
 
+# GFX11: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA_TO_PC) ; encoding: [0x86,0x4c,0x80,0xbe]
+0x86,0x4c,0x80,0xbe
+
 # GFX11: s_setpc_b64 s[0:1]                      ; encoding: [0x00,0x48,0x80,0xbe]
 0x00,0x48,0x80,0xbe
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
index c87cea1205681..f8c235f77b5f5 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
@@ -3279,9 +3279,12 @@
 # GFX12: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA) ; encoding: [0x85,0x4c,0x80,0xbe]
 0x85,0x4c,0x80,0xbe
 
-# GFX12: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_SE_AID_ID) ; encoding: [0x86,0x4c,0x80,0xbe]
+# GFX12: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA_TO_PC) ; encoding: [0x86,0x4c,0x80,0xbe]
 0x86,0x4c,0x80,0xbe
 
+# GFX12: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_SE_AID_ID) ; encoding: [0x87,0x4c,0x80,0xbe]
+0x87,0x4c,0x80,0xbe
+
 # GFX12: s_setpc_b64 s[0:1]                      ; encoding: [0x00,0x48,0x80,0xbe]
 0x00,0x48,0x80,0xbe
 
diff --git a/llvm/test/MC/Disassembler/X86/apx/cfcmov.txt b/llvm/test/MC/Disassembler/X86/apx/cfcmov.txt
new file mode 100644
index 0000000000000..4ecaa4b6a14a9
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/cfcmov.txt
@@ -0,0 +1,842 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   cfcmovbw	%r17w, %r21w, %r25w
+# INTEL: cfcmovb	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x42,0xe9
+
+# ATT:   cfcmovbw	%r17w, %r21w
+# INTEL: cfcmovb	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x42,0xcd
+
+# ATT:   cfcmovbw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovb	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x42,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbl	%r18d, %r22d, %r26d
+# INTEL: cfcmovb	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x42,0xf2
+
+# ATT:   cfcmovbl	%r18d, %r22d
+# INTEL: cfcmovb	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x42,0xd6
+
+# ATT:   cfcmovbl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovb	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x42,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbq	%r19, %r23, %r27
+# INTEL: cfcmovb	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x42,0xfb
+
+# ATT:   cfcmovbq	%r19, %r23
+# INTEL: cfcmovb	r23, r19
+0x62,0xec,0xfc,0x0c,0x42,0xdf
+
+# ATT:   cfcmovbq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovb	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x42,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovb	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x42,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovb	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x42,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovb	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x42,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovb	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x42,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovb	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x42,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovb	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x42,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbew	%r17w, %r21w, %r25w
+# INTEL: cfcmovbe	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x46,0xe9
+
+# ATT:   cfcmovbew	%r17w, %r21w
+# INTEL: cfcmovbe	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x46,0xcd
+
+# ATT:   cfcmovbew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovbe	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x46,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbel	%r18d, %r22d, %r26d
+# INTEL: cfcmovbe	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x46,0xf2
+
+# ATT:   cfcmovbel	%r18d, %r22d
+# INTEL: cfcmovbe	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x46,0xd6
+
+# ATT:   cfcmovbel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovbe	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x46,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbeq	%r19, %r23, %r27
+# INTEL: cfcmovbe	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x46,0xfb
+
+# ATT:   cfcmovbeq	%r19, %r23
+# INTEL: cfcmovbe	r23, r19
+0x62,0xec,0xfc,0x0c,0x46,0xdf
+
+# ATT:   cfcmovbeq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovbe	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x46,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovbe	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x46,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovbe	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x46,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovbe	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x46,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovbe	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x46,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbeq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovbe	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x46,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbeq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovbe	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x46,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlw	%r17w, %r21w, %r25w
+# INTEL: cfcmovl	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4c,0xe9
+
+# ATT:   cfcmovlw	%r17w, %r21w
+# INTEL: cfcmovl	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4c,0xcd
+
+# ATT:   cfcmovlw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovl	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovll	%r18d, %r22d, %r26d
+# INTEL: cfcmovl	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4c,0xf2
+
+# ATT:   cfcmovll	%r18d, %r22d
+# INTEL: cfcmovl	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4c,0xd6
+
+# ATT:   cfcmovll	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovl	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4c,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlq	%r19, %r23, %r27
+# INTEL: cfcmovl	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4c,0xfb
+
+# ATT:   cfcmovlq	%r19, %r23
+# INTEL: cfcmovl	r23, r19
+0x62,0xec,0xfc,0x0c,0x4c,0xdf
+
+# ATT:   cfcmovlq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovl	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovl	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovl	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovll	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovl	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4c,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovll	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovl	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4c,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovl	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovl	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlew	%r17w, %r21w, %r25w
+# INTEL: cfcmovle	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4e,0xe9
+
+# ATT:   cfcmovlew	%r17w, %r21w
+# INTEL: cfcmovle	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4e,0xcd
+
+# ATT:   cfcmovlew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovle	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlel	%r18d, %r22d, %r26d
+# INTEL: cfcmovle	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4e,0xf2
+
+# ATT:   cfcmovlel	%r18d, %r22d
+# INTEL: cfcmovle	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4e,0xd6
+
+# ATT:   cfcmovlel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovle	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4e,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovleq	%r19, %r23, %r27
+# INTEL: cfcmovle	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4e,0xfb
+
+# ATT:   cfcmovleq	%r19, %r23
+# INTEL: cfcmovle	r23, r19
+0x62,0xec,0xfc,0x0c,0x4e,0xdf
+
+# ATT:   cfcmovleq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovle	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovle	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovle	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovle	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4e,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovle	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4e,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovleq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovle	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovleq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovle	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaew	%r17w, %r21w, %r25w
+# INTEL: cfcmovae	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x43,0xe9
+
+# ATT:   cfcmovaew	%r17w, %r21w
+# INTEL: cfcmovae	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x43,0xcd
+
+# ATT:   cfcmovaew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovae	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x43,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovael	%r18d, %r22d, %r26d
+# INTEL: cfcmovae	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x43,0xf2
+
+# ATT:   cfcmovael	%r18d, %r22d
+# INTEL: cfcmovae	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x43,0xd6
+
+# ATT:   cfcmovael	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovae	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x43,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaeq	%r19, %r23, %r27
+# INTEL: cfcmovae	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x43,0xfb
+
+# ATT:   cfcmovaeq	%r19, %r23
+# INTEL: cfcmovae	r23, r19
+0x62,0xec,0xfc,0x0c,0x43,0xdf
+
+# ATT:   cfcmovaeq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovae	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x43,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovae	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x43,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovae	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x43,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovael	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovae	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x43,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovael	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovae	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x43,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaeq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovae	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x43,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaeq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovae	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x43,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaw	%r17w, %r21w, %r25w
+# INTEL: cfcmova	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x47,0xe9
+
+# ATT:   cfcmovaw	%r17w, %r21w
+# INTEL: cfcmova	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x47,0xcd
+
+# ATT:   cfcmovaw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmova	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x47,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoval	%r18d, %r22d, %r26d
+# INTEL: cfcmova	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x47,0xf2
+
+# ATT:   cfcmoval	%r18d, %r22d
+# INTEL: cfcmova	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x47,0xd6
+
+# ATT:   cfcmoval	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmova	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x47,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaq	%r19, %r23, %r27
+# INTEL: cfcmova	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x47,0xfb
+
+# ATT:   cfcmovaq	%r19, %r23
+# INTEL: cfcmova	r23, r19
+0x62,0xec,0xfc,0x0c,0x47,0xdf
+
+# ATT:   cfcmovaq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmova	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x47,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmova	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x47,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmova	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x47,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoval	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmova	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x47,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoval	291(%r28,%r29,4), %r18d
+# INTEL: cfcmova	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x47,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmova	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x47,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaq	291(%r28,%r29,4), %r19
+# INTEL: cfcmova	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x47,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgew	%r17w, %r21w, %r25w
+# INTEL: cfcmovge	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4d,0xe9
+
+# ATT:   cfcmovgew	%r17w, %r21w
+# INTEL: cfcmovge	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4d,0xcd
+
+# ATT:   cfcmovgew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovge	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgel	%r18d, %r22d, %r26d
+# INTEL: cfcmovge	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4d,0xf2
+
+# ATT:   cfcmovgel	%r18d, %r22d
+# INTEL: cfcmovge	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4d,0xd6
+
+# ATT:   cfcmovgel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovge	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4d,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgeq	%r19, %r23, %r27
+# INTEL: cfcmovge	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4d,0xfb
+
+# ATT:   cfcmovgeq	%r19, %r23
+# INTEL: cfcmovge	r23, r19
+0x62,0xec,0xfc,0x0c,0x4d,0xdf
+
+# ATT:   cfcmovgeq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovge	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovge	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovge	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovge	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4d,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovge	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4d,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgeq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovge	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgeq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovge	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnow	%r17w, %r21w, %r25w
+# INTEL: cfcmovno	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x41,0xe9
+
+# ATT:   cfcmovnow	%r17w, %r21w
+# INTEL: cfcmovno	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x41,0xcd
+
+# ATT:   cfcmovnow	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovno	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x41,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnol	%r18d, %r22d, %r26d
+# INTEL: cfcmovno	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x41,0xf2
+
+# ATT:   cfcmovnol	%r18d, %r22d
+# INTEL: cfcmovno	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x41,0xd6
+
+# ATT:   cfcmovnol	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovno	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x41,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnoq	%r19, %r23, %r27
+# INTEL: cfcmovno	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x41,0xfb
+
+# ATT:   cfcmovnoq	%r19, %r23
+# INTEL: cfcmovno	r23, r19
+0x62,0xec,0xfc,0x0c,0x41,0xdf
+
+# ATT:   cfcmovnoq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovno	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x41,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnow	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovno	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x41,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnow	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovno	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x41,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnol	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovno	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x41,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnol	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovno	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x41,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnoq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovno	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x41,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnoq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovno	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x41,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpw	%r17w, %r21w, %r25w
+# INTEL: cfcmovnp	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4b,0xe9
+
+# ATT:   cfcmovnpw	%r17w, %r21w
+# INTEL: cfcmovnp	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4b,0xcd
+
+# ATT:   cfcmovnpw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovnp	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpl	%r18d, %r22d, %r26d
+# INTEL: cfcmovnp	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4b,0xf2
+
+# ATT:   cfcmovnpl	%r18d, %r22d
+# INTEL: cfcmovnp	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4b,0xd6
+
+# ATT:   cfcmovnpl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovnp	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4b,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpq	%r19, %r23, %r27
+# INTEL: cfcmovnp	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4b,0xfb
+
+# ATT:   cfcmovnpq	%r19, %r23
+# INTEL: cfcmovnp	r23, r19
+0x62,0xec,0xfc,0x0c,0x4b,0xdf
+
+# ATT:   cfcmovnpq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovnp	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovnp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovnp	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovnp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4b,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovnp	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4b,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovnp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovnp	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsw	%r17w, %r21w, %r25w
+# INTEL: cfcmovns	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x49,0xe9
+
+# ATT:   cfcmovnsw	%r17w, %r21w
+# INTEL: cfcmovns	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x49,0xcd
+
+# ATT:   cfcmovnsw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovns	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x49,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsl	%r18d, %r22d, %r26d
+# INTEL: cfcmovns	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x49,0xf2
+
+# ATT:   cfcmovnsl	%r18d, %r22d
+# INTEL: cfcmovns	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x49,0xd6
+
+# ATT:   cfcmovnsl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovns	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x49,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsq	%r19, %r23, %r27
+# INTEL: cfcmovns	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x49,0xfb
+
+# ATT:   cfcmovnsq	%r19, %r23
+# INTEL: cfcmovns	r23, r19
+0x62,0xec,0xfc,0x0c,0x49,0xdf
+
+# ATT:   cfcmovnsq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovns	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x49,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovns	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x49,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovns	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x49,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovns	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x49,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovns	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x49,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovns	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x49,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovns	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x49,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnew	%r17w, %r21w, %r25w
+# INTEL: cfcmovne	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x45,0xe9
+
+# ATT:   cfcmovnew	%r17w, %r21w
+# INTEL: cfcmovne	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x45,0xcd
+
+# ATT:   cfcmovnew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovne	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x45,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnel	%r18d, %r22d, %r26d
+# INTEL: cfcmovne	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x45,0xf2
+
+# ATT:   cfcmovnel	%r18d, %r22d
+# INTEL: cfcmovne	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x45,0xd6
+
+# ATT:   cfcmovnel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovne	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x45,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovneq	%r19, %r23, %r27
+# INTEL: cfcmovne	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x45,0xfb
+
+# ATT:   cfcmovneq	%r19, %r23
+# INTEL: cfcmovne	r23, r19
+0x62,0xec,0xfc,0x0c,0x45,0xdf
+
+# ATT:   cfcmovneq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovne	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x45,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovne	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x45,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovne	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x45,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovne	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x45,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovne	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x45,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovneq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovne	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x45,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovneq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovne	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x45,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpw	%r17w, %r21w, %r25w
+# INTEL: cfcmovp	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4a,0xe9
+
+# ATT:   cfcmovpw	%r17w, %r21w
+# INTEL: cfcmovp	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4a,0xcd
+
+# ATT:   cfcmovpw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovp	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpl	%r18d, %r22d, %r26d
+# INTEL: cfcmovp	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4a,0xf2
+
+# ATT:   cfcmovpl	%r18d, %r22d
+# INTEL: cfcmovp	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4a,0xd6
+
+# ATT:   cfcmovpl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovp	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4a,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpq	%r19, %r23, %r27
+# INTEL: cfcmovp	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4a,0xfb
+
+# ATT:   cfcmovpq	%r19, %r23
+# INTEL: cfcmovp	r23, r19
+0x62,0xec,0xfc,0x0c,0x4a,0xdf
+
+# ATT:   cfcmovpq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovp	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovp	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4a,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovp	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4a,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovp	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsw	%r17w, %r21w, %r25w
+# INTEL: cfcmovs	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x48,0xe9
+
+# ATT:   cfcmovsw	%r17w, %r21w
+# INTEL: cfcmovs	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x48,0xcd
+
+# ATT:   cfcmovsw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovs	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x48,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsl	%r18d, %r22d, %r26d
+# INTEL: cfcmovs	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x48,0xf2
+
+# ATT:   cfcmovsl	%r18d, %r22d
+# INTEL: cfcmovs	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x48,0xd6
+
+# ATT:   cfcmovsl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovs	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x48,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsq	%r19, %r23, %r27
+# INTEL: cfcmovs	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x48,0xfb
+
+# ATT:   cfcmovsq	%r19, %r23
+# INTEL: cfcmovs	r23, r19
+0x62,0xec,0xfc,0x0c,0x48,0xdf
+
+# ATT:   cfcmovsq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovs	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x48,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovs	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x48,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovs	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x48,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovs	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x48,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovs	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x48,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovs	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x48,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovs	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x48,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovew	%r17w, %r21w, %r25w
+# INTEL: cfcmove	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x44,0xe9
+
+# ATT:   cfcmovew	%r17w, %r21w
+# INTEL: cfcmove	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x44,0xcd
+
+# ATT:   cfcmovew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmove	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x44,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovel	%r18d, %r22d, %r26d
+# INTEL: cfcmove	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x44,0xf2
+
+# ATT:   cfcmovel	%r18d, %r22d
+# INTEL: cfcmove	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x44,0xd6
+
+# ATT:   cfcmovel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmove	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x44,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoveq	%r19, %r23, %r27
+# INTEL: cfcmove	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x44,0xfb
+
+# ATT:   cfcmoveq	%r19, %r23
+# INTEL: cfcmove	r23, r19
+0x62,0xec,0xfc,0x0c,0x44,0xdf
+
+# ATT:   cfcmoveq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmove	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x44,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmove	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x44,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmove	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x44,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmove	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x44,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmove	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x44,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoveq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmove	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x44,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoveq	291(%r28,%r29,4), %r19
+# INTEL: cfcmove	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x44,0x9c,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/cmov.txt b/llvm/test/MC/Disassembler/X86/apx/cmov.txt
new file mode 100644
index 0000000000000..cc96fb17e0085
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/cmov.txt
@@ -0,0 +1,386 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT:   cmovbw	%dx, %ax, %r9w
+# INTEL: cmovb	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x42,0xc2
+
+# ATT:   cmovbl	%ecx, %edx, %r10d
+# INTEL: cmovb	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x42,0xd1
+
+# ATT:   cmovbq	%r9, %r15, %r11
+# INTEL: cmovb	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x42,0xf9
+
+# ATT:   cmovbw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovb	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x42,0x54,0x80,0x7b
+
+# ATT:   cmovbl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovb	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x42,0x4c,0x80,0x7b
+
+# ATT:   cmovbq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovb	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x42,0x4c,0x80,0x7b
+
+# ATT:   cmovbew	%dx, %ax, %r9w
+# INTEL: cmovbe	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x46,0xc2
+
+# ATT:   cmovbel	%ecx, %edx, %r10d
+# INTEL: cmovbe	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x46,0xd1
+
+# ATT:   cmovbeq	%r9, %r15, %r11
+# INTEL: cmovbe	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x46,0xf9
+
+# ATT:   cmovbew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovbe	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x46,0x54,0x80,0x7b
+
+# ATT:   cmovbel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovbe	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x46,0x4c,0x80,0x7b
+
+# ATT:   cmovbeq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovbe	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x46,0x4c,0x80,0x7b
+
+# ATT:   cmovlw	%dx, %ax, %r9w
+# INTEL: cmovl	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4c,0xc2
+
+# ATT:   cmovll	%ecx, %edx, %r10d
+# INTEL: cmovl	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4c,0xd1
+
+# ATT:   cmovlq	%r9, %r15, %r11
+# INTEL: cmovl	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4c,0xf9
+
+# ATT:   cmovlw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovl	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4c,0x54,0x80,0x7b
+
+# ATT:   cmovll	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovl	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4c,0x4c,0x80,0x7b
+
+# ATT:   cmovlq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovl	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4c,0x4c,0x80,0x7b
+
+# ATT:   cmovlew	%dx, %ax, %r9w
+# INTEL: cmovle	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4e,0xc2
+
+# ATT:   cmovlel	%ecx, %edx, %r10d
+# INTEL: cmovle	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4e,0xd1
+
+# ATT:   cmovleq	%r9, %r15, %r11
+# INTEL: cmovle	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4e,0xf9
+
+# ATT:   cmovlew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovle	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4e,0x54,0x80,0x7b
+
+# ATT:   cmovlel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovle	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4e,0x4c,0x80,0x7b
+
+# ATT:   cmovleq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovle	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4e,0x4c,0x80,0x7b
+
+# ATT:   cmovaew	%dx, %ax, %r9w
+# INTEL: cmovae	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x43,0xc2
+
+# ATT:   cmovael	%ecx, %edx, %r10d
+# INTEL: cmovae	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x43,0xd1
+
+# ATT:   cmovaeq	%r9, %r15, %r11
+# INTEL: cmovae	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x43,0xf9
+
+# ATT:   cmovaew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovae	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x43,0x54,0x80,0x7b
+
+# ATT:   cmovael	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovae	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x43,0x4c,0x80,0x7b
+
+# ATT:   cmovaeq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovae	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x43,0x4c,0x80,0x7b
+
+# ATT:   cmovaw	%dx, %ax, %r9w
+# INTEL: cmova	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x47,0xc2
+
+# ATT:   cmoval	%ecx, %edx, %r10d
+# INTEL: cmova	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x47,0xd1
+
+# ATT:   cmovaq	%r9, %r15, %r11
+# INTEL: cmova	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x47,0xf9
+
+# ATT:   cmovaw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmova	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x47,0x54,0x80,0x7b
+
+# ATT:   cmoval	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmova	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x47,0x4c,0x80,0x7b
+
+# ATT:   cmovaq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmova	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x47,0x4c,0x80,0x7b
+
+# ATT:   cmovgew	%dx, %ax, %r9w
+# INTEL: cmovge	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4d,0xc2
+
+# ATT:   cmovgel	%ecx, %edx, %r10d
+# INTEL: cmovge	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4d,0xd1
+
+# ATT:   cmovgeq	%r9, %r15, %r11
+# INTEL: cmovge	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4d,0xf9
+
+# ATT:   cmovgew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovge	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4d,0x54,0x80,0x7b
+
+# ATT:   cmovgel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovge	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4d,0x4c,0x80,0x7b
+
+# ATT:   cmovgeq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovge	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4d,0x4c,0x80,0x7b
+
+# ATT:   cmovgw	%dx, %ax, %r9w
+# INTEL: cmovg	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4f,0xc2
+
+# ATT:   cmovgl	%ecx, %edx, %r10d
+# INTEL: cmovg	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4f,0xd1
+
+# ATT:   cmovgq	%r9, %r15, %r11
+# INTEL: cmovg	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4f,0xf9
+
+# ATT:   cmovgw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovg	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4f,0x54,0x80,0x7b
+
+# ATT:   cmovgl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovg	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4f,0x4c,0x80,0x7b
+
+# ATT:   cmovgq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovg	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4f,0x4c,0x80,0x7b
+
+# ATT:   cmovnow	%dx, %ax, %r9w
+# INTEL: cmovno	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x41,0xc2
+
+# ATT:   cmovnol	%ecx, %edx, %r10d
+# INTEL: cmovno	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x41,0xd1
+
+# ATT:   cmovnoq	%r9, %r15, %r11
+# INTEL: cmovno	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x41,0xf9
+
+# ATT:   cmovnow	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovno	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x41,0x54,0x80,0x7b
+
+# ATT:   cmovnol	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovno	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x41,0x4c,0x80,0x7b
+
+# ATT:   cmovnoq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovno	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x41,0x4c,0x80,0x7b
+
+# ATT:   cmovnpw	%dx, %ax, %r9w
+# INTEL: cmovnp	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4b,0xc2
+
+# ATT:   cmovnpl	%ecx, %edx, %r10d
+# INTEL: cmovnp	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4b,0xd1
+
+# ATT:   cmovnpq	%r9, %r15, %r11
+# INTEL: cmovnp	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4b,0xf9
+
+# ATT:   cmovnpw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovnp	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4b,0x54,0x80,0x7b
+
+# ATT:   cmovnpl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovnp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4b,0x4c,0x80,0x7b
+
+# ATT:   cmovnpq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovnp	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4b,0x4c,0x80,0x7b
+
+# ATT:   cmovnsw	%dx, %ax, %r9w
+# INTEL: cmovns	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x49,0xc2
+
+# ATT:   cmovnsl	%ecx, %edx, %r10d
+# INTEL: cmovns	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x49,0xd1
+
+# ATT:   cmovnsq	%r9, %r15, %r11
+# INTEL: cmovns	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x49,0xf9
+
+# ATT:   cmovnsw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovns	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x49,0x54,0x80,0x7b
+
+# ATT:   cmovnsl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovns	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x49,0x4c,0x80,0x7b
+
+# ATT:   cmovnsq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovns	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x49,0x4c,0x80,0x7b
+
+# ATT:   cmovnew	%dx, %ax, %r9w
+# INTEL: cmovne	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x45,0xc2
+
+# ATT:   cmovnel	%ecx, %edx, %r10d
+# INTEL: cmovne	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x45,0xd1
+
+# ATT:   cmovneq	%r9, %r15, %r11
+# INTEL: cmovne	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x45,0xf9
+
+# ATT:   cmovnew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovne	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x45,0x54,0x80,0x7b
+
+# ATT:   cmovnel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovne	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x45,0x4c,0x80,0x7b
+
+# ATT:   cmovneq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovne	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x45,0x4c,0x80,0x7b
+
+# ATT:   cmovow	%dx, %ax, %r9w
+# INTEL: cmovo	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x40,0xc2
+
+# ATT:   cmovol	%ecx, %edx, %r10d
+# INTEL: cmovo	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x40,0xd1
+
+# ATT:   cmovoq	%r9, %r15, %r11
+# INTEL: cmovo	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x40,0xf9
+
+# ATT:   cmovow	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovo	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x40,0x54,0x80,0x7b
+
+# ATT:   cmovol	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovo	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x40,0x4c,0x80,0x7b
+
+# ATT:   cmovoq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovo	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x40,0x4c,0x80,0x7b
+
+# ATT:   cmovpw	%dx, %ax, %r9w
+# INTEL: cmovp	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4a,0xc2
+
+# ATT:   cmovpl	%ecx, %edx, %r10d
+# INTEL: cmovp	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4a,0xd1
+
+# ATT:   cmovpq	%r9, %r15, %r11
+# INTEL: cmovp	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4a,0xf9
+
+# ATT:   cmovpw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovp	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4a,0x54,0x80,0x7b
+
+# ATT:   cmovpl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4a,0x4c,0x80,0x7b
+
+# ATT:   cmovpq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovp	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4a,0x4c,0x80,0x7b
+
+# ATT:   cmovsw	%dx, %ax, %r9w
+# INTEL: cmovs	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x48,0xc2
+
+# ATT:   cmovsl	%ecx, %edx, %r10d
+# INTEL: cmovs	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x48,0xd1
+
+# ATT:   cmovsq	%r9, %r15, %r11
+# INTEL: cmovs	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x48,0xf9
+
+# ATT:   cmovsw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovs	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x48,0x54,0x80,0x7b
+
+# ATT:   cmovsl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovs	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x48,0x4c,0x80,0x7b
+
+# ATT:   cmovsq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovs	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x48,0x4c,0x80,0x7b
+
+# ATT:   cmovew	%dx, %ax, %r9w
+# INTEL: cmove	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x44,0xc2
+
+# ATT:   cmovel	%ecx, %edx, %r10d
+# INTEL: cmove	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x44,0xd1
+
+# ATT:   cmoveq	%r9, %r15, %r11
+# INTEL: cmove	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x44,0xf9
+
+# ATT:   cmovew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmove	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x44,0x54,0x80,0x7b
+
+# ATT:   cmovel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmove	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x44,0x4c,0x80,0x7b
+
+# ATT:   cmoveq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmove	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x44,0x4c,0x80,0x7b
diff --git a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
index 1c1f70b096bed..1156f5c409922 100644
--- a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
@@ -11,6 +11,12 @@
 # INTEL: add	r18, qword ptr [r17 + 123], r16
 0x62,0xec,0xec,0x10,0x01,0x41,0x7b
 
+## MRMDestMemCC
+
+# ATT:   cfcmovbq %r16, 123(%r17,%r18,4)
+# INTEL: cfcmovb	qword ptr [r17 + 4*r18 + 123], r16
+0x62,0xec,0xf8,0x0c,0x42,0x44,0x91,0x7b
+
 ## MRMSrcMem
 
 # ATT:   vbroadcasti32x4	(%r16,%r17), %zmm0
@@ -21,6 +27,16 @@
 # INTEL: sub	r18, r17, qword ptr [r16 + 123]
 0x62,0xec,0xec,0x10,0x2b,0x48,0x7b
 
+## MRMSrcMemCC
+
+# ATT:   cfcmovbq	123(%r16,%r17,4), %r18
+# INTEL: cfcmovb	r18, qword ptr [r16 + 4*r17 + 123]
+0x62,0xec,0xf8,0x08,0x42,0x54,0x88,0x7b
+
+# ATT:   cfcmovbq	123(%r16,%r17,4), %r18, %r19
+# INTEL: cfcmovb	r19, r18, qword ptr [r16 + 4*r17 + 123]
+0x62,0xec,0xe0,0x14,0x42,0x54,0x88,0x7b
+
 ## MRM0m
 
 # ATT:   vprorq	$0, (%r16,%r17), %zmm0
@@ -123,12 +139,28 @@
 # INTEL: {nf}	add	r17, r16
 0x62,0xec,0xfc,0x0c,0x01,0xc1
 
+## MRMDestRegCC
+
+# ATT:   cfcmovbq	%r16, %r17
+# INTEL: cfcmovb	r17, r16
+0x62,0xec,0xfc,0x0c,0x42,0xc1
+
 ## MRMSrcReg
 
 # ATT:   mulxq	%r16, %r17, %r18
 # INTEL: mulx	r18, r17, r16
 0x62,0xea,0xf7,0x00,0xf6,0xd0
 
+## MRMSrcRegCC
+
+# ATT:   cfcmovbq	%r16, %r17, %r18
+# INTEL: cfcmovb	r18, r17, r16
+0x62,0xec,0xec,0x14,0x42,0xc8
+
+# ATT:   cfcmovlq	%r16, %r17, %r18
+# INTEL: cfcmovl	r18, r17, r16
+0x62,0xec,0xec,0x14,0x4c,0xc8
+
 ## MRMSrcReg4VOp3
 
 # ATT:   bzhiq	%r19, %r23, %r27
diff --git a/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt b/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt
index 9e812e370146b..fd12d904a4a46 100644
--- a/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt
@@ -430,3 +430,17 @@
 # ATT:   ccmpoq {dfv=}	%r16, %r17
 # INTEL: ccmpo {dfv=}	r17, r16
 0x62,0xec,0x84,0x00,0x3b,0xc8
+
+## cfcmov
+
+# ATT:   cfcmovbew	%r16w, %r17w
+# INTEL: cfcmovbe	r17w, r16w
+0x62,0xec,0x7d,0x08,0x46,0xc8
+
+# ATT:   cfcmovbel	%r16d, %r17d
+# INTEL: cfcmovbe	r17d, r16d
+0x62,0xec,0x7c,0x08,0x46,0xc8
+
+# ATT:   cfcmovbeq	%r16, %r17
+# INTEL: cfcmovbe	r17, r16
+0x62,0xec,0xfc,0x08,0x46,0xc8
diff --git a/llvm/test/MC/Hexagon/directive-attribute-err.s b/llvm/test/MC/Hexagon/directive-attribute-err.s
new file mode 100644
index 0000000000000..52b145b1ff28a
--- /dev/null
+++ b/llvm/test/MC/Hexagon/directive-attribute-err.s
@@ -0,0 +1,24 @@
+/// attribute parsing error cases.
+
+// RUN: not llvm-mc -triple=hexagon -filetype=asm %s 2>&1 \
+// RUN:   | FileCheck %s
+
+  .attribute Tag_unknown_name, 0
+// CHECK: [[#@LINE-1]]:14: error: attribute name not recognized: Tag_unknown_name
+// CHECK-NEXT:   .attribute Tag_unknown_name
+
+  .attribute [non_constant_expression], 0
+// CHECK: [[#@LINE-1]]:14: error: expected numeric constant
+// CHECK-NEXT:   .attribute [non_constant_expression], 0
+
+  .attribute 42, "forty two"
+// CHECK: [[#@LINE-1]]:18: error: expected numeric constant
+// CHECK-NEXT:   .attribute 42, "forty two"
+
+  .attribute Tag_arch, "v75"
+// CHECK: [[#@LINE-1]]:24: error: expected numeric constant
+// CHECK-NEXT:   .attribute Tag_arch, "v75"
+
+  .attribute 0
+// CHECK: :[[#@LINE-1]]:15: error: expected comma
+// CHECK-NEXT:   .attribute 0
diff --git a/llvm/test/MC/Hexagon/directive-attribute.s b/llvm/test/MC/Hexagon/directive-attribute.s
new file mode 100644
index 0000000000000..d7c893061e18a
--- /dev/null
+++ b/llvm/test/MC/Hexagon/directive-attribute.s
@@ -0,0 +1,41 @@
+/// Check .attribute parsing.
+
+// RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-readelf -A - | \
+// RUN:     FileCheck %s --match-full-lines --implicit-check-not={{.}}
+
+.attribute 4, 71 // Tag_arch
+.attribute Tag_cabac, 1
+.attribute Tag_hvx_arch, 68
+.attribute 7, 1 // Tag_hvx_qfloat
+
+//      CHECK: BuildAttributes {
+// CHECK-NEXT:   FormatVersion: 0x41
+// CHECK-NEXT:   Section 1 {
+// CHECK-NEXT:     SectionLength: 25
+// CHECK-NEXT:     Vendor: hexagon
+// CHECK-NEXT:     Tag: Tag_File (0x1)
+// CHECK-NEXT:     Size: 13
+// CHECK-NEXT:     FileAttributes {
+// CHECK-NEXT:       Attribute {
+// CHECK-NEXT:         Tag: 4
+// CHECK-NEXT:         TagName: arch
+// CHECK-NEXT:         Value: 71
+// CHECK-NEXT:       }
+// CHECK-NEXT:       Attribute {
+// CHECK-NEXT:         Tag: 10
+// CHECK-NEXT:         TagName: cabac
+// CHECK-NEXT:         Value: 1
+// CHECK-NEXT:       }
+// CHECK-NEXT:       Attribute {
+// CHECK-NEXT:         Tag: 5
+// CHECK-NEXT:         TagName: hvx_arch
+// CHECK-NEXT:         Value: 68
+// CHECK-NEXT:       }
+// CHECK-NEXT:       Attribute {
+// CHECK-NEXT:         Tag: 7
+// CHECK-NEXT:         TagName: hvx_qfloat
+// CHECK-NEXT:         Value: 1
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
diff --git a/llvm/test/MC/Hexagon/hexagon_attributes.s b/llvm/test/MC/Hexagon/hexagon_attributes.s
new file mode 100644
index 0000000000000..e90536030d977
--- /dev/null
+++ b/llvm/test/MC/Hexagon/hexagon_attributes.s
@@ -0,0 +1,94 @@
+/// Check that file attributes are recorded in a .hexagon.attributes section.
+
+q0&=vcmp.gt(v0.bf,v0.bf)     // hvxv73, hvx-qfloat
+r3:2=cround(r1:0,#0x0)       // v67, audio
+v3:0.w=vrmpyz(v0.b,r0.b)     // hvxv73, zreg
+v1:0.sf=vadd(v0.bf,v0.bf)    // hvxv73, hvx-ieee-fp
+
+// RUN: llvm-mc --mattr=+v67,+hvxv73,+hvx-qfloat,+hvx-ieee-fp,+zreg,+audio %s \
+// RUN:   -triple=hexagon -filetype=obj --hexagon-add-build-attributes -o %t.o
+
+// RUN: llvm-readelf -A %t.o | \
+// RUN:   FileCheck %s --match-full-lines --implicit-check-not={{.}} --check-prefix=READELF
+
+/// llvm-objudmp should be able to determine subtarget features
+/// without manually passing in features when an attribute section is present.
+// RUN: llvm-objdump -d %t.o | FileCheck %s --check-prefix=OBJDUMP
+
+// RUN: llvm-mc --mattr=+v67,+hvxv73,+hvx-qfloat,+hvx-ieee-fp,+zreg,+audio %s \
+// RUN:   -triple=hexagon -filetype=asm --hexagon-add-build-attributes | \
+// RUN:     FileCheck %s --match-full-lines --implicit-check-not={{.}} --check-prefix=ASM
+
+//      READELF: BuildAttributes {
+// READELF-NEXT:   FormatVersion: 0x41
+// READELF-NEXT:   Section 1 {
+// READELF-NEXT:     SectionLength: 31
+// READELF-NEXT:     Vendor: hexagon
+// READELF-NEXT:     Tag: Tag_File (0x1)
+// READELF-NEXT:     Size: 19
+// READELF-NEXT:     FileAttributes {
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 4
+// READELF-NEXT:         TagName: arch
+// READELF-NEXT:         Value: 67
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 5
+// READELF-NEXT:         TagName: hvx_arch
+// READELF-NEXT:         Value: 73
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 6
+// READELF-NEXT:         TagName: hvx_ieeefp
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 7
+// READELF-NEXT:         TagName: hvx_qfloat
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 8
+// READELF-NEXT:         TagName: zreg
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 9
+// READELF-NEXT:         TagName: audio
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 10
+// READELF-NEXT:         TagName: cabac
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:     }
+// READELF-NEXT:   }
+// READELF-NEXT: }
+
+//      OBJDUMP: 1c80e0d0 {      q0 &= vcmp.gt(v0.bf,v0.bf) }
+// OBJDUMP-NEXT: 8ce0c042 {      r3:2 = cround(r1:0,#0x0) }
+// OBJDUMP-NEXT: 19e8c000 {      v3:0.w = vrmpyz(v0.b,r0.b) }
+// OBJDUMP-NEXT: 1d40e0c0 {      v1:0.sf = vadd(v0.bf,v0.bf) }
+
+//      ASM: .attribute      4, 67   // Tag_arch
+// ASM-NEXT: .attribute      5, 73   // Tag_hvx_arch
+// ASM-NEXT: .attribute      6, 1    // Tag_hvx_ieeefp
+// ASM-NEXT: .attribute      7, 1    // Tag_hvx_qfloat
+// ASM-NEXT: .attribute      8, 1    // Tag_zreg
+// ASM-NEXT: .attribute      9, 1    // Tag_audio
+// ASM-NEXT: .attribute      10, 1   // Tag_cabac
+// ASM-NEXT:        .text
+// ASM-EMPTY:
+// ASM-NEXT:        {
+// ASM-NEXT:                q0 &= vcmp.gt(v0.bf,v0.bf)
+// ASM-NEXT:        }
+// ASM-NEXT:        {
+// ASM-NEXT:                r3:2 = cround(r1:0,#0)
+// ASM-NEXT:        }
+// ASM-NEXT:        {
+// ASM-NEXT:                v3:0.w = vrmpyz(v0.b,r0.b)
+// ASM-NEXT:        }
+// ASM-NEXT:        {
+// ASM-NEXT:                v1:0.sf = vadd(v0.bf,v0.bf)
+// ASM-NEXT:        }
diff --git a/llvm/test/MC/LoongArch/Macros/macros-li-bad.s b/llvm/test/MC/LoongArch/Macros/macros-li-bad.s
index 194b86bfed273..c880a01020059 100644
--- a/llvm/test/MC/LoongArch/Macros/macros-li-bad.s
+++ b/llvm/test/MC/LoongArch/Macros/macros-li-bad.s
@@ -5,3 +5,9 @@ li.w $a0, 0x100000000
 
 li.d $a0, 0x10000000000000000
 # CHECK: :[[#@LINE-1]]:11: error: unknown operand
+
+li.w $a0, non_const_val
+# CHECK: :[[#@LINE-1]]:11: error: operand must be a 32 bit immediate
+
+li.d $a0, non_const_val
+# CHECK: :[[#@LINE-1]]:11: error: operand must be a 64 bit immediate
diff --git a/llvm/test/MC/RISCV/rv32-machine-csr-names.s b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
index e7a6d9ce718f2..016f448d8dac6 100644
--- a/llvm/test/MC/RISCV/rv32-machine-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
@@ -1149,3 +1149,59 @@ csrrs t2, 0x319, zero
 csrrs t1, miph, zero
 # uimm12
 csrrs t2, 0x354, zero
+
+################################################
+# Resumable Non-Maskable Interrupts(Smrnmi) CSRs
+################################################
+
+# mnscratch
+# name
+# CHECK-INST: csrrs t1, mnscratch, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x00,0x74]
+# CHECK-INST-ALIAS: csrr t1, mnscratch
+# uimm12
+# CHECK-INST: csrrs t2, mnscratch, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x00,0x74]
+# CHECK-INST-ALIAS: csrr t2, mnscratch
+csrrs t1, mnscratch, zero
+# uimm12
+csrrs t2, 0x740, zero
+
+# mnepc
+# name
+# CHECK-INST: csrrs t1, mnepc, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x10,0x74]
+# CHECK-INST-ALIAS: csrr t1, mnepc
+# uimm12
+# CHECK-INST: csrrs t2, mnepc, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x10,0x74]
+# CHECK-INST-ALIAS: csrr t2, mnepc
+csrrs t1, mnepc, zero
+# uimm12
+csrrs t2, 0x741, zero
+
+# mncause
+# name
+# CHECK-INST: csrrs t1, mncause, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x20,0x74]
+# CHECK-INST-ALIAS: csrr t1, mncause
+# uimm12
+# CHECK-INST: csrrs t2, mncause, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x20,0x74]
+# CHECK-INST-ALIAS: csrr t2, mncause
+csrrs t1, mncause, zero
+# uimm12
+csrrs t2, 0x742, zero
+
+# mnstatus
+# name
+# CHECK-INST: csrrs t1, mnstatus, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x40,0x74]
+# CHECK-INST-ALIAS: csrr t1, mnstatus
+# uimm12
+# CHECK-INST: csrrs t2, mnstatus, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x40,0x74]
+# CHECK-INST-ALIAS: csrr t2, mnstatus
+csrrs t1, mnstatus, zero
+# uimm12
+csrrs t2, 0x744, zero
diff --git a/llvm/test/MC/RISCV/xsifive-invalid.s b/llvm/test/MC/RISCV/xsifive-invalid.s
new file mode 100644
index 0000000000000..5210d29f9d363
--- /dev/null
+++ b/llvm/test/MC/RISCV/xsifive-invalid.s
@@ -0,0 +1,20 @@
+# RUN: not llvm-mc -triple riscv32 < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple riscv64 < %s 2>&1 | FileCheck %s
+
+sf.cflush.d.l1 0x10 # CHECK: :[[@LINE]]:16: error: invalid operand for instruction
+
+sf.cdiscard.d.l1 0x10 # CHECK: :[[@LINE]]:18: error: invalid operand for instruction
+
+sf.cflush.d.l1 x0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSiFivecflushdlone' (SiFive sf.cflush.d.l1 Instruction){{$}}
+
+sf.cflush.d.l1 x7 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSiFivecflushdlone' (SiFive sf.cflush.d.l1 Instruction){{$}}
+
+sf.cdiscard.d.l1 x0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSiFivecdiscarddlone' (SiFive sf.cdiscard.d.l1 Instruction){{$}}
+
+sf.cdiscard.d.l1 x7 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSiFivecdiscarddlone' (SiFive sf.cdiscard.d.l1 Instruction){{$}}
+
+sf.cease x1 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction
+
+sf.cease 0x10 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction
+
+sf.cease # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSfcease' (SiFive sf.cease Instruction){{$}}
diff --git a/llvm/test/MC/RISCV/xsifive-valid.s b/llvm/test/MC/RISCV/xsifive-valid.s
new file mode 100644
index 0000000000000..8aa0ab1bd8ba3
--- /dev/null
+++ b/llvm/test/MC/RISCV/xsifive-valid.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease < %s \
+# RUN:     | llvm-objdump --mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease < %s \
+# RUN:     | llvm-objdump --mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: sf.cflush.d.l1     zero
+# CHECK-ENC: encoding: [0x73,0x00,0x00,0xfc]
+sf.cflush.d.l1 x0
+# CHECK-INST: sf.cflush.d.l1     zero
+# CHECK-ENC: encoding: [0x73,0x00,0x00,0xfc]
+sf.cflush.d.l1
+
+# CHECK-INST: sf.cflush.d.l1     t2
+# CHECK-ENC: encoding: [0x73,0x80,0x03,0xfc]
+sf.cflush.d.l1 x7
+
+# CHECK-INST: sf.cdiscard.d.l1   zero
+# CHECK-ENC: encoding: [0x73,0x00,0x20,0xfc]
+sf.cdiscard.d.l1 x0
+# CHECK-INST: sf.cdiscard.d.l1   zero
+# CHECK-ENC: encoding: [0x73,0x00,0x20,0xfc]
+sf.cdiscard.d.l1
+
+# CHECK-INST: sf.cdiscard.d.l1   t2
+# CHECK-ENC: encoding: [0x73,0x80,0x23,0xfc]
+sf.cdiscard.d.l1 x7
+
+# CHECK-INST: sf.cease
+# CHECK-ENC: encoding: [0x73,0x00,0x50,0x30]
+sf.cease
diff --git a/llvm/test/MC/X86/apx/cfcmov-att.s b/llvm/test/MC/X86/apx/cfcmov-att.s
new file mode 100644
index 0000000000000..a6947ca7bc0de
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cfcmov-att.s
@@ -0,0 +1,841 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+
+# CHECK: cfcmovbw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x42,0xe9]
+         cfcmovbw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovbw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x42,0xcd]
+         cfcmovbw	%r17w, %r21w
+
+# CHECK: cfcmovbw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x42,0xf2]
+         cfcmovbl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovbl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x42,0xd6]
+         cfcmovbl	%r18d, %r22d
+
+# CHECK: cfcmovbl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x42,0xfb]
+         cfcmovbq	%r19, %r23, %r27
+
+# CHECK: cfcmovbq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x42,0xdf]
+         cfcmovbq	%r19, %r23
+
+# CHECK: cfcmovbq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovbw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovbl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovbl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovbq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovbq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovbew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x46,0xe9]
+         cfcmovbew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovbew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x46,0xcd]
+         cfcmovbew	%r17w, %r21w
+
+# CHECK: cfcmovbew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x46,0xf2]
+         cfcmovbel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovbel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x46,0xd6]
+         cfcmovbel	%r18d, %r22d
+
+# CHECK: cfcmovbel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbeq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x46,0xfb]
+         cfcmovbeq	%r19, %r23, %r27
+
+# CHECK: cfcmovbeq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x46,0xdf]
+         cfcmovbeq	%r19, %r23
+
+# CHECK: cfcmovbeq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbeq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovbew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovbel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovbel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovbeq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbeq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovbeq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbeq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovlw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4c,0xe9]
+         cfcmovlw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovlw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4c,0xcd]
+         cfcmovlw	%r17w, %r21w
+
+# CHECK: cfcmovlw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovll	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4c,0xf2]
+         cfcmovll	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovll	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4c,0xd6]
+         cfcmovll	%r18d, %r22d
+
+# CHECK: cfcmovll	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovll	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovlq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4c,0xfb]
+         cfcmovlq	%r19, %r23, %r27
+
+# CHECK: cfcmovlq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4c,0xdf]
+         cfcmovlq	%r19, %r23
+
+# CHECK: cfcmovlq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovlw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovlw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovll	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovll	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovll	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovll	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovlq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovlq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovlew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4e,0xe9]
+         cfcmovlew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovlew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4e,0xcd]
+         cfcmovlew	%r17w, %r21w
+
+# CHECK: cfcmovlew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovlel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4e,0xf2]
+         cfcmovlel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovlel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4e,0xd6]
+         cfcmovlel	%r18d, %r22d
+
+# CHECK: cfcmovlel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovleq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4e,0xfb]
+         cfcmovleq	%r19, %r23, %r27
+
+# CHECK: cfcmovleq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4e,0xdf]
+         cfcmovleq	%r19, %r23
+
+# CHECK: cfcmovleq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovleq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovlew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovlew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovlel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovlel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovleq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovleq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovleq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovleq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovaew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x43,0xe9]
+         cfcmovaew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovaew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x43,0xcd]
+         cfcmovaew	%r17w, %r21w
+
+# CHECK: cfcmovaew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovael	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x43,0xf2]
+         cfcmovael	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovael	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x43,0xd6]
+         cfcmovael	%r18d, %r22d
+
+# CHECK: cfcmovael	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovael	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovaeq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x43,0xfb]
+         cfcmovaeq	%r19, %r23, %r27
+
+# CHECK: cfcmovaeq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x43,0xdf]
+         cfcmovaeq	%r19, %r23
+
+# CHECK: cfcmovaeq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaeq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovaew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovaew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovael	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovael	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovael	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovael	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovaeq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaeq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovaeq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaeq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovaw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x47,0xe9]
+         cfcmovaw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovaw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x47,0xcd]
+         cfcmovaw	%r17w, %r21w
+
+# CHECK: cfcmovaw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmoval	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x47,0xf2]
+         cfcmoval	%r18d, %r22d, %r26d
+
+# CHECK: cfcmoval	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x47,0xd6]
+         cfcmoval	%r18d, %r22d
+
+# CHECK: cfcmoval	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmoval	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovaq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x47,0xfb]
+         cfcmovaq	%r19, %r23, %r27
+
+# CHECK: cfcmovaq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x47,0xdf]
+         cfcmovaq	%r19, %r23
+
+# CHECK: cfcmovaq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovaw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovaw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmoval	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmoval	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmoval	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmoval	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovaq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovaq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovgew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4d,0xe9]
+         cfcmovgew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovgew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4d,0xcd]
+         cfcmovgew	%r17w, %r21w
+
+# CHECK: cfcmovgew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovgel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4d,0xf2]
+         cfcmovgel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovgel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4d,0xd6]
+         cfcmovgel	%r18d, %r22d
+
+# CHECK: cfcmovgel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovgeq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4d,0xfb]
+         cfcmovgeq	%r19, %r23, %r27
+
+# CHECK: cfcmovgeq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4d,0xdf]
+         cfcmovgeq	%r19, %r23
+
+# CHECK: cfcmovgeq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgeq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovgew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovgew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovgel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovgel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovgeq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgeq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovgeq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgeq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovnow	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x41,0xe9]
+         cfcmovnow	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovnow	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x41,0xcd]
+         cfcmovnow	%r17w, %r21w
+
+# CHECK: cfcmovnow	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnow	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnol	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x41,0xf2]
+         cfcmovnol	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovnol	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x41,0xd6]
+         cfcmovnol	%r18d, %r22d
+
+# CHECK: cfcmovnol	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnol	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnoq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x41,0xfb]
+         cfcmovnoq	%r19, %r23, %r27
+
+# CHECK: cfcmovnoq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x41,0xdf]
+         cfcmovnoq	%r19, %r23
+
+# CHECK: cfcmovnoq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnoq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnow	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnow	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovnow	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnow	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovnol	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnol	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovnol	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnol	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovnoq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnoq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovnoq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnoq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovnpw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4b,0xe9]
+         cfcmovnpw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovnpw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4b,0xcd]
+         cfcmovnpw	%r17w, %r21w
+
+# CHECK: cfcmovnpw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnpl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4b,0xf2]
+         cfcmovnpl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovnpl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4b,0xd6]
+         cfcmovnpl	%r18d, %r22d
+
+# CHECK: cfcmovnpl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnpq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4b,0xfb]
+         cfcmovnpq	%r19, %r23, %r27
+
+# CHECK: cfcmovnpq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4b,0xdf]
+         cfcmovnpq	%r19, %r23
+
+# CHECK: cfcmovnpq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnpw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovnpw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovnpl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovnpl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovnpq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovnpq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovnsw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x49,0xe9]
+         cfcmovnsw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovnsw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x49,0xcd]
+         cfcmovnsw	%r17w, %r21w
+
+# CHECK: cfcmovnsw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnsl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x49,0xf2]
+         cfcmovnsl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovnsl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x49,0xd6]
+         cfcmovnsl	%r18d, %r22d
+
+# CHECK: cfcmovnsl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnsq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x49,0xfb]
+         cfcmovnsq	%r19, %r23, %r27
+
+# CHECK: cfcmovnsq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x49,0xdf]
+         cfcmovnsq	%r19, %r23
+
+# CHECK: cfcmovnsq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnsw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovnsw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovnsl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovnsl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovnsq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovnsq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovnew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x45,0xe9]
+         cfcmovnew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovnew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x45,0xcd]
+         cfcmovnew	%r17w, %r21w
+
+# CHECK: cfcmovnew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x45,0xf2]
+         cfcmovnel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovnel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x45,0xd6]
+         cfcmovnel	%r18d, %r22d
+
+# CHECK: cfcmovnel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovneq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x45,0xfb]
+         cfcmovneq	%r19, %r23, %r27
+
+# CHECK: cfcmovneq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x45,0xdf]
+         cfcmovneq	%r19, %r23
+
+# CHECK: cfcmovneq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovneq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovnew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovnel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovnel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovneq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovneq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovneq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovneq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovpw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4a,0xe9]
+         cfcmovpw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovpw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4a,0xcd]
+         cfcmovpw	%r17w, %r21w
+
+# CHECK: cfcmovpw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovpl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4a,0xf2]
+         cfcmovpl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovpl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4a,0xd6]
+         cfcmovpl	%r18d, %r22d
+
+# CHECK: cfcmovpl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovpq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4a,0xfb]
+         cfcmovpq	%r19, %r23, %r27
+
+# CHECK: cfcmovpq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4a,0xdf]
+         cfcmovpq	%r19, %r23
+
+# CHECK: cfcmovpq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovpw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovpw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovpl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovpl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovpq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovpq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovsw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x48,0xe9]
+         cfcmovsw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovsw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x48,0xcd]
+         cfcmovsw	%r17w, %r21w
+
+# CHECK: cfcmovsw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovsl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x48,0xf2]
+         cfcmovsl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovsl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x48,0xd6]
+         cfcmovsl	%r18d, %r22d
+
+# CHECK: cfcmovsl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovsq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x48,0xfb]
+         cfcmovsq	%r19, %r23, %r27
+
+# CHECK: cfcmovsq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x48,0xdf]
+         cfcmovsq	%r19, %r23
+
+# CHECK: cfcmovsq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovsw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovsw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovsl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovsl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovsq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovsq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x44,0xe9]
+         cfcmovew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x44,0xcd]
+         cfcmovew	%r17w, %r21w
+
+# CHECK: cfcmovew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x44,0xf2]
+         cfcmovel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x44,0xd6]
+         cfcmovel	%r18d, %r22d
+
+# CHECK: cfcmovel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmoveq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x44,0xfb]
+         cfcmoveq	%r19, %r23, %r27
+
+# CHECK: cfcmoveq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x44,0xdf]
+         cfcmoveq	%r19, %r23
+
+# CHECK: cfcmoveq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmoveq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmoveq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmoveq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmoveq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmoveq	291(%r28,%r29,4), %r19
diff --git a/llvm/test/MC/X86/apx/cfcmov-intel.s b/llvm/test/MC/X86/apx/cfcmov-intel.s
new file mode 100644
index 0000000000000..58b145d7fef2e
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cfcmov-intel.s
@@ -0,0 +1,841 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: cfcmovb	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x42,0xe9]
+         cfcmovb	r25w, r21w, r17w
+
+# CHECK: cfcmovb	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x42,0xcd]
+         cfcmovb	r21w, r17w
+
+# CHECK: cfcmovb	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovb	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x42,0xf2]
+         cfcmovb	r26d, r22d, r18d
+
+# CHECK: cfcmovb	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x42,0xd6]
+         cfcmovb	r22d, r18d
+
+# CHECK: cfcmovb	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovb	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x42,0xfb]
+         cfcmovb	r27, r23, r19
+
+# CHECK: cfcmovb	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x42,0xdf]
+         cfcmovb	r23, r19
+
+# CHECK: cfcmovb	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovb	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x46,0xe9]
+         cfcmovbe	r25w, r21w, r17w
+
+# CHECK: cfcmovbe	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x46,0xcd]
+         cfcmovbe	r21w, r17w
+
+# CHECK: cfcmovbe	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovbe	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x46,0xf2]
+         cfcmovbe	r26d, r22d, r18d
+
+# CHECK: cfcmovbe	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x46,0xd6]
+         cfcmovbe	r22d, r18d
+
+# CHECK: cfcmovbe	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovbe	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x46,0xfb]
+         cfcmovbe	r27, r23, r19
+
+# CHECK: cfcmovbe	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x46,0xdf]
+         cfcmovbe	r23, r19
+
+# CHECK: cfcmovbe	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovbe	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4c,0xe9]
+         cfcmovl	r25w, r21w, r17w
+
+# CHECK: cfcmovl	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4c,0xcd]
+         cfcmovl	r21w, r17w
+
+# CHECK: cfcmovl	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovl	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4c,0xf2]
+         cfcmovl	r26d, r22d, r18d
+
+# CHECK: cfcmovl	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4c,0xd6]
+         cfcmovl	r22d, r18d
+
+# CHECK: cfcmovl	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovl	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4c,0xfb]
+         cfcmovl	r27, r23, r19
+
+# CHECK: cfcmovl	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4c,0xdf]
+         cfcmovl	r23, r19
+
+# CHECK: cfcmovl	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovl	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4e,0xe9]
+         cfcmovle	r25w, r21w, r17w
+
+# CHECK: cfcmovle	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4e,0xcd]
+         cfcmovle	r21w, r17w
+
+# CHECK: cfcmovle	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovle	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4e,0xf2]
+         cfcmovle	r26d, r22d, r18d
+
+# CHECK: cfcmovle	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4e,0xd6]
+         cfcmovle	r22d, r18d
+
+# CHECK: cfcmovle	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovle	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4e,0xfb]
+         cfcmovle	r27, r23, r19
+
+# CHECK: cfcmovle	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4e,0xdf]
+         cfcmovle	r23, r19
+
+# CHECK: cfcmovle	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovle	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x43,0xe9]
+         cfcmovae	r25w, r21w, r17w
+
+# CHECK: cfcmovae	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x43,0xcd]
+         cfcmovae	r21w, r17w
+
+# CHECK: cfcmovae	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovae	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x43,0xf2]
+         cfcmovae	r26d, r22d, r18d
+
+# CHECK: cfcmovae	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x43,0xd6]
+         cfcmovae	r22d, r18d
+
+# CHECK: cfcmovae	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovae	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x43,0xfb]
+         cfcmovae	r27, r23, r19
+
+# CHECK: cfcmovae	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x43,0xdf]
+         cfcmovae	r23, r19
+
+# CHECK: cfcmovae	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovae	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x47,0xe9]
+         cfcmova	r25w, r21w, r17w
+
+# CHECK: cfcmova	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x47,0xcd]
+         cfcmova	r21w, r17w
+
+# CHECK: cfcmova	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmova	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x47,0xf2]
+         cfcmova	r26d, r22d, r18d
+
+# CHECK: cfcmova	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x47,0xd6]
+         cfcmova	r22d, r18d
+
+# CHECK: cfcmova	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmova	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x47,0xfb]
+         cfcmova	r27, r23, r19
+
+# CHECK: cfcmova	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x47,0xdf]
+         cfcmova	r23, r19
+
+# CHECK: cfcmova	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmova	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4d,0xe9]
+         cfcmovge	r25w, r21w, r17w
+
+# CHECK: cfcmovge	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4d,0xcd]
+         cfcmovge	r21w, r17w
+
+# CHECK: cfcmovge	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovge	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4d,0xf2]
+         cfcmovge	r26d, r22d, r18d
+
+# CHECK: cfcmovge	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4d,0xd6]
+         cfcmovge	r22d, r18d
+
+# CHECK: cfcmovge	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovge	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4d,0xfb]
+         cfcmovge	r27, r23, r19
+
+# CHECK: cfcmovge	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4d,0xdf]
+         cfcmovge	r23, r19
+
+# CHECK: cfcmovge	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovge	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x41,0xe9]
+         cfcmovno	r25w, r21w, r17w
+
+# CHECK: cfcmovno	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x41,0xcd]
+         cfcmovno	r21w, r17w
+
+# CHECK: cfcmovno	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovno	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x41,0xf2]
+         cfcmovno	r26d, r22d, r18d
+
+# CHECK: cfcmovno	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x41,0xd6]
+         cfcmovno	r22d, r18d
+
+# CHECK: cfcmovno	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovno	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x41,0xfb]
+         cfcmovno	r27, r23, r19
+
+# CHECK: cfcmovno	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x41,0xdf]
+         cfcmovno	r23, r19
+
+# CHECK: cfcmovno	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovno	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4b,0xe9]
+         cfcmovnp	r25w, r21w, r17w
+
+# CHECK: cfcmovnp	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4b,0xcd]
+         cfcmovnp	r21w, r17w
+
+# CHECK: cfcmovnp	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovnp	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4b,0xf2]
+         cfcmovnp	r26d, r22d, r18d
+
+# CHECK: cfcmovnp	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4b,0xd6]
+         cfcmovnp	r22d, r18d
+
+# CHECK: cfcmovnp	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovnp	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4b,0xfb]
+         cfcmovnp	r27, r23, r19
+
+# CHECK: cfcmovnp	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4b,0xdf]
+         cfcmovnp	r23, r19
+
+# CHECK: cfcmovnp	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovnp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x49,0xe9]
+         cfcmovns	r25w, r21w, r17w
+
+# CHECK: cfcmovns	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x49,0xcd]
+         cfcmovns	r21w, r17w
+
+# CHECK: cfcmovns	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovns	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x49,0xf2]
+         cfcmovns	r26d, r22d, r18d
+
+# CHECK: cfcmovns	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x49,0xd6]
+         cfcmovns	r22d, r18d
+
+# CHECK: cfcmovns	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovns	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x49,0xfb]
+         cfcmovns	r27, r23, r19
+
+# CHECK: cfcmovns	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x49,0xdf]
+         cfcmovns	r23, r19
+
+# CHECK: cfcmovns	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovns	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x45,0xe9]
+         cfcmovne	r25w, r21w, r17w
+
+# CHECK: cfcmovne	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x45,0xcd]
+         cfcmovne	r21w, r17w
+
+# CHECK: cfcmovne	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovne	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x45,0xf2]
+         cfcmovne	r26d, r22d, r18d
+
+# CHECK: cfcmovne	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x45,0xd6]
+         cfcmovne	r22d, r18d
+
+# CHECK: cfcmovne	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovne	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x45,0xfb]
+         cfcmovne	r27, r23, r19
+
+# CHECK: cfcmovne	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x45,0xdf]
+         cfcmovne	r23, r19
+
+# CHECK: cfcmovne	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovne	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4a,0xe9]
+         cfcmovp	r25w, r21w, r17w
+
+# CHECK: cfcmovp	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4a,0xcd]
+         cfcmovp	r21w, r17w
+
+# CHECK: cfcmovp	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovp	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4a,0xf2]
+         cfcmovp	r26d, r22d, r18d
+
+# CHECK: cfcmovp	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4a,0xd6]
+         cfcmovp	r22d, r18d
+
+# CHECK: cfcmovp	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovp	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4a,0xfb]
+         cfcmovp	r27, r23, r19
+
+# CHECK: cfcmovp	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4a,0xdf]
+         cfcmovp	r23, r19
+
+# CHECK: cfcmovp	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x48,0xe9]
+         cfcmovs	r25w, r21w, r17w
+
+# CHECK: cfcmovs	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x48,0xcd]
+         cfcmovs	r21w, r17w
+
+# CHECK: cfcmovs	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovs	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x48,0xf2]
+         cfcmovs	r26d, r22d, r18d
+
+# CHECK: cfcmovs	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x48,0xd6]
+         cfcmovs	r22d, r18d
+
+# CHECK: cfcmovs	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovs	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x48,0xfb]
+         cfcmovs	r27, r23, r19
+
+# CHECK: cfcmovs	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x48,0xdf]
+         cfcmovs	r23, r19
+
+# CHECK: cfcmovs	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovs	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x44,0xe9]
+         cfcmove	r25w, r21w, r17w
+
+# CHECK: cfcmove	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x44,0xcd]
+         cfcmove	r21w, r17w
+
+# CHECK: cfcmove	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmove	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x44,0xf2]
+         cfcmove	r26d, r22d, r18d
+
+# CHECK: cfcmove	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x44,0xd6]
+         cfcmove	r22d, r18d
+
+# CHECK: cfcmove	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmove	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x44,0xfb]
+         cfcmove	r27, r23, r19
+
+# CHECK: cfcmove	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x44,0xdf]
+         cfcmove	r23, r19
+
+# CHECK: cfcmove	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmove	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r19, qword ptr [r28 + 4*r29 + 291]
diff --git a/llvm/test/MC/X86/apx/cmov-att.s b/llvm/test/MC/X86/apx/cmov-att.s
new file mode 100644
index 0000000000000..4b8e6786d8015
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cmov-att.s
@@ -0,0 +1,293 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-96: error:
+# ERROR-NOT: error:
+# CHECK: cmovbw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x42,0xc2]
+         cmovbw	%dx, %ax, %r9w
+# CHECK: cmovbl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x42,0xd1]
+         cmovbl	%ecx, %edx, %r10d
+# CHECK: cmovbq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x42,0xf9]
+         cmovbq	%r9, %r15, %r11
+# CHECK: cmovbw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x42,0x54,0x80,0x7b]
+         cmovbw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovbl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x42,0x4c,0x80,0x7b]
+         cmovbl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovbq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x42,0x4c,0x80,0x7b]
+         cmovbq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovbew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x46,0xc2]
+         cmovbew	%dx, %ax, %r9w
+# CHECK: cmovbel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x46,0xd1]
+         cmovbel	%ecx, %edx, %r10d
+# CHECK: cmovbeq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x46,0xf9]
+         cmovbeq	%r9, %r15, %r11
+# CHECK: cmovbew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x46,0x54,0x80,0x7b]
+         cmovbew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovbel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x46,0x4c,0x80,0x7b]
+         cmovbel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovbeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x46,0x4c,0x80,0x7b]
+         cmovbeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovlw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4c,0xc2]
+         cmovlw	%dx, %ax, %r9w
+# CHECK: cmovll	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4c,0xd1]
+         cmovll	%ecx, %edx, %r10d
+# CHECK: cmovlq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4c,0xf9]
+         cmovlq	%r9, %r15, %r11
+# CHECK: cmovlw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4c,0x54,0x80,0x7b]
+         cmovlw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovll	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4c,0x4c,0x80,0x7b]
+         cmovll	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovlq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4c,0x4c,0x80,0x7b]
+         cmovlq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovlew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4e,0xc2]
+         cmovlew	%dx, %ax, %r9w
+# CHECK: cmovlel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4e,0xd1]
+         cmovlel	%ecx, %edx, %r10d
+# CHECK: cmovleq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4e,0xf9]
+         cmovleq	%r9, %r15, %r11
+# CHECK: cmovlew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4e,0x54,0x80,0x7b]
+         cmovlew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovlel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4e,0x4c,0x80,0x7b]
+         cmovlel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovleq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4e,0x4c,0x80,0x7b]
+         cmovleq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovaew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x43,0xc2]
+         cmovaew	%dx, %ax, %r9w
+# CHECK: cmovael	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x43,0xd1]
+         cmovael	%ecx, %edx, %r10d
+# CHECK: cmovaeq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x43,0xf9]
+         cmovaeq	%r9, %r15, %r11
+# CHECK: cmovaew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x43,0x54,0x80,0x7b]
+         cmovaew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovael	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x43,0x4c,0x80,0x7b]
+         cmovael	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovaeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x43,0x4c,0x80,0x7b]
+         cmovaeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovaw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x47,0xc2]
+         cmovaw	%dx, %ax, %r9w
+# CHECK: cmoval	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x47,0xd1]
+         cmoval	%ecx, %edx, %r10d
+# CHECK: cmovaq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x47,0xf9]
+         cmovaq	%r9, %r15, %r11
+# CHECK: cmovaw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x47,0x54,0x80,0x7b]
+         cmovaw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmoval	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x47,0x4c,0x80,0x7b]
+         cmoval	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovaq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x47,0x4c,0x80,0x7b]
+         cmovaq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovgew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4d,0xc2]
+         cmovgew	%dx, %ax, %r9w
+# CHECK: cmovgel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4d,0xd1]
+         cmovgel	%ecx, %edx, %r10d
+# CHECK: cmovgeq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4d,0xf9]
+         cmovgeq	%r9, %r15, %r11
+# CHECK: cmovgew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4d,0x54,0x80,0x7b]
+         cmovgew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovgel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4d,0x4c,0x80,0x7b]
+         cmovgel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovgeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4d,0x4c,0x80,0x7b]
+         cmovgeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovgw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4f,0xc2]
+         cmovgw	%dx, %ax, %r9w
+# CHECK: cmovgl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4f,0xd1]
+         cmovgl	%ecx, %edx, %r10d
+# CHECK: cmovgq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4f,0xf9]
+         cmovgq	%r9, %r15, %r11
+# CHECK: cmovgw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4f,0x54,0x80,0x7b]
+         cmovgw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovgl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4f,0x4c,0x80,0x7b]
+         cmovgl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovgq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4f,0x4c,0x80,0x7b]
+         cmovgq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovnow	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x41,0xc2]
+         cmovnow	%dx, %ax, %r9w
+# CHECK: cmovnol	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x41,0xd1]
+         cmovnol	%ecx, %edx, %r10d
+# CHECK: cmovnoq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x41,0xf9]
+         cmovnoq	%r9, %r15, %r11
+# CHECK: cmovnow	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x41,0x54,0x80,0x7b]
+         cmovnow	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovnol	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x41,0x4c,0x80,0x7b]
+         cmovnol	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovnoq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x41,0x4c,0x80,0x7b]
+         cmovnoq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovnpw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4b,0xc2]
+         cmovnpw	%dx, %ax, %r9w
+# CHECK: cmovnpl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4b,0xd1]
+         cmovnpl	%ecx, %edx, %r10d
+# CHECK: cmovnpq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4b,0xf9]
+         cmovnpq	%r9, %r15, %r11
+# CHECK: cmovnpw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4b,0x54,0x80,0x7b]
+         cmovnpw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovnpl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4b,0x4c,0x80,0x7b]
+         cmovnpl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovnpq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4b,0x4c,0x80,0x7b]
+         cmovnpq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovnsw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x49,0xc2]
+         cmovnsw	%dx, %ax, %r9w
+# CHECK: cmovnsl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x49,0xd1]
+         cmovnsl	%ecx, %edx, %r10d
+# CHECK: cmovnsq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x49,0xf9]
+         cmovnsq	%r9, %r15, %r11
+# CHECK: cmovnsw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x49,0x54,0x80,0x7b]
+         cmovnsw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovnsl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x49,0x4c,0x80,0x7b]
+         cmovnsl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovnsq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x49,0x4c,0x80,0x7b]
+         cmovnsq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovnew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x45,0xc2]
+         cmovnew	%dx, %ax, %r9w
+# CHECK: cmovnel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x45,0xd1]
+         cmovnel	%ecx, %edx, %r10d
+# CHECK: cmovneq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x45,0xf9]
+         cmovneq	%r9, %r15, %r11
+# CHECK: cmovnew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x45,0x54,0x80,0x7b]
+         cmovnew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovnel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x45,0x4c,0x80,0x7b]
+         cmovnel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovneq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x45,0x4c,0x80,0x7b]
+         cmovneq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovow	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x40,0xc2]
+         cmovow	%dx, %ax, %r9w
+# CHECK: cmovol	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x40,0xd1]
+         cmovol	%ecx, %edx, %r10d
+# CHECK: cmovoq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x40,0xf9]
+         cmovoq	%r9, %r15, %r11
+# CHECK: cmovow	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x40,0x54,0x80,0x7b]
+         cmovow	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovol	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x40,0x4c,0x80,0x7b]
+         cmovol	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovoq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x40,0x4c,0x80,0x7b]
+         cmovoq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovpw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4a,0xc2]
+         cmovpw	%dx, %ax, %r9w
+# CHECK: cmovpl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4a,0xd1]
+         cmovpl	%ecx, %edx, %r10d
+# CHECK: cmovpq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4a,0xf9]
+         cmovpq	%r9, %r15, %r11
+# CHECK: cmovpw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4a,0x54,0x80,0x7b]
+         cmovpw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovpl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4a,0x4c,0x80,0x7b]
+         cmovpl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovpq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4a,0x4c,0x80,0x7b]
+         cmovpq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovsw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x48,0xc2]
+         cmovsw	%dx, %ax, %r9w
+# CHECK: cmovsl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x48,0xd1]
+         cmovsl	%ecx, %edx, %r10d
+# CHECK: cmovsq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x48,0xf9]
+         cmovsq	%r9, %r15, %r11
+# CHECK: cmovsw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x48,0x54,0x80,0x7b]
+         cmovsw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovsl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x48,0x4c,0x80,0x7b]
+         cmovsl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovsq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x48,0x4c,0x80,0x7b]
+         cmovsq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x44,0xc2]
+         cmovew	%dx, %ax, %r9w
+# CHECK: cmovel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x44,0xd1]
+         cmovel	%ecx, %edx, %r10d
+# CHECK: cmoveq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x44,0xf9]
+         cmoveq	%r9, %r15, %r11
+# CHECK: cmovew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x44,0x54,0x80,0x7b]
+         cmovew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x44,0x4c,0x80,0x7b]
+         cmovel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmoveq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x44,0x4c,0x80,0x7b]
+         cmoveq	123(%r8,%rax,4), %r9, %r15
diff --git a/llvm/test/MC/X86/apx/cmov-intel.s b/llvm/test/MC/X86/apx/cmov-intel.s
new file mode 100644
index 0000000000000..f481346bc3470
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cmov-intel.s
@@ -0,0 +1,290 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: cmovb	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x42,0xc2]
+         cmovb	r9w, ax, dx
+# CHECK: cmovb	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x42,0xd1]
+         cmovb	r10d, edx, ecx
+# CHECK: cmovb	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x42,0xf9]
+         cmovb	r11, r15, r9
+# CHECK: cmovb	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x42,0x54,0x80,0x7b]
+         cmovb	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovb	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x42,0x4c,0x80,0x7b]
+         cmovb	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovb	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x42,0x4c,0x80,0x7b]
+         cmovb	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovbe	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x46,0xc2]
+         cmovbe	r9w, ax, dx
+# CHECK: cmovbe	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x46,0xd1]
+         cmovbe	r10d, edx, ecx
+# CHECK: cmovbe	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x46,0xf9]
+         cmovbe	r11, r15, r9
+# CHECK: cmovbe	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x46,0x54,0x80,0x7b]
+         cmovbe	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovbe	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x46,0x4c,0x80,0x7b]
+         cmovbe	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovbe	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x46,0x4c,0x80,0x7b]
+         cmovbe	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovl	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4c,0xc2]
+         cmovl	r9w, ax, dx
+# CHECK: cmovl	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4c,0xd1]
+         cmovl	r10d, edx, ecx
+# CHECK: cmovl	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4c,0xf9]
+         cmovl	r11, r15, r9
+# CHECK: cmovl	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4c,0x54,0x80,0x7b]
+         cmovl	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovl	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4c,0x4c,0x80,0x7b]
+         cmovl	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovl	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4c,0x4c,0x80,0x7b]
+         cmovl	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovle	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4e,0xc2]
+         cmovle	r9w, ax, dx
+# CHECK: cmovle	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4e,0xd1]
+         cmovle	r10d, edx, ecx
+# CHECK: cmovle	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4e,0xf9]
+         cmovle	r11, r15, r9
+# CHECK: cmovle	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4e,0x54,0x80,0x7b]
+         cmovle	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovle	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4e,0x4c,0x80,0x7b]
+         cmovle	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovle	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4e,0x4c,0x80,0x7b]
+         cmovle	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovae	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x43,0xc2]
+         cmovae	r9w, ax, dx
+# CHECK: cmovae	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x43,0xd1]
+         cmovae	r10d, edx, ecx
+# CHECK: cmovae	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x43,0xf9]
+         cmovae	r11, r15, r9
+# CHECK: cmovae	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x43,0x54,0x80,0x7b]
+         cmovae	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovae	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x43,0x4c,0x80,0x7b]
+         cmovae	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovae	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x43,0x4c,0x80,0x7b]
+         cmovae	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmova	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x47,0xc2]
+         cmova	r9w, ax, dx
+# CHECK: cmova	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x47,0xd1]
+         cmova	r10d, edx, ecx
+# CHECK: cmova	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x47,0xf9]
+         cmova	r11, r15, r9
+# CHECK: cmova	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x47,0x54,0x80,0x7b]
+         cmova	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmova	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x47,0x4c,0x80,0x7b]
+         cmova	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmova	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x47,0x4c,0x80,0x7b]
+         cmova	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovge	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4d,0xc2]
+         cmovge	r9w, ax, dx
+# CHECK: cmovge	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4d,0xd1]
+         cmovge	r10d, edx, ecx
+# CHECK: cmovge	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4d,0xf9]
+         cmovge	r11, r15, r9
+# CHECK: cmovge	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4d,0x54,0x80,0x7b]
+         cmovge	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovge	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4d,0x4c,0x80,0x7b]
+         cmovge	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovge	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4d,0x4c,0x80,0x7b]
+         cmovge	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovg	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4f,0xc2]
+         cmovg	r9w, ax, dx
+# CHECK: cmovg	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4f,0xd1]
+         cmovg	r10d, edx, ecx
+# CHECK: cmovg	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4f,0xf9]
+         cmovg	r11, r15, r9
+# CHECK: cmovg	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4f,0x54,0x80,0x7b]
+         cmovg	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovg	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4f,0x4c,0x80,0x7b]
+         cmovg	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovg	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4f,0x4c,0x80,0x7b]
+         cmovg	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovno	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x41,0xc2]
+         cmovno	r9w, ax, dx
+# CHECK: cmovno	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x41,0xd1]
+         cmovno	r10d, edx, ecx
+# CHECK: cmovno	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x41,0xf9]
+         cmovno	r11, r15, r9
+# CHECK: cmovno	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x41,0x54,0x80,0x7b]
+         cmovno	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovno	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x41,0x4c,0x80,0x7b]
+         cmovno	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovno	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x41,0x4c,0x80,0x7b]
+         cmovno	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovnp	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4b,0xc2]
+         cmovnp	r9w, ax, dx
+# CHECK: cmovnp	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4b,0xd1]
+         cmovnp	r10d, edx, ecx
+# CHECK: cmovnp	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4b,0xf9]
+         cmovnp	r11, r15, r9
+# CHECK: cmovnp	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4b,0x54,0x80,0x7b]
+         cmovnp	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovnp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4b,0x4c,0x80,0x7b]
+         cmovnp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovnp	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4b,0x4c,0x80,0x7b]
+         cmovnp	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovns	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x49,0xc2]
+         cmovns	r9w, ax, dx
+# CHECK: cmovns	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x49,0xd1]
+         cmovns	r10d, edx, ecx
+# CHECK: cmovns	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x49,0xf9]
+         cmovns	r11, r15, r9
+# CHECK: cmovns	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x49,0x54,0x80,0x7b]
+         cmovns	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovns	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x49,0x4c,0x80,0x7b]
+         cmovns	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovns	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x49,0x4c,0x80,0x7b]
+         cmovns	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovne	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x45,0xc2]
+         cmovne	r9w, ax, dx
+# CHECK: cmovne	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x45,0xd1]
+         cmovne	r10d, edx, ecx
+# CHECK: cmovne	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x45,0xf9]
+         cmovne	r11, r15, r9
+# CHECK: cmovne	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x45,0x54,0x80,0x7b]
+         cmovne	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovne	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x45,0x4c,0x80,0x7b]
+         cmovne	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovne	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x45,0x4c,0x80,0x7b]
+         cmovne	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovo	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x40,0xc2]
+         cmovo	r9w, ax, dx
+# CHECK: cmovo	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x40,0xd1]
+         cmovo	r10d, edx, ecx
+# CHECK: cmovo	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x40,0xf9]
+         cmovo	r11, r15, r9
+# CHECK: cmovo	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x40,0x54,0x80,0x7b]
+         cmovo	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovo	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x40,0x4c,0x80,0x7b]
+         cmovo	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovo	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x40,0x4c,0x80,0x7b]
+         cmovo	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovp	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4a,0xc2]
+         cmovp	r9w, ax, dx
+# CHECK: cmovp	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4a,0xd1]
+         cmovp	r10d, edx, ecx
+# CHECK: cmovp	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4a,0xf9]
+         cmovp	r11, r15, r9
+# CHECK: cmovp	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4a,0x54,0x80,0x7b]
+         cmovp	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4a,0x4c,0x80,0x7b]
+         cmovp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovp	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4a,0x4c,0x80,0x7b]
+         cmovp	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovs	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x48,0xc2]
+         cmovs	r9w, ax, dx
+# CHECK: cmovs	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x48,0xd1]
+         cmovs	r10d, edx, ecx
+# CHECK: cmovs	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x48,0xf9]
+         cmovs	r11, r15, r9
+# CHECK: cmovs	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x48,0x54,0x80,0x7b]
+         cmovs	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovs	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x48,0x4c,0x80,0x7b]
+         cmovs	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovs	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x48,0x4c,0x80,0x7b]
+         cmovs	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmove	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x44,0xc2]
+         cmove	r9w, ax, dx
+# CHECK: cmove	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x44,0xd1]
+         cmove	r10d, edx, ecx
+# CHECK: cmove	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x44,0xf9]
+         cmove	r11, r15, r9
+# CHECK: cmove	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x44,0x54,0x80,0x7b]
+         cmove	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmove	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x44,0x4c,0x80,0x7b]
+         cmove	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmove	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x44,0x4c,0x80,0x7b]
+         cmove	r15, r9, qword ptr [r8 + 4*rax + 123]
diff --git a/llvm/test/MC/X86/apx/evex-format-att.s b/llvm/test/MC/X86/apx/evex-format-att.s
index 055a29fe00f32..36df3f3757dc3 100644
--- a/llvm/test/MC/X86/apx/evex-format-att.s
+++ b/llvm/test/MC/X86/apx/evex-format-att.s
@@ -10,6 +10,12 @@
 # CHECK: encoding: [0x62,0xec,0xec,0x10,0x01,0x41,0x7b]
          addq	%r16, 123(%r17), %r18
 
+## MRMDestMemCC
+
+# CHECK: cfcmovbq %r16, 123(%r17,%r18,4)
+# CHECK: encoding: [0x62,0xec,0xf8,0x0c,0x42,0x44,0x91,0x7b]
+         cfcmovbq %r16, 123(%r17,%r18,4)
+
 ## MRMSrcMem
 
 # CHECK: vbroadcasti32x4	(%r16,%r17), %zmm0
@@ -20,6 +26,16 @@
 # CHECK: encoding: [0x62,0xec,0xec,0x10,0x2b,0x48,0x7b]
          subq	123(%r16), %r17, %r18
 
+## MRMSrcMemCC
+
+# CHECK: cfcmovbq	123(%r16,%r17,4), %r18
+# CHECK: encoding: [0x62,0xec,0xf8,0x08,0x42,0x54,0x88,0x7b]
+         cfcmovbq	123(%r16,%r17,4), %r18
+
+# CHECK: cfcmovbq	123(%r16,%r17,4), %r18, %r19
+# CHECK: encoding: [0x62,0xec,0xe0,0x14,0x42,0x54,0x88,0x7b]
+         cfcmovbq	123(%r16,%r17,4), %r18, %r19
+
 ## MRM0m
 
 # CHECK: vprorq	$0, (%r16,%r17), %zmm0
@@ -122,12 +138,24 @@
 # CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x01,0xc1]
          {nf}	addq	%r16, %r17
 
+## MRMDestRegCC
+
+# CHECK: cfcmovbq	%r16, %r17
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x42,0xc1]
+         cfcmovbq	%r16, %r17
+
 ## MRMSrcReg
 
 # CHECK: mulxq	%r16, %r17, %r18
 # CHECK: encoding: [0x62,0xea,0xf7,0x00,0xf6,0xd0]
          mulxq	%r16, %r17, %r18
 
+## MRMSrcRegCC
+
+# CHECK: cfcmovbq	%r16, %r17, %r18
+# CHECK: encoding: [0x62,0xec,0xec,0x14,0x42,0xc8]
+         cfcmovbq	%r16, %r17, %r18
+
 ## MRMSrcReg4VOp3
 
 # CHECK: bzhiq	%r19, %r23, %r27
diff --git a/llvm/test/MC/X86/apx/evex-format-intel.s b/llvm/test/MC/X86/apx/evex-format-intel.s
index 06b56078aab90..2b346e0e85806 100644
--- a/llvm/test/MC/X86/apx/evex-format-intel.s
+++ b/llvm/test/MC/X86/apx/evex-format-intel.s
@@ -10,6 +10,12 @@
 # CHECK: encoding: [0x62,0xec,0xec,0x10,0x01,0x41,0x7b]
          add	r18, qword ptr [r17 + 123], r16
 
+## MRMDestMemCC
+
+# CHECK: cfcmovb	qword ptr [r17 + 4*r18 + 123], r16
+# CHECK: encoding: [0x62,0xec,0xf8,0x0c,0x42,0x44,0x91,0x7b]
+         cfcmovb	qword ptr [r17 + 4*r18 + 123], r16
+
 ## MRMSrcMem
 
 # CHECK: vbroadcasti32x4	zmm0, xmmword ptr [r16 + r17]
@@ -20,6 +26,16 @@
 # CHECK: encoding: [0x62,0xec,0xec,0x10,0x2b,0x48,0x7b]
          sub	r18, r17, qword ptr [r16 + 123]
 
+## MRMSrcMemCC
+
+# CHECK: cfcmovb	r18, qword ptr [r16 + 4*r17 + 123]
+# CHECK: encoding: [0x62,0xec,0xf8,0x08,0x42,0x54,0x88,0x7b]
+         cfcmovb	r18, qword ptr [r16 + 4*r17 + 123]
+
+# CHECK: cfcmovb	r19, r18, qword ptr [r16 + 4*r17 + 123]
+# CHECK: encoding: [0x62,0xec,0xe0,0x14,0x42,0x54,0x88,0x7b]
+         cfcmovb	r19, r18, qword ptr [r16 + 4*r17 + 123]
+
 ## MRM0m
 
 # CHECK: vprorq	zmm0, zmmword ptr [r16 + r17], 0
@@ -122,12 +138,24 @@
 # CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x01,0xc1]
          {nf}	add	r17, r16
 
+## MRMDestRegCC
+
+# CHECK: cfcmovb	r17, r16
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x42,0xc1]
+         cfcmovb	r17, r16
+
 ## MRMSrcReg
 
 # CHECK: mulx	r18, r17, r16
 # CHECK: encoding: [0x62,0xea,0xf7,0x00,0xf6,0xd0]
          mulx	r18, r17, r16
 
+## MRMSrcRegCC
+
+# CHECK: cfcmovb	r18, r17, r16
+# CHECK: encoding: [0x62,0xec,0xec,0x14,0x42,0xc8]
+         cfcmovb	r18, r17, r16
+
 ## MRMSrcReg4VOp3
 
 # CHECK: bzhi	r27, r23, r19
diff --git a/llvm/test/TableGen/MacroFusion.td b/llvm/test/TableGen/MacroFusion.td
index ce76e7f0f7fa6..b54b0506c5635 100644
--- a/llvm/test/TableGen/MacroFusion.td
+++ b/llvm/test/TableGen/MacroFusion.td
@@ -33,6 +33,8 @@ let Namespace = "Test" in {
 
 def Inst0 : TestInst<0>;
 def Inst1 : TestInst<1>;
+let isCommutable = true in
+def Inst2 : TestInst<2>;
 
 def BothFusionPredicate: BothFusionPredicateWithMCInstPredicate<CheckRegOperand<0, X0>>;
 def TestBothFusionPredicate: Fusion<"test-both-fusion-predicate", "HasBothFusionPredicate",
@@ -42,16 +44,32 @@ def TestBothFusionPredicate: Fusion<"test-both-fusion-predicate", "HasBothFusion
 def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
                              CheckOpcode<[Inst0]>,
                              CheckAll<[
-                              CheckOpcode<[Inst1]>,
-                              CheckRegOperand<0, X0>
+                               CheckOpcode<[Inst1]>,
+                               CheckRegOperand<0, X0>
                              ]>>;
 
+let IsCommutable = 1 in
+def TestCommutableFusion: SimpleFusion<"test-commutable-fusion", "HasTestCommutableFusion",
+                                       "Test Commutable Fusion",
+                                       CheckOpcode<[Inst0]>,
+                                       CheckAll<[
+                                         CheckOpcode<[Inst1]>,
+                                         CheckRegOperand<0, X0>
+                                       ]>>;
+
+def TestSingleFusion: SingleFusion<"test-single-fusion", "HasTestSingleFusion",
+                                   "Test SingleFusion",
+                                   Inst0, Inst2,
+                                   secondInstPred=CheckRegOperand<0, X0>>;
+
 // CHECK-PREDICATOR:       #ifdef GET_Test_MACRO_FUSION_PRED_DECL
 // CHECK-PREDICATOR-NEXT:  #undef GET_Test_MACRO_FUSION_PRED_DECL
 // CHECK-PREDICATOR-EMPTY:
 // CHECK-PREDICATOR-NEXT:  namespace llvm {
 // CHECK-PREDICATOR-NEXT:  bool isTestBothFusionPredicate(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);
+// CHECK-PREDICATOR-NEXT:  bool isTestCommutableFusion(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);   
 // CHECK-PREDICATOR-NEXT:  bool isTestFusion(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);
+// CHECK-PREDICATOR-NEXT:  bool isTestSingleFusion(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);
 // CHECK-PREDICATOR-NEXT:  } // end namespace llvm
 // CHECK-PREDICATOR-EMPTY:
 // CHECK-PREDICATOR-NEXT:  #endif
@@ -78,7 +96,7 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-PREDICATOR-NEXT:    }
 // CHECK-PREDICATOR-NEXT:    return true;
 // CHECK-PREDICATOR-NEXT:  }
-// CHECK-PREDICATOR-NEXT:  bool isTestFusion(
+// CHECK-PREDICATOR-NEXT:  bool isTestCommutableFusion(
 // CHECK-PREDICATOR-NEXT:      const TargetInstrInfo &TII,
 // CHECK-PREDICATOR-NEXT:      const TargetSubtargetInfo &STI,
 // CHECK-PREDICATOR-NEXT:      const MachineInstr *FirstMI,
@@ -99,14 +117,58 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-PREDICATOR-NEXT:      if (( MI->getOpcode() != Test::Inst0 ))
 // CHECK-PREDICATOR-NEXT:        return false;
 // CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!SecondMI.getOperand(0).getReg().isVirtual()) {
+// CHECK-PREDICATOR-NEXT:      if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(1).getReg()) {
+// CHECK-PREDICATOR-NEXT:        if (!SecondMI.getDesc().isCommutable())
+// CHECK-PREDICATOR-NEXT:          return false;
+// CHECK-PREDICATOR-NEXT:        unsigned SrcOpIdx1 = 1, SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+// CHECK-PREDICATOR-NEXT:        if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))
+// CHECK-PREDICATOR-NEXT:          if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())
+// CHECK-PREDICATOR-NEXT:            return false;
+// CHECK-PREDICATOR-NEXT:      }
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      Register FirstDest = FirstMI->getOperand(0).getReg();
+// CHECK-PREDICATOR-NEXT:      if (FirstDest.isVirtual() && !MRI.hasOneNonDBGUse(FirstDest))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!(FirstMI->getOperand(0).isReg() &&
+// CHECK-PREDICATOR-NEXT:          SecondMI.getOperand(1).isReg() &&
+// CHECK-PREDICATOR-NEXT:          FirstMI->getOperand(0).getReg() == SecondMI.getOperand(1).getReg())) {
+// CHECK-PREDICATOR-NEXT:      if (!SecondMI.getDesc().isCommutable())
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:      unsigned SrcOpIdx1 = 1, SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+// CHECK-PREDICATOR-NEXT:      if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))
+// CHECK-PREDICATOR-NEXT:        if (FirstMI->getOperand(0).getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())
+// CHECK-PREDICATOR-NEXT:          return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    return true;
+// CHECK-PREDICATOR-NEXT:  }
+// CHECK-PREDICATOR-NEXT:  bool isTestFusion(
+// CHECK-PREDICATOR-NEXT:      const TargetInstrInfo &TII,
+// CHECK-PREDICATOR-NEXT:      const TargetSubtargetInfo &STI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *FirstMI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr &SecondMI) {
+// CHECK-PREDICATOR-NEXT:    auto &MRI = SecondMI.getMF()->getRegInfo();
 // CHECK-PREDICATOR-NEXT:    {
 // CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = &SecondMI;
 // CHECK-PREDICATOR-NEXT:      if (!(
-// CHECK-PREDICATOR-NEXT:          MI->getOperand(0).getReg().isVirtual()
-// CHECK-PREDICATOR-NEXT:          || MI->getOperand(0).getReg() == MI->getOperand(1).getReg()
+// CHECK-PREDICATOR-NEXT:          ( MI->getOpcode() == Test::Inst1 )
+// CHECK-PREDICATOR-NEXT:          && MI->getOperand(0).getReg() == Test::X0
 // CHECK-PREDICATOR-NEXT:        ))
 // CHECK-PREDICATOR-NEXT:        return false;
 // CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!FirstMI)
+// CHECK-PREDICATOR-NEXT:      return true;
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = FirstMI;
+// CHECK-PREDICATOR-NEXT:      if (( MI->getOpcode() != Test::Inst0 ))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!SecondMI.getOperand(0).getReg().isVirtual()) {
+// CHECK-PREDICATOR-NEXT:      if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(1).getReg())
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
 // CHECK-PREDICATOR-NEXT:    {
 // CHECK-PREDICATOR-NEXT:      Register FirstDest = FirstMI->getOperand(0).getReg();
 // CHECK-PREDICATOR-NEXT:      if (FirstDest.isVirtual() && !MRI.hasOneNonDBGUse(FirstDest))
@@ -118,12 +180,66 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-PREDICATOR-NEXT:      return false;
 // CHECK-PREDICATOR-NEXT:    return true;
 // CHECK-PREDICATOR-NEXT:  }
+// CHECK-PREDICATOR-NEXT:  bool isTestSingleFusion(
+// CHECK-PREDICATOR-NEXT:      const TargetInstrInfo &TII,
+// CHECK-PREDICATOR-NEXT:      const TargetSubtargetInfo &STI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *FirstMI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr &SecondMI) {
+// CHECK-PREDICATOR-NEXT:    auto &MRI = SecondMI.getMF()->getRegInfo();
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = &SecondMI;
+// CHECK-PREDICATOR-NEXT:      if (!(
+// CHECK-PREDICATOR-NEXT:          ( MI->getOpcode() == Test::Inst2 )
+// CHECK-PREDICATOR-NEXT:          && MI->getOperand(0).getReg() == Test::X0
+// CHECK-PREDICATOR-NEXT:        ))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!FirstMI)
+// CHECK-PREDICATOR-NEXT:      return true;
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = FirstMI;
+// CHECK-PREDICATOR-NEXT:      if (!(
+// CHECK-PREDICATOR-NEXT:          ( MI->getOpcode() == Test::Inst0 )
+// CHECK-PREDICATOR-NEXT:          && true
+// CHECK-PREDICATOR-NEXT:        ))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!SecondMI.getOperand(0).getReg().isVirtual()) {
+// CHECK-PREDICATOR-NEXT:      if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(1).getReg()) {
+// CHECK-PREDICATOR-NEXT:        if (!SecondMI.getDesc().isCommutable())
+// CHECK-PREDICATOR-NEXT:          return false;
+// CHECK-PREDICATOR-NEXT:        unsigned SrcOpIdx1 = 1, SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+// CHECK-PREDICATOR-NEXT:        if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))
+// CHECK-PREDICATOR-NEXT:          if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())
+// CHECK-PREDICATOR-NEXT:            return false;
+// CHECK-PREDICATOR-NEXT:      }
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      Register FirstDest = FirstMI->getOperand(0).getReg();
+// CHECK-PREDICATOR-NEXT:      if (FirstDest.isVirtual() && !MRI.hasOneNonDBGUse(FirstDest))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!(FirstMI->getOperand(0).isReg() &&
+// CHECK-PREDICATOR-NEXT:          SecondMI.getOperand(1).isReg() &&
+// CHECK-PREDICATOR-NEXT:          FirstMI->getOperand(0).getReg() == SecondMI.getOperand(1).getReg())) {
+// CHECK-PREDICATOR-NEXT:      if (!SecondMI.getDesc().isCommutable())
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:      unsigned SrcOpIdx1 = 1, SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+// CHECK-PREDICATOR-NEXT:      if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))
+// CHECK-PREDICATOR-NEXT:        if (FirstMI->getOperand(0).getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())
+// CHECK-PREDICATOR-NEXT:          return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    return true;
+// CHECK-PREDICATOR-NEXT:  }
 // CHECK-PREDICATOR-NEXT:  } // end namespace llvm
 // CHECK-PREDICATOR-EMPTY:
 // CHECK-PREDICATOR-NEXT:  #endif
 
 // Check that we have generated target subfeature.
+// CHECK-SUBTARGET: { "test-both-fusion-predicate", "Test BothFusionPredicate", Test::TestBothFusionPredicate
+// CHECK-SUBTARGET: { "test-commutable-fusion", "Test Commutable Fusion", Test::TestCommutableFusion
 // CHECK-SUBTARGET: { "test-fusion", "Test Fusion", Test::TestFusion
+// CHECK-SUBTARGET: { "test-single-fusion", "Test SingleFusion", Test::TestSingleFusion
 
 // Check that we have generated `getMacroFusions()` function.
 // CHECK-SUBTARGET:      std::vector<MacroFusionPredTy> getMacroFusions() const override;
@@ -131,6 +247,8 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-SUBTARGET:      std::vector<MacroFusionPredTy> TestGenSubtargetInfo::getMacroFusions() const {
 // CHECK-SUBTARGET-NEXT:   std::vector<MacroFusionPredTy> Fusions;
 // CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestBothFusionPredicate)) Fusions.push_back(llvm::isTestBothFusionPredicate); 
+// CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestCommutableFusion)) Fusions.push_back(llvm::isTestCommutableFusion); 
 // CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestFusion)) Fusions.push_back(llvm::isTestFusion);
+// CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestSingleFusion)) Fusions.push_back(llvm::isTestSingleFusion);
 // CHECK-SUBTARGET-NEXT:   return Fusions;
 // CHECK-SUBTARGET-NEXT: }
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index eea4f87cae9ce..7b65e483c39d0 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -1937,8 +1937,11 @@ static const X86FoldTableEntry Table2[] = {
   {X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16},
   {X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16},
   {X86::CMOV16rr, X86::CMOV16rm, 0},
+  {X86::CMOV16rr_ND, X86::CMOV16rm_ND, 0},
   {X86::CMOV32rr, X86::CMOV32rm, 0},
+  {X86::CMOV32rr_ND, X86::CMOV32rm_ND, 0},
   {X86::CMOV64rr, X86::CMOV64rm, 0},
+  {X86::CMOV64rr_ND, X86::CMOV64rm_ND, 0},
   {X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16},
   {X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16},
   {X86::CMPSDrri, X86::CMPSDrmi, 0},
diff --git a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
index d64f8cf3e56dc..05c57052e674d 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
@@ -2641,8 +2641,8 @@ define float @assume_false_smallest_normal(float %arg) {
 }
 
 define float @clamp_false_nan(float %arg) {
-; CHECK-LABEL: define nofpclass(nan inf nzero sub norm) float @clamp_false_nan(
-; CHECK-SAME: float returned nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define float @clamp_false_nan(
+; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
   %fcmp = fcmp false float %arg, 0x7FF8000000000000
@@ -2784,12 +2784,12 @@ define float @clamp_true_smallest_normal_0.0(float %arg) {
 }
 
 define float @clamp_true_nan(float %arg) {
-; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_nan(
-; CHECK-SAME: float nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    ret float 0.000000e+00
+; CHECK-LABEL: define float @clamp_true_nan(
+; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    ret float [[ARG]]
 ;
   %fcmp = fcmp true float %arg, 0x7FF8000000000000
-  %select = select i1 %fcmp, float 0.0, float %arg
+  %select = select i1 %fcmp, float %arg, float 0.0
   ret float %select
 }
 
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/fpclass-test.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/fpclass-test.ll
new file mode 100644
index 0000000000000..63ab22e96ad2a
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/fpclass-test.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -codegenprepare -S -mtriple=aarch64 < %s | FileCheck %s
+
+define i1 @test_is_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp ueq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp one double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp oeq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp une double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define <vscale x 2 x i1> @test_vec_is_inf_or_nan(<vscale x 2 x double> %arg) {
+; CHECK-LABEL: define <vscale x 2 x i1> @test_vec_is_inf_or_nan(
+; CHECK-SAME: <vscale x 2 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2f64(<vscale x 2 x double> [[ARG]], i32 519)
+; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %arg)
+  %ret = fcmp ueq <vscale x 2 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 2 x i1> %ret
+}
+
+define <vscale x 2 x i1> @test_vec_is_not_inf_or_nan(<vscale x 2 x double> %arg) {
+; CHECK-LABEL: define <vscale x 2 x i1> @test_vec_is_not_inf_or_nan(
+; CHECK-SAME: <vscale x 2 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2f64(<vscale x 2 x double> [[ARG]], i32 504)
+; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %arg)
+  %ret = fcmp one <vscale x 2 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 2 x i1> %ret
+}
+
+define <vscale x 2 x i1> @test_vec_is_inf(<vscale x 2 x double> %arg) {
+; CHECK-LABEL: define <vscale x 2 x i1> @test_vec_is_inf(
+; CHECK-SAME: <vscale x 2 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2f64(<vscale x 2 x double> [[ARG]], i32 516)
+; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %arg)
+  %ret = fcmp oeq <vscale x 2 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 2 x i1> %ret
+}
+
+define <vscale x 2 x i1> @test_vec_is_not_inf(<vscale x 2 x double> %arg) {
+; CHECK-LABEL: define <vscale x 2 x i1> @test_vec_is_not_inf(
+; CHECK-SAME: <vscale x 2 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2f64(<vscale x 2 x double> [[ARG]], i32 507)
+; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %arg)
+  %ret = fcmp une <vscale x 2 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 2 x i1> %ret
+}
+
+define i1 @test_fp128_is_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp ueq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp one fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp une fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/fpclass-test.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/fpclass-test.ll
new file mode 100644
index 0000000000000..7c00218bdcce3
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/fpclass-test.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -codegenprepare -S -mtriple=riscv64 < %s | FileCheck %s
+
+define i1 @test_is_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp ueq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp one double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp oeq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp une double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define <vscale x 4 x i1> @test_vec_is_inf_or_nan(<vscale x 4 x double> %arg) {
+; CHECK-LABEL: define <vscale x 4 x i1> @test_vec_is_inf_or_nan(
+; CHECK-SAME: <vscale x 4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.is.fpclass.nxv4f64(<vscale x 4 x double> [[ARG]], i32 519)
+; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 4 x double> @llvm.fabs.nxv4f64(<vscale x 4 x double> %arg)
+  %ret = fcmp ueq <vscale x 4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 4 x i1> %ret
+}
+
+define <vscale x 4 x i1> @test_vec_is_not_inf_or_nan(<vscale x 4 x double> %arg) {
+; CHECK-LABEL: define <vscale x 4 x i1> @test_vec_is_not_inf_or_nan(
+; CHECK-SAME: <vscale x 4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.is.fpclass.nxv4f64(<vscale x 4 x double> [[ARG]], i32 504)
+; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 4 x double> @llvm.fabs.nxv4f64(<vscale x 4 x double> %arg)
+  %ret = fcmp one <vscale x 4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 4 x i1> %ret
+}
+
+define <vscale x 4 x i1> @test_vec_is_inf(<vscale x 4 x double> %arg) {
+; CHECK-LABEL: define <vscale x 4 x i1> @test_vec_is_inf(
+; CHECK-SAME: <vscale x 4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.is.fpclass.nxv4f64(<vscale x 4 x double> [[ARG]], i32 516)
+; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 4 x double> @llvm.fabs.nxv4f64(<vscale x 4 x double> %arg)
+  %ret = fcmp oeq <vscale x 4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 4 x i1> %ret
+}
+
+define <vscale x 4 x i1> @test_vec_is_not_inf(<vscale x 4 x double> %arg) {
+; CHECK-LABEL: define <vscale x 4 x i1> @test_vec_is_not_inf(
+; CHECK-SAME: <vscale x 4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.is.fpclass.nxv4f64(<vscale x 4 x double> [[ARG]], i32 507)
+; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 4 x double> @llvm.fabs.nxv4f64(<vscale x 4 x double> %arg)
+  %ret = fcmp une <vscale x 4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 4 x i1> %ret
+}
+
+define i1 @test_fp128_is_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp ueq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp one fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp une fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fpclass-test.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fpclass-test.ll
new file mode 100644
index 0000000000000..525caeb3e79a1
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fpclass-test.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -codegenprepare -S -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+define i1 @test_is_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp ueq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp one double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp oeq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp une double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define <4 x i1> @test_vec_is_inf_or_nan(<4 x double> %arg) {
+; CHECK-LABEL: define <4 x i1> @test_vec_is_inf_or_nan(
+; CHECK-SAME: <4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> [[ARG]], i32 519)
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %abs = tail call <4 x double> @llvm.fabs.v4f64(<4 x double> %arg)
+  %ret = fcmp ueq <4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @test_vec_is_not_inf_or_nan(<4 x double> %arg) {
+; CHECK-LABEL: define <4 x i1> @test_vec_is_not_inf_or_nan(
+; CHECK-SAME: <4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> [[ARG]], i32 504)
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %abs = tail call <4 x double> @llvm.fabs.v4f64(<4 x double> %arg)
+  %ret = fcmp one <4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @test_vec_is_inf(<4 x double> %arg) {
+; CHECK-LABEL: define <4 x i1> @test_vec_is_inf(
+; CHECK-SAME: <4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> [[ARG]], i32 516)
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %abs = tail call <4 x double> @llvm.fabs.v4f64(<4 x double> %arg)
+  %ret = fcmp oeq <4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @test_vec_is_not_inf(<4 x double> %arg) {
+; CHECK-LABEL: define <4 x i1> @test_vec_is_not_inf(
+; CHECK-SAME: <4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> [[ARG]], i32 507)
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %abs = tail call <4 x double> @llvm.fabs.v4f64(<4 x double> %arg)
+  %ret = fcmp une <4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <4 x i1> %ret
+}
+
+define i1 @test_fp128_is_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp ueq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp one fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp une fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_x86_fp80_is_inf_or_nan(x86_fp80 %arg) {
+; CHECK-LABEL: define i1 @test_x86_fp80_is_inf_or_nan(
+; CHECK-SAME: x86_fp80 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call x86_fp80 @llvm.fabs.f80(x86_fp80 %arg)
+  %ret = fcmp ueq x86_fp80 %abs, 0xK7FFF8000000000000000
+  ret i1 %ret
+}
+
+define i1 @test_x86_fp80_is_not_inf_or_nan(x86_fp80 %arg) {
+; CHECK-LABEL: define i1 @test_x86_fp80_is_not_inf_or_nan(
+; CHECK-SAME: x86_fp80 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call x86_fp80 @llvm.fabs.f80(x86_fp80 %arg)
+  %ret = fcmp one x86_fp80 %abs, 0xK7FFF8000000000000000
+  ret i1 %ret
+}
+
+define i1 @test_x86_fp80_is_inf(x86_fp80 %arg) {
+; CHECK-LABEL: define i1 @test_x86_fp80_is_inf(
+; CHECK-SAME: x86_fp80 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call x86_fp80 @llvm.fabs.f80(x86_fp80 %arg)
+  %ret = fcmp oeq x86_fp80 %abs, 0xK7FFF8000000000000000
+  ret i1 %ret
+}
+
+define i1 @test_x86_fp80_is_not_inf(x86_fp80 %arg) {
+; CHECK-LABEL: define i1 @test_x86_fp80_is_not_inf(
+; CHECK-SAME: x86_fp80 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call x86_fp80 @llvm.fabs.f80(x86_fp80 %arg)
+  %ret = fcmp une x86_fp80 %abs, 0xK7FFF8000000000000000
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/ConstantHoisting/AArch64/large-immediate.ll b/llvm/test/Transforms/ConstantHoisting/AArch64/large-immediate.ll
index 196a104adc023..6d8890d71e2be 100644
--- a/llvm/test/Transforms/ConstantHoisting/AArch64/large-immediate.ll
+++ b/llvm/test/Transforms/ConstantHoisting/AArch64/large-immediate.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -mtriple=arm64-darwin-unknown -S -passes=consthoist < %s | FileCheck %s
+; RUN: opt -mtriple=arm64-darwin-unknown -S -passes=consthoist < %s | FileCheck %s --check-prefixes=CHECK,CV
+; RUN: opt -mtriple=arm64-darwin-unknown -S -passes=consthoist -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat < %s | FileCheck %s --check-prefixes=CHECK,CI
 
 define i128 @test1(i128 %a) {
 ; CHECK-LABEL: define i128 @test1(
@@ -122,13 +123,37 @@ define i64 @sdiv_minsize(i64 %a) minsize {
 }
 
 define <2 x i64> @sdiv_v2i64(<2 x i64> %a) {
-; CHECK-LABEL: define <2 x i64> @sdiv_v2i64(
-; CHECK-SAME: <2 x i64> [[A:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <2 x i64> [[A]], <i64 4294967087, i64 4294967087>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP1]], <i64 4294967087, i64 4294967087>
-; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+; CV-LABEL: define <2 x i64> @sdiv_v2i64(
+; CV-SAME: <2 x i64> [[A:%.*]]) {
+; CV-NEXT:    [[TMP1:%.*]] = sdiv <2 x i64> [[A]], <i64 4294967087, i64 4294967087>
+; CV-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP1]], <i64 4294967087, i64 4294967087>
+; CV-NEXT:    ret <2 x i64> [[TMP2]]
+;
+; CI-LABEL: define <2 x i64> @sdiv_v2i64(
+; CI-SAME: <2 x i64> [[A:%.*]]) {
+; CI-NEXT:    [[TMP1:%.*]] = sdiv <2 x i64> [[A]], splat (i64 4294967087)
+; CI-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP1]], splat (i64 4294967087)
+; CI-NEXT:    ret <2 x i64> [[TMP2]]
 ;
   %1 = sdiv <2 x i64> %a, <i64 4294967087, i64 4294967087>
   %2 = add <2 x i64> %1, <i64 4294967087, i64 4294967087>
   ret <2 x i64> %2
 }
+
+define <vscale x 2 x i64> @sdiv_nxv2i64(<vscale x 2 x i64> %a) {
+; CV-LABEL: define <vscale x 2 x i64> @sdiv_nxv2i64(
+; CV-SAME: <vscale x 2 x i64> [[A:%.*]]) {
+; CV-NEXT:    [[TMP1:%.*]] = sdiv <vscale x 2 x i64> [[A]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4294967087, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CV-NEXT:    [[TMP2:%.*]] = add <vscale x 2 x i64> [[TMP1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4294967087, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CV-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+; CI-LABEL: define <vscale x 2 x i64> @sdiv_nxv2i64(
+; CI-SAME: <vscale x 2 x i64> [[A:%.*]]) {
+; CI-NEXT:    [[TMP1:%.*]] = sdiv <vscale x 2 x i64> [[A]], splat (i64 4294967087)
+; CI-NEXT:    [[TMP2:%.*]] = add <vscale x 2 x i64> [[TMP1]], splat (i64 4294967087)
+; CI-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+  %1 = sdiv <vscale x 2 x i64> %a, splat (i64 4294967087)
+  %2 = add <vscale x 2 x i64> %1, splat (i64 4294967087)
+  ret <vscale x 2 x i64> %2
+}
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
index 9c2b1ece1624b..d9dba92ec4eb7 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
@@ -88,7 +88,6 @@ exit:
   ret void
 }
 
-; FIXME: The fakeresume1 here should be marked as musttail.
 ; Verify that in the resume part resume call is marked with musttail.
 ; CHECK-LABEL: @f.resume(
 ; CHECK:      musttail call fastcc void @fakeresume1(ptr align 8 null)
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
index 860032bd3cf8e..2257d5aee473e 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
@@ -88,7 +88,6 @@ exit:
   ret void
 }
 
-; FIXME: The fakeresume1 here should be marked as musttail.
 ; Verify that in the resume part resume call is marked with musttail.
 ; CHECK-LABEL: @f.resume(
 ; CHECK:         musttail call fastcc void @fakeresume1(ptr align 8 null)
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
index df725b9a7fa47..696bd55d2dfdd 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=dfa-jump-threading %s | FileCheck %s
+; RUN: opt -S -passes=dfa-jump-threading -dfa-early-exit-heuristic=false %s | FileCheck %s
 
 ; These tests check if selects are unfolded properly for jump threading
 ; opportunities. There are three different patterns to consider:
diff --git a/llvm/test/Transforms/DFAJumpThreading/unpredictable-heuristic.ll b/llvm/test/Transforms/DFAJumpThreading/unpredictable-heuristic.ll
new file mode 100644
index 0000000000000..9743f0acc8165
--- /dev/null
+++ b/llvm/test/Transforms/DFAJumpThreading/unpredictable-heuristic.ll
@@ -0,0 +1,124 @@
+; REQUIRES: asserts
+; RUN: opt -S -passes=dfa-jump-threading %s -debug-only=dfa-jump-threading 2>&1 | FileCheck %s
+
+; CHECK-COUNT-3: Exiting early due to unpredictability heuristic.
+
+@.str.1 = private unnamed_addr constant [3 x i8] c"10\00", align 1
+@.str.2 = private unnamed_addr constant [3 x i8] c"30\00", align 1
+@.str.3 = private unnamed_addr constant [3 x i8] c"20\00", align 1
+@.str.4 = private unnamed_addr constant [3 x i8] c"40\00", align 1
+
+define void @test1(i32 noundef %num, i32 noundef %num2) {
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %entry, %sw.epilog
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %num.addr.1, %sw.epilog ]
+  switch i32 %num.addr.0, label %sw.default [
+    i32 10, label %sw.bb
+    i32 30, label %sw.bb1
+    i32 20, label %sw.bb2
+    i32 40, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %while.body
+  %call.i = tail call i32 @bar(ptr noundef nonnull @.str.1)
+  br label %sw.epilog
+
+sw.bb1:                                           ; preds = %while.body
+  %call.i4 = tail call i32 @bar(ptr noundef nonnull @.str.2)
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %while.body
+  %call.i5 = tail call i32 @bar(ptr noundef nonnull @.str.3)
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %while.body
+  %call.i6 = tail call i32 @bar(ptr noundef nonnull @.str.4)
+  %call = tail call noundef i32 @foo()
+  %add = add nsw i32 %call, %num2
+  br label %sw.epilog
+
+sw.default:                                       ; preds = %while.body
+  ret void
+
+sw.epilog:                                        ; preds = %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb
+  %num.addr.1 = phi i32 [ %add, %sw.bb3 ], [ 40, %sw.bb2 ], [ 20, %sw.bb1 ], [ 30, %sw.bb ]
+  br label %while.body
+}
+
+
+define void @test2(i32 noundef %num, i32 noundef %num2) {
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %entry, %sw.epilog
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %num.addr.1, %sw.epilog ]
+  switch i32 %num.addr.0, label %sw.default [
+    i32 10, label %sw.epilog
+    i32 30, label %sw.bb1
+    i32 20, label %sw.bb2
+    i32 40, label %sw.bb3
+  ]
+
+sw.bb1:                                           ; preds = %while.body
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %while.body
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %while.body
+  br label %sw.epilog
+
+sw.default:                                       ; preds = %while.body
+  ret void
+
+sw.epilog:                                        ; preds = %while.body, %sw.bb3, %sw.bb2, %sw.bb1
+  %.str.4.sink = phi ptr [ @.str.4, %sw.bb3 ], [ @.str.3, %sw.bb2 ], [ @.str.2, %sw.bb1 ], [ @.str.1, %while.body ]
+  %num.addr.1 = phi i32 [ %num2, %sw.bb3 ], [ 40, %sw.bb2 ], [ 20, %sw.bb1 ], [ 30, %while.body ]
+  %call.i6 = tail call i32 @bar(ptr noundef nonnull %.str.4.sink)
+  br label %while.body
+}
+
+
+define void @test3(i32 noundef %num, i32 noundef %num2) {
+entry:
+  %add = add nsw i32 %num2, 40
+  br label %while.body
+
+while.body:                                       ; preds = %entry, %sw.epilog
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %num.addr.1, %sw.epilog ]
+  switch i32 %num.addr.0, label %sw.default [
+    i32 10, label %sw.bb
+    i32 30, label %sw.bb1
+    i32 20, label %sw.bb2
+    i32 40, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %while.body
+  %call.i = tail call i32 @bar(ptr noundef nonnull @.str.1)
+  br label %sw.epilog
+
+sw.bb1:                                           ; preds = %while.body
+  %call.i5 = tail call i32 @bar(ptr noundef nonnull @.str.2)
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %while.body
+  %call.i6 = tail call i32 @bar(ptr noundef nonnull @.str.3)
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %while.body
+  %call.i7 = tail call i32 @bar(ptr noundef nonnull @.str.4)
+  br label %sw.epilog
+
+sw.default:                                       ; preds = %while.body
+  ret void
+
+sw.epilog:                                        ; preds = %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb
+  %num.addr.1 = phi i32 [ %add, %sw.bb3 ], [ 40, %sw.bb2 ], [ 20, %sw.bb1 ], [ 30, %sw.bb ]
+  br label %while.body
+}
+
+
+declare noundef i32 @foo()
+declare noundef i32 @bar(ptr nocapture noundef readonly)
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
index 3961fec5b3be1..76f5248302bad 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
@@ -15,12 +15,12 @@ define half @si129tohalf(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
@@ -100,12 +100,12 @@ define float @si129tofloat(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
@@ -184,12 +184,12 @@ define double @si129todouble(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 53
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 53
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 54, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 55, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 54, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 55, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
@@ -273,12 +273,12 @@ define x86_fp80 @si129tox86_fp80(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP7]], 113
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i129 115, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
@@ -357,12 +357,12 @@ define fp128 @si129tofp128(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP7]], 113
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i129 115, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
index e05ff198ecc33..96d87a5cace98 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
@@ -15,12 +15,12 @@ define half @ui129tohalf(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
@@ -100,12 +100,12 @@ define float @ui129tofloat(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
@@ -184,12 +184,12 @@ define double @ui129todouble(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 53
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 53
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 54, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 55, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 54, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 55, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
@@ -273,12 +273,12 @@ define x86_fp80 @ui129tox86_fp80(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP7]], 113
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i129 115, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
@@ -357,12 +357,12 @@ define fp128 @ui129tofp128(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP7]], 113
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i129 115, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
diff --git a/llvm/test/Transforms/Float2Int/basic.ll b/llvm/test/Transforms/Float2Int/basic.ll
index a454b773f4eba..2854a83179b7e 100644
--- a/llvm/test/Transforms/Float2Int/basic.ll
+++ b/llvm/test/Transforms/Float2Int/basic.ll
@@ -1,29 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=float2int -S | FileCheck %s -check-prefixes=CHECK,NONE
-; RUN: opt < %s -passes=float2int -S --data-layout="n64" | FileCheck %s -check-prefixes=CHECK,ONLY64
-; RUN: opt < %s -passes=float2int -S --data-layout="n8:16:32:64"| FileCheck %s -check-prefixes=CHECK,MULTIPLE
+; RUN: opt < %s -passes='float2int' -S | FileCheck %s
 
 ;
 ; Positive tests
 ;
 
 define i16 @simple1(i8 %a) {
-; NONE-LABEL: @simple1(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 1
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; NONE-NEXT:    ret i16 [[TMP2]]
-;
-; ONLY64-LABEL: @simple1(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = add i64 [[TMP1]], 1
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i16
-; ONLY64-NEXT:    ret i16 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple1(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = add i16 [[TMP1]], 1
-; MULTIPLE-NEXT:    ret i16 [[T21]]
+; CHECK-LABEL: @simple1(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fadd float %t1, 1.0
@@ -32,23 +19,11 @@ define i16 @simple1(i8 %a) {
 }
 
 define i8 @simple2(i8 %a) {
-; NONE-LABEL: @simple2(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i8
-; NONE-NEXT:    ret i8 [[TMP2]]
-;
-; ONLY64-LABEL: @simple2(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = sub i64 [[TMP1]], 1
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i8
-; ONLY64-NEXT:    ret i8 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple2(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = sub i16 [[TMP1]], 1
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = trunc i16 [[T21]] to i8
-; MULTIPLE-NEXT:    ret i8 [[TMP2]]
+; CHECK-LABEL: @simple2(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fsub float %t1, 1.0
@@ -57,22 +32,10 @@ define i8 @simple2(i8 %a) {
 }
 
 define i32 @simple3(i8 %a) {
-; NONE-LABEL: @simple3(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
-; NONE-NEXT:    ret i32 [[T21]]
-;
-; ONLY64-LABEL: @simple3(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = sub i64 [[TMP1]], 1
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i32
-; ONLY64-NEXT:    ret i32 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple3(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = sub i16 [[TMP1]], 1
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i16 [[T21]] to i32
-; MULTIPLE-NEXT:    ret i32 [[TMP2]]
+; CHECK-LABEL: @simple3(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
+; CHECK-NEXT:    ret i32 [[T21]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fsub float %t1, 1.0
@@ -81,23 +44,11 @@ define i32 @simple3(i8 %a) {
 }
 
 define i1 @cmp(i8 %a, i8 %b) {
-; NONE-LABEL: @cmp(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; NONE-NEXT:    [[T31:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
-; NONE-NEXT:    ret i1 [[T31]]
-;
-; ONLY64-LABEL: @cmp(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
-; ONLY64-NEXT:    [[T31:%.*]] = icmp slt i64 [[TMP1]], [[TMP2]]
-; ONLY64-NEXT:    ret i1 [[T31]]
-;
-; MULTIPLE-LABEL: @cmp(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
-; MULTIPLE-NEXT:    [[T31:%.*]] = icmp slt i16 [[TMP1]], [[TMP2]]
-; MULTIPLE-NEXT:    ret i1 [[T31]]
+; CHECK-LABEL: @cmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT:    [[T31:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[T31]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = uitofp i8 %b to float
@@ -119,27 +70,12 @@ define i32 @simple4(i32 %a) {
 }
 
 define i32 @simple5(i8 %a, i8 %b) {
-; NONE-LABEL: @simple5(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; NONE-NEXT:    [[T31:%.*]] = add i32 [[TMP1]], 1
-; NONE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; NONE-NEXT:    ret i32 [[T42]]
-;
-; ONLY64-LABEL: @simple5(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
-; ONLY64-NEXT:    [[T31:%.*]] = add i64 [[TMP1]], 1
-; ONLY64-NEXT:    [[T42:%.*]] = mul i64 [[T31]], [[TMP2]]
-; ONLY64-NEXT:    [[TMP3:%.*]] = trunc i64 [[T42]] to i32
-; ONLY64-NEXT:    ret i32 [[TMP3]]
-;
-; MULTIPLE-LABEL: @simple5(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; MULTIPLE-NEXT:    [[T31:%.*]] = add i32 [[TMP1]], 1
-; MULTIPLE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; MULTIPLE-NEXT:    ret i32 [[T42]]
+; CHECK-LABEL: @simple5(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT:    [[T31:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[T42]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = uitofp i8 %b to float
@@ -150,27 +86,12 @@ define i32 @simple5(i8 %a, i8 %b) {
 }
 
 define i32 @simple6(i8 %a, i8 %b) {
-; NONE-LABEL: @simple6(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; NONE-NEXT:    [[T31:%.*]] = sub i32 0, [[TMP1]]
-; NONE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; NONE-NEXT:    ret i32 [[T42]]
-;
-; ONLY64-LABEL: @simple6(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
-; ONLY64-NEXT:    [[T31:%.*]] = sub i64 0, [[TMP1]]
-; ONLY64-NEXT:    [[T42:%.*]] = mul i64 [[T31]], [[TMP2]]
-; ONLY64-NEXT:    [[TMP3:%.*]] = trunc i64 [[T42]] to i32
-; ONLY64-NEXT:    ret i32 [[TMP3]]
-;
-; MULTIPLE-LABEL: @simple6(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; MULTIPLE-NEXT:    [[T31:%.*]] = sub i32 0, [[TMP1]]
-; MULTIPLE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; MULTIPLE-NEXT:    ret i32 [[T42]]
+; CHECK-LABEL: @simple6(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT:    [[T31:%.*]] = sub i32 0, [[TMP1]]
+; CHECK-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[T42]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = uitofp i8 %b to float
@@ -184,37 +105,15 @@ define i32 @simple6(i8 %a, i8 %b) {
 ; cause failure of the other.
 
 define i32 @multi1(i8 %a, i8 %b, i8 %c, float %d) {
-; NONE-LABEL: @multi1(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; NONE-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
-; NONE-NEXT:    [[X1:%.*]] = add i32 [[TMP1]], [[TMP2]]
-; NONE-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
-; NONE-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
-; NONE-NEXT:    [[R:%.*]] = add i32 [[X1]], [[W]]
-; NONE-NEXT:    ret i32 [[R]]
-;
-; ONLY64-LABEL: @multi1(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
-; ONLY64-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
-; ONLY64-NEXT:    [[X1:%.*]] = add i64 [[TMP1]], [[TMP2]]
-; ONLY64-NEXT:    [[TMP3:%.*]] = trunc i64 [[X1]] to i32
-; ONLY64-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
-; ONLY64-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
-; ONLY64-NEXT:    [[R:%.*]] = add i32 [[TMP3]], [[W]]
-; ONLY64-NEXT:    ret i32 [[R]]
-;
-; MULTIPLE-LABEL: @multi1(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
-; MULTIPLE-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
-; MULTIPLE-NEXT:    [[X1:%.*]] = add i16 [[TMP1]], [[TMP2]]
-; MULTIPLE-NEXT:    [[TMP3:%.*]] = zext i16 [[X1]] to i32
-; MULTIPLE-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
-; MULTIPLE-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
-; MULTIPLE-NEXT:    [[R:%.*]] = add i32 [[TMP3]], [[W]]
-; MULTIPLE-NEXT:    ret i32 [[R]]
+; CHECK-LABEL: @multi1(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
+; CHECK-NEXT:    [[X1:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
+; CHECK-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[X1]], [[W]]
+; CHECK-NEXT:    ret i32 [[R]]
 ;
   %fa = uitofp i8 %a to float
   %fb = uitofp i8 %b to float
@@ -228,22 +127,11 @@ define i32 @multi1(i8 %a, i8 %b, i8 %c, float %d) {
 }
 
 define i16 @simple_negzero(i8 %a) {
-; NONE-LABEL: @simple_negzero(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 0
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; NONE-NEXT:    ret i16 [[TMP2]]
-;
-; ONLY64-LABEL: @simple_negzero(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = add i64 [[TMP1]], 0
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i16
-; ONLY64-NEXT:    ret i16 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple_negzero(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = add i16 [[TMP1]], 0
-; MULTIPLE-NEXT:    ret i16 [[T21]]
+; CHECK-LABEL: @simple_negzero(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fadd fast float %t1, -0.0
@@ -252,26 +140,12 @@ define i16 @simple_negzero(i8 %a) {
 }
 
 define i32 @simple_negative(i8 %call) {
-; NONE-LABEL: @simple_negative(
-; NONE-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i32
-; NONE-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP1]], -3
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[MUL1]] to i8
-; NONE-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
-; NONE-NEXT:    ret i32 [[CONV3]]
-;
-; ONLY64-LABEL: @simple_negative(
-; ONLY64-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i64
-; ONLY64-NEXT:    [[MUL1:%.*]] = mul i64 [[TMP1]], -3
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[MUL1]] to i8
-; ONLY64-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
-; ONLY64-NEXT:    ret i32 [[CONV3]]
-;
-; MULTIPLE-LABEL: @simple_negative(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i16
-; MULTIPLE-NEXT:    [[MUL1:%.*]] = mul i16 [[TMP1]], -3
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = trunc i16 [[MUL1]] to i8
-; MULTIPLE-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
-; MULTIPLE-NEXT:    ret i32 [[CONV3]]
+; CHECK-LABEL: @simple_negative(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i32
+; CHECK-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP1]], -3
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[MUL1]] to i8
+; CHECK-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[CONV3]]
 ;
   %conv1 = sitofp i8 %call to float
   %mul = fmul float %conv1, -3.000000e+00
@@ -281,22 +155,11 @@ define i32 @simple_negative(i8 %call) {
 }
 
 define i16 @simple_fneg(i8 %a) {
-; NONE-LABEL: @simple_fneg(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = sub i32 0, [[TMP1]]
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; NONE-NEXT:    ret i16 [[TMP2]]
-;
-; ONLY64-LABEL: @simple_fneg(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = sub i64 0, [[TMP1]]
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i16
-; ONLY64-NEXT:    ret i16 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple_fneg(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = sub i16 0, [[TMP1]]
-; MULTIPLE-NEXT:    ret i16 [[T21]]
+; CHECK-LABEL: @simple_fneg(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = sub i32 0, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fneg fast float %t1
diff --git a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
index 0c055981260b2..55013aa96551d 100644
--- a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
+++ b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
@@ -2,9 +2,9 @@
 
 ; CHECK-LABEL: define {{.*}}@fun
 ; CHECK: call {{.*}}@fun.cold.1(
-; CHECK-NEXT: ret void
+; CHECK-NEXT: unreachable
 ; CHECK: call {{.*}}@fun.cold.2(
-; CHECK-NEXT: ret void
+; CHECK-NEXT: unreachable
 define void @fun() {
 entry:
   br i1 undef, label %A.then, label %A.else
diff --git a/llvm/test/Transforms/IRCE/compound-loop-bound.ll b/llvm/test/Transforms/IRCE/compound-loop-bound.ll
index 0930d19e22154..e50d8c6127f40 100644
--- a/llvm/test/Transforms/IRCE/compound-loop-bound.ll
+++ b/llvm/test/Transforms/IRCE/compound-loop-bound.ll
@@ -16,23 +16,56 @@ define void @incrementing_loop(ptr %arr, ptr %len_ptr, i32 %K, i32 %M) {
 ; CHECK-NEXT:    br i1 [[AND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK:       preheader:
 ; CHECK-NEXT:    [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[K]], i32 [[M]])
+; CHECK-NEXT:    [[SMIN1:%.*]] = call i32 @llvm.smin.i32(i32 [[LEN]], i32 [[M]])
+; CHECK-NEXT:    [[SMIN2:%.*]] = call i32 @llvm.smin.i32(i32 [[SMIN1]], i32 [[K]])
+; CHECK-NEXT:    [[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN2]], i32 0)
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt i32 0, [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[LOOP_PREHEADER:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       loop.preheader:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[IDX_NEXT:%.*]], [[IN_BOUNDS:%.*]] ]
-; CHECK-NEXT:    [[IDX_NEXT]] = add i32 [[IDX]], 1
+; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_NEXT:%.*]], [[IN_BOUNDS:%.*]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[IDX_NEXT]] = add nsw i32 [[IDX]], 1
 ; CHECK-NEXT:    [[GUARD:%.*]] = icmp slt i32 [[IDX]], [[LEN]]
-; CHECK-NEXT:    br i1 [[GUARD]], label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS:%.*]]
+; CHECK-NEXT:    br i1 true, label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS_LOOPEXIT3:%.*]]
 ; CHECK:       in.bounds:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX]]
 ; CHECK-NEXT:    store i32 0, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    [[NEXT:%.*]] = icmp slt i32 [[IDX_NEXT]], [[SMIN]]
-; CHECK-NEXT:    br i1 [[NEXT]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[IDX_NEXT]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[LOOP]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[IDX_NEXT_LCSSA:%.*]] = phi i32 [ [[IDX_NEXT]], [[IN_BOUNDS]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[IDX_NEXT_LCSSA]], [[SMIN]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[MAIN_PSEUDO_EXIT]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[IDX_COPY:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[IDX_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[IDX_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       out.of.bounds.loopexit:
+; CHECK-NEXT:    br label [[OUT_OF_BOUNDS:%.*]]
+; CHECK:       out.of.bounds.loopexit3:
+; CHECK-NEXT:    br label [[OUT_OF_BOUNDS]]
 ; CHECK:       out.of.bounds:
 ; CHECK-NEXT:    ret void
+; CHECK:       exit.loopexit.loopexit:
+; CHECK-NEXT:    br label [[EXIT_LOOPEXIT]]
 ; CHECK:       exit.loopexit:
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[LOOP_POSTLOOP:%.*]]
+; CHECK:       loop.postloop:
+; CHECK-NEXT:    [[IDX_POSTLOOP:%.*]] = phi i32 [ [[IDX_COPY]], [[POSTLOOP]] ], [ [[IDX_NEXT_POSTLOOP:%.*]], [[IN_BOUNDS_POSTLOOP:%.*]] ]
+; CHECK-NEXT:    [[IDX_NEXT_POSTLOOP]] = add i32 [[IDX_POSTLOOP]], 1
+; CHECK-NEXT:    [[GUARD_POSTLOOP:%.*]] = icmp slt i32 [[IDX_POSTLOOP]], [[LEN]]
+; CHECK-NEXT:    br i1 [[GUARD_POSTLOOP]], label [[IN_BOUNDS_POSTLOOP]], label [[OUT_OF_BOUNDS_LOOPEXIT:%.*]]
+; CHECK:       in.bounds.postloop:
+; CHECK-NEXT:    [[ADDR_POSTLOOP:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX_POSTLOOP]]
+; CHECK-NEXT:    store i32 0, ptr [[ADDR_POSTLOOP]], align 4
+; CHECK-NEXT:    [[NEXT_POSTLOOP:%.*]] = icmp slt i32 [[IDX_NEXT_POSTLOOP]], [[SMIN]]
+; CHECK-NEXT:    br i1 [[NEXT_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP1:![0-9]+]], !loop_constrainer.loop.clone !6
 ;
 entry:
   %len = load i32, ptr %len_ptr, !range !0
@@ -78,24 +111,58 @@ define void @decrementing_loop(ptr %arr, ptr %len_ptr, i32 %K, i32 %M) {
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CHECK0]], [[CHECK1]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK:       preheader:
-; CHECK-NEXT:    [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[K]], i32 [[M]])
+; CHECK-NEXT:    [[INDVAR_START:%.*]] = call i32 @llvm.smin.i32(i32 [[K]], i32 [[M]])
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDVAR_START]], 1
+; CHECK-NEXT:    [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[LEN]], i32 [[TMP0]])
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN]], i32 0)
+; CHECK-NEXT:    [[EXIT_PRELOOP_AT:%.*]] = add nsw i32 [[SMAX]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[INDVAR_START]], [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[LOOP_PRELOOP_PREHEADER:%.*]], label [[PRELOOP_PSEUDO_EXIT:%.*]]
+; CHECK:       loop.preloop.preheader:
+; CHECK-NEXT:    br label [[LOOP_PRELOOP:%.*]]
+; CHECK:       mainloop:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[SMIN]], [[PREHEADER]] ], [ [[IDX_DEC:%.*]], [[IN_BOUNDS:%.*]] ]
-; CHECK-NEXT:    [[IDX_DEC]] = sub i32 [[IDX]], 1
+; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_PRELOOP_COPY:%.*]], [[MAINLOOP:%.*]] ], [ [[IDX_DEC:%.*]], [[IN_BOUNDS:%.*]] ]
+; CHECK-NEXT:    [[IDX_DEC]] = sub nsw i32 [[IDX]], 1
 ; CHECK-NEXT:    [[GUARD:%.*]] = icmp slt i32 [[IDX]], [[LEN]]
-; CHECK-NEXT:    br i1 [[GUARD]], label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS:%.*]]
+; CHECK-NEXT:    br i1 true, label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS_LOOPEXIT1:%.*]]
 ; CHECK:       in.bounds:
 ; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX]]
 ; CHECK-NEXT:    store i32 0, ptr [[ADDR]], align 4
 ; CHECK-NEXT:    [[NEXT:%.*]] = icmp sgt i32 [[IDX_DEC]], -1
-; CHECK-NEXT:    br i1 [[NEXT]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK-NEXT:    br i1 [[NEXT]], label [[LOOP]], label [[EXIT_LOOPEXIT_LOOPEXIT:%.*]]
+; CHECK:       out.of.bounds.loopexit:
+; CHECK-NEXT:    br label [[OUT_OF_BOUNDS:%.*]]
+; CHECK:       out.of.bounds.loopexit1:
+; CHECK-NEXT:    br label [[OUT_OF_BOUNDS]]
 ; CHECK:       out.of.bounds:
 ; CHECK-NEXT:    ret void
+; CHECK:       exit.loopexit.loopexit:
+; CHECK-NEXT:    br label [[EXIT_LOOPEXIT:%.*]]
 ; CHECK:       exit.loopexit:
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
+; CHECK:       loop.preloop:
+; CHECK-NEXT:    [[IDX_PRELOOP:%.*]] = phi i32 [ [[IDX_DEC_PRELOOP:%.*]], [[IN_BOUNDS_PRELOOP:%.*]] ], [ [[INDVAR_START]], [[LOOP_PRELOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[IDX_DEC_PRELOOP]] = sub i32 [[IDX_PRELOOP]], 1
+; CHECK-NEXT:    [[GUARD_PRELOOP:%.*]] = icmp slt i32 [[IDX_PRELOOP]], [[LEN]]
+; CHECK-NEXT:    br i1 [[GUARD_PRELOOP]], label [[IN_BOUNDS_PRELOOP]], label [[OUT_OF_BOUNDS_LOOPEXIT:%.*]]
+; CHECK:       in.bounds.preloop:
+; CHECK-NEXT:    [[ADDR_PRELOOP:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX_PRELOOP]]
+; CHECK-NEXT:    store i32 0, ptr [[ADDR_PRELOOP]], align 4
+; CHECK-NEXT:    [[NEXT_PRELOOP:%.*]] = icmp sgt i32 [[IDX_DEC_PRELOOP]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[IDX_DEC_PRELOOP]], [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[LOOP_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR:%.*]], !llvm.loop [[LOOP7:![0-9]+]], !loop_constrainer.loop.clone !6
+; CHECK:       preloop.exit.selector:
+; CHECK-NEXT:    [[IDX_DEC_PRELOOP_LCSSA:%.*]] = phi i32 [ [[IDX_DEC_PRELOOP]], [[IN_BOUNDS_PRELOOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[IDX_DEC_PRELOOP_LCSSA]], -1
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRELOOP_PSEUDO_EXIT]], label [[EXIT_LOOPEXIT]]
+; CHECK:       preloop.pseudo.exit:
+; CHECK-NEXT:    [[IDX_PRELOOP_COPY]] = phi i32 [ [[INDVAR_START]], [[PREHEADER]] ], [ [[IDX_DEC_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END:%.*]] = phi i32 [ [[INDVAR_START]], [[PREHEADER]] ], [ [[IDX_DEC_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[MAINLOOP]]
 ;
   entry:
   %len = load i32, ptr %len_ptr, !range !0
diff --git a/llvm/test/Transforms/IROutliner/outlining-no-return-functions.ll b/llvm/test/Transforms/IROutliner/outlining-no-return-functions.ll
index 6b12207a32dff..d2c41376f6b2c 100644
--- a/llvm/test/Transforms/IROutliner/outlining-no-return-functions.ll
+++ b/llvm/test/Transforms/IROutliner/outlining-no-return-functions.ll
@@ -29,19 +29,19 @@ bb1:
 ; CHECK-LABEL: @f1(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    call void @outlined_ir_func_0()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: @f2(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    call void @outlined_ir_func_0()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: @f3(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    call void @outlined_ir_func_0()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define internal void @outlined_ir_func_0(
diff --git a/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll b/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
index 0e21bf8ba347a..59a0241bfe9fd 100644
--- a/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
+++ b/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
@@ -493,3 +493,55 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
   %cmp = icmp ult i32 %add, %length
   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
 }
+
+; Test that we can handle shl and disjoint or in getExtendedOperandRecurrence.
+define void @foo7(i32 %n, ptr %a, i32 %x) {
+; CHECK-LABEL: @foo7(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %add1 = add nsw i32 %x, 2
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add2, %for.body ]
+  %mul = shl nsw i32 %i.07, 1
+  %add = or disjoint i32 %mul, 1
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+  store i32 %i.07, ptr %arrayidx, align 4
+  %add2 = add nsw i32 %add1, %i.07
+  %cmp = icmp slt i32 %add2, %n
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
diff --git a/llvm/test/Transforms/InstCombine/and-or-implied-cond-not.ll b/llvm/test/Transforms/InstCombine/and-or-implied-cond-not.ll
new file mode 100644
index 0000000000000..89b3164018ac9
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/and-or-implied-cond-not.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i1 @test_imply_not1(i32 %depth) {
+; CHECK-LABEL: define i1 @test_imply_not1(
+; CHECK-SAME: i32 [[DEPTH:%.*]]) {
+; CHECK-NEXT:    [[CMP1_NOT1:%.*]] = icmp eq i32 [[DEPTH]], 16
+; CHECK-NEXT:    call void @use(i1 [[CMP1_NOT1]])
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[DEPTH]], 8
+; CHECK-NEXT:    call void @use(i1 [[CMP2]])
+; CHECK-NEXT:    br i1 [[CMP1_NOT1]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @func1()
+; CHECK-NEXT:    unreachable
+; CHECK:       if.else:
+; CHECK-NEXT:    call void @func2()
+; CHECK-NEXT:    unreachable
+;
+  %cmp1 = icmp eq i32 %depth, 16
+  call void @use(i1 %cmp1)
+  %cmp2 = icmp slt i32 %depth, 8
+  call void @use(i1 %cmp2)
+  %cmp.not = xor i1 %cmp1, true
+  %brmerge = or i1 %cmp2, %cmp.not
+  br i1 %brmerge, label %if.then, label %if.else
+if.then:
+  call void @func1()
+  unreachable
+
+if.else:
+  call void @func2()
+  unreachable
+}
+
+define i1 @test_imply_not2(i32 %a, i1 %cmp2) {
+; CHECK-LABEL: define i1 @test_imply_not2(
+; CHECK-SAME: i32 [[A:%.*]], i1 [[CMP2:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT:    [[BRMERGE:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[BRMERGE]]
+;
+  %cmp1 = icmp eq i32 %a, 0
+  %or.cond = select i1 %cmp1, i1 %cmp2, i1 false
+  %cmp.not = xor i1 %cmp1, true
+  %brmerge = or i1 %or.cond, %cmp.not
+  ret i1 %brmerge
+}
+
+define i1 @test_imply_not3(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: define i1 @test_imply_not3(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A]], [[B]]
+; CHECK-NEXT:    call void @use(i1 [[CMP1]])
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[A]], [[B]]
+; CHECK-NEXT:    [[AND:%.*]] = select i1 [[CMP2]], i1 [[COND]], i1 false
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp eq i32 %a, %b
+  call void @use(i1 %cmp1)
+  %cmp2 = icmp slt i32 %a, %b
+  %cmp.not = xor i1 %cmp1, true
+  %sel = select i1 %cmp.not, i1 %cond, i1 false
+  %and = and i1 %cmp2, %sel
+  ret i1 %and
+}
+
+declare void @func1()
+declare void @func2()
+declare void @use(i1)
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 927f0a86b0a25..87c75fb2b5559 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -386,7 +386,7 @@ define i1 @nonnull5(ptr %a) {
 define i32 @assumption_conflicts_with_known_bits(i32 %a, i32 %b) {
 ; CHECK-LABEL: @assumption_conflicts_with_known_bits(
 ; CHECK-NEXT:    store i1 true, ptr poison, align 1
-; CHECK-NEXT:    ret i32 1
+; CHECK-NEXT:    ret i32 poison
 ;
   %and1 = and i32 %b, 3
   %B1 = lshr i32 %and1, %and1
diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index 7d2b872985d5c..82cdb3ce6bee6 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -1004,3 +1004,123 @@ define float @test_ui_add_with_signed_constant(i32 %shr.i) {
   %add = fadd float %sub, -16383.0
   ret float %add
 }
+
+
+;; Reduced form of bug noticed due to #82555
+define float @missed_nonzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @missed_nonzero_check_on_constant_for_si_fmul(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp i16 [[CONV_I]] to float
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul float [[CONV1_I]], 0.000000e+00
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret float [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i = trunc i32 %sel to i16
+  %conv1.i = sitofp i16 %conv.i to float
+  %mul3.i.i = fmul float %conv1.i, 0.000000e+00
+  store i32 %sel, ptr %g_2345, align 4
+  ret float %mul3.i.i
+}
+
+define <2 x float> @missed_nonzero_check_on_constant_for_si_fmul_vec(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @missed_nonzero_check_on_constant_for_si_fmul_vec(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I_S:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
+; CHECK-NEXT:    [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], zeroinitializer
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i.s = trunc i32 %sel to i16
+  %conv.i.v = insertelement <2 x i16> poison, i16 %conv.i.s, i64 0
+  %conv.i = insertelement <2 x i16> %conv.i.v, i16 %conv.i.s, i64 1
+  %conv1.i = sitofp <2 x i16> %conv.i to <2 x float>
+  %mul3.i.i = fmul <2 x float> %conv1.i, zeroinitializer
+  store i32 %sel, ptr %g_2345, align 4
+  ret <2 x float> %mul3.i.i
+}
+
+define float @negzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @negzero_check_on_constant_for_si_fmul(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp i16 [[CONV_I]] to float
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul float [[CONV1_I]], -0.000000e+00
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret float [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i = trunc i32 %sel to i16
+  %conv1.i = sitofp i16 %conv.i to float
+  %mul3.i.i = fmul float %conv1.i, -0.000000e+00
+  store i32 %sel, ptr %g_2345, align 4
+  ret float %mul3.i.i
+}
+
+define <2 x float> @nonzero_check_on_constant_for_si_fmul_vec_w_undef(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_vec_w_undef(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I_S:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
+; CHECK-NEXT:    [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], <float undef, float 0.000000e+00>
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i.s = trunc i32 %sel to i16
+  %conv.i.v = insertelement <2 x i16> poison, i16 %conv.i.s, i64 0
+  %conv.i = insertelement <2 x i16> %conv.i.v, i16 %conv.i.s, i64 1
+  %conv1.i = sitofp <2 x i16> %conv.i to <2 x float>
+  %mul3.i.i = fmul <2 x float> %conv1.i, <float undef, float 0.000000e+00>
+  store i32 %sel, ptr %g_2345, align 4
+  ret <2 x float> %mul3.i.i
+}
+
+define <2 x float> @nonzero_check_on_constant_for_si_fmul_nz_vec_w_undef(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_nz_vec_w_undef(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I_S:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
+; CHECK-NEXT:    [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], <float undef, float 1.000000e+00>
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i.s = trunc i32 %sel to i16
+  %conv.i.v = insertelement <2 x i16> poison, i16 %conv.i.s, i64 0
+  %conv.i = insertelement <2 x i16> %conv.i.v, i16 %conv.i.s, i64 1
+  %conv1.i = sitofp <2 x i16> %conv.i to <2 x float>
+  %mul3.i.i = fmul <2 x float> %conv1.i, <float undef, float 1.000000e+00>
+  store i32 %sel, ptr %g_2345, align 4
+  ret <2 x float> %mul3.i.i
+}
+
+define <2 x float> @nonzero_check_on_constant_for_si_fmul_negz_vec_w_undef(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_negz_vec_w_undef(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I_S:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
+; CHECK-NEXT:    [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], <float undef, float -0.000000e+00>
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i.s = trunc i32 %sel to i16
+  %conv.i.v = insertelement <2 x i16> poison, i16 %conv.i.s, i64 0
+  %conv.i = insertelement <2 x i16> %conv.i.v, i16 %conv.i.s, i64 1
+  %conv1.i = sitofp <2 x i16> %conv.i to <2 x float>
+  %mul3.i.i = fmul <2 x float> %conv1.i, <float undef, float -0.000000e+00>
+  store i32 %sel, ptr %g_2345, align 4
+  ret <2 x float> %mul3.i.i
+}
diff --git a/llvm/test/Transforms/InstCombine/copysign-fneg-fabs.ll b/llvm/test/Transforms/InstCombine/copysign-fneg-fabs.ll
index af939cf74399a..a63eeab6a4b1e 100644
--- a/llvm/test/Transforms/InstCombine/copysign-fneg-fabs.ll
+++ b/llvm/test/Transforms/InstCombine/copysign-fneg-fabs.ll
@@ -275,6 +275,85 @@ define half @fneg_fabs_copysign_multi_use_fabs(half %x, half %y, ptr %ptr) {
   ret half %fabs.copysign
 }
 
+define half @copysign_pos(half %a) {
+; CHECK-LABEL: @copysign_pos(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH3C00, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xH3C00, half %a)
+  ret half %ret
+}
+
+define half @copysign_neg(half %a) {
+; CHECK-LABEL: @copysign_neg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH3C00, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xHBC00, half %a)
+  ret half %ret
+}
+
+define half @copysign_negzero(half %a) {
+; CHECK-LABEL: @copysign_negzero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xH8000, half %a)
+  ret half %ret
+}
+
+define half @copysign_negnan(half %a) {
+; CHECK-LABEL: @copysign_negnan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH7E00, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xHFE00, half %a)
+  ret half %ret
+}
+
+define half @copysign_neginf(half %a) {
+; CHECK-LABEL: @copysign_neginf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH7C00, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xHFC00, half %a)
+  ret half %ret
+}
+
+define <4 x half> @copysign_splat(<4 x half> %a) {
+; CHECK-LABEL: @copysign_splat(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x half> @llvm.copysign.v4f16(<4 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, <4 x half> [[A:%.*]])
+; CHECK-NEXT:    ret <4 x half> [[RET]]
+;
+entry:
+  %ret = call <4 x half> @llvm.copysign.v4f16(<4 x half> splat(half 0xHBC00), <4 x half> %a)
+  ret <4 x half> %ret
+}
+
+; TODO: Support constant folding of fabs
+
+define <4 x half> @copysign_vec4(<4 x half> %a) {
+; CHECK-LABEL: @copysign_vec4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x half> @llvm.copysign.v4f16(<4 x half> <half 0xH3C00, half 0xHBC00, half undef, half poison>, <4 x half> [[A:%.*]])
+; CHECK-NEXT:    ret <4 x half> [[RET]]
+;
+entry:
+  %ret = call <4 x half> @llvm.copysign.v4f16(<4 x half> <half 0xH3C00, half 0xHBC00, half undef, half poison>, <4 x half> %a)
+  ret <4 x half> %ret
+}
+
 declare half @llvm.fabs.f16(half)
 declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
 declare half @llvm.copysign.f16(half, half)
diff --git a/llvm/test/Transforms/InstCombine/double-float-shrink-1.ll b/llvm/test/Transforms/InstCombine/double-float-shrink-1.ll
index 85c9a01e5faba..693730256b6b0 100644
--- a/llvm/test/Transforms/InstCombine/double-float-shrink-1.ll
+++ b/llvm/test/Transforms/InstCombine/double-float-shrink-1.ll
@@ -224,11 +224,20 @@ define double @expm1_test2(float %f)   {
 ; exp10f() doesn't exist for this triple, so it doesn't shrink.
 
 define float @exp10_test1(float %f)   {
-; CHECK-LABEL: @exp10_test1(
-; CHECK-NEXT:    [[CONV:%.*]] = fpext float [[F:%.*]] to double
-; CHECK-NEXT:    [[CALL:%.*]] = call fast double @exp10(double [[CONV]])
-; CHECK-NEXT:    [[CONV1:%.*]] = fptrunc double [[CALL]] to float
-; CHECK-NEXT:    ret float [[CONV1]]
+; LINUX-LABEL: define float @exp10_test1(
+; LINUX-SAME: float [[F:%.*]]) {
+; LINUX-NEXT:    [[EXP10F:%.*]] = call fast float @__exp10f(float [[F]])
+; LINUX-NEXT:    ret float [[EXP10F]]
+;
+; MS64-LABEL: define float @exp10_test1(
+; MS64-SAME: float [[F:%.*]]) {
+; MS64-NEXT:    [[EXP10F:%.*]] = call fast float @exp10f(float [[F]])
+; MS64-NEXT:    ret float [[EXP10F]]
+;
+; MS32-LABEL: define float @exp10_test1(
+; MS32-SAME: float [[F:%.*]]) {
+; MS32-NEXT:    [[EXP10F:%.*]] = call fast float @exp10f(float [[F]])
+; MS32-NEXT:    ret float [[EXP10F]]
 ;
   %conv = fpext float %f to double
   %call = call fast double @exp10(double %conv)
diff --git a/llvm/test/Transforms/InstCombine/extract-select-agg.ll b/llvm/test/Transforms/InstCombine/extract-select-agg.ll
new file mode 100644
index 0000000000000..6ba6b1a575601
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/extract-select-agg.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i64 @test_select_agg_constant_agg(i64 %val, i1 %cond) {
+; CHECK-LABEL: define i64 @test_select_agg_constant_agg(
+; CHECK-SAME: i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = zext i1 [[COND]] to i64
+; CHECK-NEXT:    ret i64 [[RET]]
+;
+entry:
+  %sel = select i1 %cond, { i64, i64 } {i64 1, i64 2}, { i64, i64 } {i64 0, i64 3}
+  %ret = extractvalue { i64, i64 } %sel, 0
+  ret i64 %ret
+}
+
+define void @test_select_agg_constant_agg_multiuse(i64 %val, i1 %cond) {
+; CHECK-LABEL: define void @test_select_agg_constant_agg_multiuse(
+; CHECK-SAME: i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = zext i1 [[COND]] to i64
+; CHECK-NEXT:    call void @use(i64 [[RET]])
+; CHECK-NEXT:    [[V1:%.*]] = select i1 [[COND]], i64 2, i64 3
+; CHECK-NEXT:    call void @use(i64 [[V1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %sel = select i1 %cond, { i64, i64 } {i64 1, i64 2}, { i64, i64 } {i64 0, i64 3}
+  %v0 = extractvalue { i64, i64 } %sel, 0
+  call void @use(i64 %v0)
+  %v1 = extractvalue { i64, i64 } %sel, 1
+  call void @use(i64 %v1)
+  ret void
+}
+
+; TODO: it can be folded to zext i1 %cond to i64
+define i64 @test_select_agg_constant(i64 %val, i1 %cond) {
+; CHECK-LABEL: define i64 @test_select_agg_constant(
+; CHECK-SAME: i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = insertvalue { i64, i64 } { i64 1, i64 poison }, i64 [[VAL]], 1
+; CHECK-NEXT:    [[B:%.*]] = insertvalue { i64, i64 } { i64 0, i64 poison }, i64 [[VAL]], 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], { i64, i64 } [[A]], { i64, i64 } [[B]]
+; CHECK-NEXT:    [[RET:%.*]] = extractvalue { i64, i64 } [[SEL]], 0
+; CHECK-NEXT:    ret i64 [[RET]]
+;
+entry:
+  %a = insertvalue { i64, i64 } { i64 1, i64 poison }, i64 %val, 1
+  %b = insertvalue { i64, i64 } { i64 0, i64 poison }, i64 %val, 1
+  %sel = select i1 %cond, { i64, i64 } %a, { i64, i64 } %b
+  %ret = extractvalue { i64, i64 } %sel, 0
+  ret i64 %ret
+}
+
+define void @test_select_agg_multiuse(i1 %cond, i64 %v1, i64 %v2, i64 %v3, i64 %v4) {
+; CHECK-LABEL: define void @test_select_agg_multiuse(
+; CHECK-SAME: i1 [[COND:%.*]], i64 [[V1:%.*]], i64 [[V2:%.*]], i64 [[V3:%.*]], i64 [[V4:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A0:%.*]] = insertvalue { i64, i64 } poison, i64 [[V1]], 0
+; CHECK-NEXT:    [[A1:%.*]] = insertvalue { i64, i64 } [[A0]], i64 [[V2]], 1
+; CHECK-NEXT:    [[B0:%.*]] = insertvalue { i64, i64 } poison, i64 [[V3]], 0
+; CHECK-NEXT:    [[B1:%.*]] = insertvalue { i64, i64 } [[B0]], i64 [[V4]], 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], { i64, i64 } [[A1]], { i64, i64 } [[B1]]
+; CHECK-NEXT:    [[X:%.*]] = extractvalue { i64, i64 } [[SEL]], 0
+; CHECK-NEXT:    call void @use(i64 [[X]])
+; CHECK-NEXT:    [[Y:%.*]] = extractvalue { i64, i64 } [[SEL]], 1
+; CHECK-NEXT:    call void @use(i64 [[Y]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a0 = insertvalue { i64, i64 } poison, i64 %v1, 0
+  %a1 = insertvalue { i64, i64 } %a0, i64 %v2, 1
+  %b0 = insertvalue { i64, i64 } poison, i64 %v3, 0
+  %b1 = insertvalue { i64, i64 } %b0, i64 %v4, 1
+  %sel = select i1 %cond, { i64, i64 } %a1, { i64, i64 } %b1
+  %x = extractvalue { i64, i64 } %sel, 0
+  call void @use(i64 %x)
+  %y = extractvalue { i64, i64 } %sel, 1
+  call void @use(i64 %y)
+  ret void
+}
+
+declare void @use(i64)
diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll
index 159c84d0dd8aa..f2701d16d0f3d 100644
--- a/llvm/test/Transforms/InstCombine/fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/fcmp.ll
@@ -644,7 +644,7 @@ define <2 x i1> @is_signbit_set_anyzero(<2 x double> %x) {
 
 define i1 @is_signbit_clear(double %x) {
 ; CHECK-LABEL: @is_signbit_clear(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp ogt double [[S]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -655,7 +655,7 @@ define i1 @is_signbit_clear(double %x) {
 
 define i1 @is_signbit_clear_1(double %x) {
 ; CHECK-LABEL: @is_signbit_clear_1(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp ugt double [[S]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -666,7 +666,7 @@ define i1 @is_signbit_clear_1(double %x) {
 
 define i1 @is_signbit_clear_2(double %x) {
 ; CHECK-LABEL: @is_signbit_clear_2(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp oge double [[S]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -677,7 +677,7 @@ define i1 @is_signbit_clear_2(double %x) {
 
 define i1 @is_signbit_clear_3(double %x) {
 ; CHECK-LABEL: @is_signbit_clear_3(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp uge double [[S]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -705,7 +705,7 @@ define i1 @is_signbit_set_extra_use(double %x, ptr %p) {
 
 define i1 @is_signbit_clear_nonzero(double %x) {
 ; CHECK-LABEL: @is_signbit_clear_nonzero(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp ogt double [[S]], 1.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll
index a7adcd1ac61e7..dedd12f8cc7a3 100644
--- a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll
+++ b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll
@@ -427,13 +427,13 @@ define float @fmul_by_snan_if_0_oeq_zero_f32(float %x) {
 define float @fmul_by_var_if_0_oeq_zero_f32(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -441,14 +441,14 @@ define float @fmul_by_fabs_var_if_0_oeq_zero_f32(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_fabs_var_if_0_oeq_zero_f32(
 ; CHECK-NEXT:    [[Y_FABS:%.*]] = call float @llvm.fabs.f32(float [[Y:%.*]])
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y_FABS]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y_FABS]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %y.fabs = call float @llvm.fabs.f32(float %y)
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %y.fabs
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -467,13 +467,13 @@ define float @fmul_by_fabs_nnan_ninf_var_if_0_oeq_zero_f32(float %x, float %y) {
 define float @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nsz float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -481,13 +481,13 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(float %x, float %y) {
 define float @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul ninf nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nsz ninf float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -495,13 +495,13 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(float %x, float %y) {
 define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nsz nnan float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -509,13 +509,13 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(float %x, float %y) {
 define float @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -558,26 +558,26 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_nsz_inverted(f
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf nsz float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf nsz float %y, %x
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -585,26 +585,26 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(float %x
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero(float %x, float nofpclass(nzero) %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero_negsub(float %x, float nofpclass(nzero nsub) %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero_negsub(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -622,26 +622,26 @@ define float @fmul_by_var_if_0_oeq_zero_f32_known_never_nan_inf_select_nsz(float
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero(float %x, float nofpclass(nan inf nzero) %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero_nsub(float %x, float nofpclass(nan inf nzero nsub) %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero_nsub(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -692,26 +692,26 @@ define float @fmul_by_var_if_not_one_0_zero_f32_assume_finite_fmul_nsz(float %x,
 define float @fmul_by_self_if_0_oeq_zero_f32(float %x) {
 ; CHECK-LABEL: @fmul_by_self_if_0_oeq_zero_f32(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %x
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
 define float @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x) {
 ; CHECK-LABEL: @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf nsz float %x, %x
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
diff --git a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll
index 3e5c6fd20b12d..32bfdb52bb5f7 100644
--- a/llvm/test/Transforms/InstCombine/fpcast.ll
+++ b/llvm/test/Transforms/InstCombine/fpcast.ll
@@ -347,3 +347,91 @@ define double @masked_uint_to_fpext3(i32 %x) {
   %r = fpext float %f to double
   ret double %r
 }
+
+define i32 @fptosi_nonnorm(float nofpclass(norm) %x) {
+; CHECK-LABEL: @fptosi_nonnorm(
+; CHECK-NEXT:    ret i32 0
+;
+  %ret = fptosi float %x to i32
+  ret i32 %ret
+}
+
+define i32 @fptoui_nonnorm(float nofpclass(pnorm) %x) {
+; CHECK-LABEL: @fptoui_nonnorm(
+; CHECK-NEXT:    ret i32 0
+;
+  %ret = fptoui float %x to i32
+  ret i32 %ret
+}
+
+define i32 @fptosi_nonnnorm(float nofpclass(nnorm) %x) {
+; CHECK-LABEL: @fptosi_nonnnorm(
+; CHECK-NEXT:    [[RET:%.*]] = fptosi float [[X:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %ret = fptosi float %x to i32
+  ret i32 %ret
+}
+
+define i32 @fptoui_nonnnorm(float nofpclass(nnorm) %x) {
+; CHECK-LABEL: @fptoui_nonnnorm(
+; CHECK-NEXT:    [[RET:%.*]] = fptoui float [[X:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %ret = fptoui float %x to i32
+  ret i32 %ret
+}
+
+define i32 @fptosi_nonnorm_copysign(float %x) {
+; CHECK-LABEL: @fptosi_nonnorm_copysign(
+; CHECK-NEXT:    ret i32 0
+;
+  %val = call float @llvm.copysign.f32(float 0.0, float %x)
+  %ret = fptosi float %val to i32
+  ret i32 %ret
+}
+
+define <2 x i32> @fptosi_nonnorm_copysign_vec(<2 x float> %x) {
+; CHECK-LABEL: @fptosi_nonnorm_copysign_vec(
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+;
+  %val = call <2 x float> @llvm.copysign.v2f32(<2 x float> zeroinitializer, <2 x float> %x)
+  %ret = fptosi <2 x float> %val to <2 x i32>
+  ret <2 x i32> %ret
+}
+
+define i32 @fptosi_nonnorm_fmul(float %x) {
+; CHECK-LABEL: @fptosi_nonnorm_fmul(
+; CHECK-NEXT:    [[SEL:%.*]] = fmul float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[RET:%.*]] = fptosi float [[SEL]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %sel = fmul float %x, 0.000000e+00
+  %ret = fptosi float %sel to i32
+  ret i32 %ret
+}
+
+define i32 @fptosi_select(i1 %cond) {
+; CHECK-LABEL: @fptosi_select(
+; CHECK-NEXT:    [[RET:%.*]] = select i1 [[COND:%.*]], i32 1, i32 -1
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %sel = select i1 %cond, float 1.0, float -1.0
+  %ret = fptosi float %sel to i32
+  ret i32 %ret
+}
+
+define i32 @mul_pos_zero_convert(i32 %a) {
+; CHECK-LABEL: @mul_pos_zero_convert(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[FP:%.*]] = sitofp i32 [[A:%.*]] to float
+; CHECK-NEXT:    [[RET:%.*]] = fmul float [[FP]], 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[RET]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %fp = sitofp i32 %a to float
+  %ret = fmul float %fp, 0.000000e+00
+  %conv = fptosi float %ret to i32
+  ret i32 %conv
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
index 0706092289584..5de3e89d7027a 100644
--- a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
@@ -226,12 +226,11 @@ define i1 @src_is_mask_shl_lshr(i8 %x_in, i8 %y, i1 %cond) {
 
 define i1 @src_is_mask_shl_lshr_fail_not_allones(i8 %x_in, i8 %y, i1 %cond) {
 ; CHECK-LABEL: @src_is_mask_shl_lshr_fail_not_allones(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = and i8 [[TMP1]], -2
-; CHECK-NEXT:    [[NOTMASK:%.*]] = xor i8 [[MASK]], -1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[NOTMASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[MASK]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -471,6 +470,24 @@ define i1 @src_is_notmask_x_xor_neg_x(i8 %x_in, i8 %y, i1 %cond) {
   ret i1 %r
 }
 
+define i1 @src_is_notmask_x_xor_neg_x_inv(i8 %x_in, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_is_notmask_x_xor_neg_x_inv(
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
+; CHECK-NEXT:    [[NEG_Y:%.*]] = add i8 [[Y:%.*]], -1
+; CHECK-NEXT:    [[NOTMASK0:%.*]] = xor i8 [[NEG_Y]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i8 [[NOTMASK0]], i8 7
+; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[TMP3]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x = xor i8 %x_in, 123
+  %neg_y = sub i8 0, %y
+  %nmask0 = xor i8 %y, %neg_y
+  %notmask = select i1 %cond, i8 %nmask0, i8 -8
+  %and = and i8 %notmask, %x
+  %r = icmp eq i8 %and, 0
+  ret i1 %r
+}
+
 define i1 @src_is_notmask_shl_fail_multiuse_invert(i8 %x_in, i8 %y, i1 %cond) {
 ; CHECK-LABEL: @src_is_notmask_shl_fail_multiuse_invert(
 ; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 122
@@ -572,11 +589,10 @@ define i1 @src_is_notmask_neg_p2(i8 %x_in, i8 %y) {
 
 define i1 @src_is_notmask_neg_p2_fail_not_invertable(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_notmask_neg_p2_fail_not_invertable(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[Y:%.*]], -1
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i8 [[Y]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = icmp uge i8 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -657,3 +673,238 @@ define i1 @src_is_mask_const_sge(i8 %x_in) {
   %r = icmp sge i8 %and, %x
   ret i1 %r
 }
+
+define i1 @src_x_and_mask_slt(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_mask_slt(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[MASK_POS:%.*]] = icmp sgt i8 [[MASK]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MASK_POS]])
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %mask_pos = icmp sge i8 %mask, 0
+  call void @llvm.assume(i1 %mask_pos)
+  %and = and i8 %x, %mask
+  %r = icmp slt i8 %and, %x
+  ret i1 %r
+}
+
+define i1 @src_x_and_mask_sge(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_mask_sge(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[MASK_POS:%.*]] = icmp sgt i8 [[MASK]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MASK_POS]])
+; CHECK-NEXT:    [[R:%.*]] = icmp sge i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %mask_pos = icmp sge i8 %mask, 0
+  call void @llvm.assume(i1 %mask_pos)
+  %and = and i8 %x, %mask
+  %r = icmp sge i8 %and, %x
+  ret i1 %r
+}
+
+define i1 @src_x_and_mask_slt_fail_maybe_neg(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_mask_slt_fail_maybe_neg(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i8 [[AND]], [[X]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %and = and i8 %x, %mask
+  %r = icmp slt i8 %and, %x
+  ret i1 %r
+}
+
+define i1 @src_x_and_mask_sge_fail_maybe_neg(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_mask_sge_fail_maybe_neg(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sge i8 [[AND]], [[X]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %and = and i8 %x, %mask
+  %r = icmp sge i8 %and, %x
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_eq(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_eq(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R1:%.*]] = icmp ule i8 [[NOT_MASK0]], [[X:%.*]]
+; CHECK-NEXT:    [[NOT_COND:%.*]] = xor i1 [[COND:%.*]], true
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[NOT_COND]], i1 true, i1 [[R1]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp eq i8 %not_mask, %and
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_ne(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_ne(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R1:%.*]] = icmp ugt i8 [[NOT_MASK0]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[COND:%.*]], i1 [[R1]], i1 false
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp ne i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_ult(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_ult(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R1:%.*]] = icmp ugt i8 [[NOT_MASK0]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[COND:%.*]], i1 [[R1]], i1 false
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp ult i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_uge(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_uge(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R1:%.*]] = icmp ule i8 [[NOT_MASK0]], [[X:%.*]]
+; CHECK-NEXT:    [[NOT_COND:%.*]] = xor i1 [[COND:%.*]], true
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[NOT_COND]], i1 true, i1 [[R1]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp uge i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_slt(i8 %x, i8 %y) {
+; CHECK-LABEL: @src_x_and_nmask_slt(
+; CHECK-NEXT:    [[NOT_MASK:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i8 [[NOT_MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask = shl i8 -1, %y
+  %and = and i8 %x, %not_mask
+  %r = icmp slt i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_sge(i8 %x, i8 %y) {
+; CHECK-LABEL: @src_x_and_nmask_sge(
+; CHECK-NEXT:    [[NOT_MASK:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i8 [[NOT_MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask = shl i8 -1, %y
+  %and = and i8 %x, %not_mask
+  %r = icmp sge i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_slt_fail_maybe_z(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_slt_fail_maybe_z(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[NOT_MASK:%.*]] = select i1 [[COND:%.*]], i8 [[NOT_MASK0]], i8 0
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[NOT_MASK]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i8 [[AND]], [[NOT_MASK]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp slt i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_sge_fail_maybe_z(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_sge_fail_maybe_z(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[NOT_MASK:%.*]] = select i1 [[COND:%.*]], i8 [[NOT_MASK0]], i8 0
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[NOT_MASK]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sge i8 [[AND]], [[NOT_MASK]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp sge i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_or_mask_eq(i8 %x, i8 %y, i8 %z, i1 %c2, i1 %cond) {
+; CHECK-LABEL: @src_x_or_mask_eq(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[C2:%.*]], i8 [[TMP1]], i8 -46
+; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.umax.i8(i8 [[Z:%.*]], i8 [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[TMP3]], -12
+; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[TMP4]], [[MASK]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %nx = xor i8 %x, 123
+  %nx_c = select i1 %c2, i8 %nx, i8 45
+  %nz = xor i8 %z, -1
+  %nx_cc = call i8 @llvm.umin.i8(i8 %nz, i8 %nx_c)
+  %nx_ccc = add i8 %nx_cc, 12
+  %or = or i8 %nx_ccc, %mask
+  %r = icmp eq i8 %or, -1
+  ret i1 %r
+}
+
+define i1 @src_x_or_mask_ne(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_or_mask_ne(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %nx = xor i8 %x, -1
+  %or = or i8 %mask, %nx
+  %r = icmp ne i8 %or, -1
+  ret i1 %r
+}
+
+define i1 @src_x_or_mask_ne_fail_multiuse(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_or_mask_ne_fail_multiuse(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[NX:%.*]] = xor i8 [[X:%.*]], -1
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[MASK]], [[NX]]
+; CHECK-NEXT:    call void @use.i8(i8 [[OR]])
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[OR]], -1
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %nx = xor i8 %x, -1
+  %or = or i8 %mask, %nx
+  call void @use.i8(i8 %or)
+  %r = icmp ne i8 %or, -1
+  ret i1 %r
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll b/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll
new file mode 100644
index 0000000000000..bacdb54f787d6
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define i1 @sgt_3_impliesF_eq_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @sgt_3_impliesF_eq_2(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 4
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 [[SEL:%.*]], [[X]]
+; CHECK-NEXT:    [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp = icmp sgt i8 %x, 3
+  %sel = select i1 %cmp, i8 2, i8 %y
+  %cmp2 = icmp eq i8 %sel, %x
+  ret i1 %cmp2
+}
+
+define i1 @sgt_3_impliesT_sgt_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @sgt_3_impliesT_sgt_2(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 4
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 [[SEL:%.*]], [[X]]
+; CHECK-NEXT:    [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp = icmp sgt i8 %x, 3
+  %sel = select i1 %cmp, i8 2, i8 %y
+  %cmp2 = icmp sgt i8 %sel, %x
+  ret i1 %cmp2
+}
+
+define i1 @sgt_x_impliesF_eq_smin_todo(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @sgt_x_impliesF_eq_smin_todo(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 -128, i8 [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 [[SEL]], [[X]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp = icmp sgt i8 %x, %z
+  %sel = select i1 %cmp, i8 -128, i8 %y
+  %cmp2 = icmp eq i8 %sel, %x
+  ret i1 %cmp2
+}
+
+define i1 @slt_x_impliesT_ne_smin_todo(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @slt_x_impliesT_ne_smin_todo(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 127, i8 [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 [[SEL]], [[X]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp = icmp slt i8 %x, %z
+  %sel = select i1 %cmp, i8 127, i8 %y
+  %cmp2 = icmp ne i8 %x, %sel
+  ret i1 %cmp2
+}
+
+define i1 @ult_x_impliesT_eq_umax_todo(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @ult_x_impliesT_eq_umax_todo(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 -1, i8 [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 [[SEL]], [[X]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp = icmp ugt i8 %z, %x
+  %sel = select i1 %cmp, i8 255, i8 %y
+  %cmp2 = icmp ne i8 %sel, %x
+  ret i1 %cmp2
+}
+
+define i1 @ult_1_impliesF_eq_1(i8 %x, i8 %y) {
+; CHECK-LABEL: @ult_1_impliesF_eq_1(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[SEL:%.*]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 [[X:%.*]], [[SEL]]
+; CHECK-NEXT:    [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp = icmp ult i8 %x, 1
+  %sel = select i1 %cmp, i8 1, i8 %y
+  %cmp2 = icmp eq i8 %x, %sel
+  ret i1 %cmp2
+}
+
+define i1 @ugt_x_impliesF_eq_umin_todo(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @ugt_x_impliesF_eq_umin_todo(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 0, i8 [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 [[SEL]], [[X]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp = icmp ugt i8 %z, %x
+  %sel = select i1 %cmp, i8 0, i8 %y
+  %cmp2 = icmp eq i8 %x, %sel
+  ret i1 %cmp2
+}
diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
index a203b28bcb82a..f37226bbd5b09 100644
--- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll
+++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
@@ -240,3 +240,43 @@ define i32 @vec_to_scalar_select_vector(<2 x i1> %b) {
   %c = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %s)
   ret i32 %c
 }
+
+define i8 @test_drop_noundef(i1 %cond, i8 %val) {
+; CHECK-LABEL: @test_drop_noundef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @llvm.smin.i8(i8 [[VAL:%.*]], i8 0)
+; CHECK-NEXT:    [[RET:%.*]] = select i1 [[COND:%.*]], i8 -1, i8 [[TMP0]]
+; CHECK-NEXT:    ret i8 [[RET]]
+;
+entry:
+  %sel = select i1 %cond, i8 -1, i8 %val
+  %ret = call noundef i8 @llvm.smin.i8(i8 %sel, i8 0)
+  ret i8 %ret
+}
+
+define i1 @pr85536(i32 %a) {
+; CHECK-LABEL: @pr85536(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[A:%.*]], 31
+; CHECK-NEXT:    [[SHL1:%.*]] = shl nsw i32 -1, [[A]]
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[SHL1]] to i64
+; CHECK-NEXT:    [[SHL2:%.*]] = shl i64 [[ZEXT]], 48
+; CHECK-NEXT:    [[SHR:%.*]] = ashr exact i64 [[SHL2]], 48
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.smin.i64(i64 [[SHR]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 65535
+; CHECK-NEXT:    [[RET1:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    [[RET:%.*]] = select i1 [[CMP1]], i1 [[RET1]], i1 false
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+entry:
+  %cmp1 = icmp ugt i32 %a, 30
+  %shl1 = shl nsw i32 -1, %a
+  %zext = zext i32 %shl1 to i64
+  %shl2 = shl i64 %zext, 48
+  %shr = ashr exact i64 %shl2, 48
+  %sel = select i1 %cmp1, i64 -1, i64 %shr
+  %smin = call noundef i64 @llvm.smin.i64(i64 %sel, i64 0)
+  %masked = and i64 %smin, 65535
+  %ret = icmp eq i64 %masked, 0
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll
index e7141d7c25ad2..a176d16f2cdfb 100644
--- a/llvm/test/Transforms/InstCombine/mul.ll
+++ b/llvm/test/Transforms/InstCombine/mul.ll
@@ -2049,3 +2049,94 @@ define i32 @zext_negpow2_use(i8 %x) {
   %r = mul i32 %zx, -16777216 ; -1 << 24
   ret i32 %r
 }
+
+define i32 @mul_sext_icmp_with_zero(i32 %x) {
+; CHECK-LABEL: @mul_sext_icmp_with_zero(
+; CHECK-NEXT:    ret i32 0
+;
+  %cmp = icmp eq i32 %x, 0
+  %sext = sext i1 %cmp to i32
+  %mul = mul i32 %sext, %x
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool(
+; CHECK-NEXT:    [[Y_NEG:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  %mul = mul i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool_nuw(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool_nuw(
+; CHECK-NEXT:    [[Y_NEG:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  %mul = mul nuw i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool_nsw(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool_nsw(
+; CHECK-NEXT:    [[Y_NEG:%.*]] = sub nsw i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  %mul = mul nsw i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool_nuw_nsw(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool_nuw_nsw(
+; CHECK-NEXT:    [[Y_NEG:%.*]] = sub nsw i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  %mul = mul nuw nsw i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool_commuted(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[Y:%.*]], -2
+; CHECK-NEXT:    [[YY_NEG1:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[YY_NEG1]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %yy = xor i32 %y, 1
+  %sext = sext i1 %x to i32
+  %mul = mul i32 %yy, %sext
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_nonbool(i2 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_nonbool(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i2 [[X:%.*]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[SEXT]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i2 %x to i32
+  %mul = mul i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_multiuse(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_multiuse(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 [[X:%.*]] to i32
+; CHECK-NEXT:    tail call void @use(i32 [[SEXT]])
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[SEXT]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  tail call void @use(i32 %sext)
+  %mul = mul i32 %sext, %y
+  ret i32 %mul
+}
diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll
index f277d13eee930..98b5d98041560 100644
--- a/llvm/test/Transforms/InstCombine/not.ll
+++ b/llvm/test/Transforms/InstCombine/not.ll
@@ -3,6 +3,8 @@
 
 declare void @use1(i1)
 declare void @use8(i8)
+declare void @f1()
+declare void @f2()
 
 define i32 @test1(i32 %A) {
 ; CHECK-LABEL: @test1(
@@ -858,3 +860,204 @@ define i32 @test_zext(i32 %a, i32 %b){
   %not = xor i32 %add, -1
   ret i32 %not
 }
+
+define void @test_invert_demorgan_or(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @test_invert_demorgan_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[B1:%.*]], 0
+; CHECK-NEXT:    [[OR_NOT1:%.*]] = and i1 [[CMP2]], [[CMP3]]
+; CHECK-NEXT:    [[MERGE:%.*]] = and i1 [[OR_NOT1]], [[COND:%.*]]
+; CHECK-NEXT:    br i1 [[MERGE]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @f1()
+; CHECK-NEXT:    unreachable
+; CHECK:       if.else:
+; CHECK-NEXT:    call void @f2()
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %cmp1 = icmp eq i32 %a, 0
+  %cmp2 = icmp ne i32 %b, 0
+  %or = or i1 %cmp1, %cmp2
+  %not = xor i1 %cond, true
+  %merge = or i1 %not, %or
+  br i1 %merge, label %if.then, label %if.else
+if.then:
+  call void @f1()
+  unreachable
+if.else:
+  call void @f2()
+  unreachable
+}
+
+define i1 @test_invert_demorgan_or2(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: @test_invert_demorgan_or2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[A:%.*]], 24
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[B:%.*]], 60
+; CHECK-NEXT:    [[OR1_NOT1:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i64 [[C:%.*]], 60
+; CHECK-NEXT:    [[NOT:%.*]] = and i1 [[OR1_NOT1]], [[CMP3]]
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp1 = icmp ugt i64 %a, 23
+  %cmp2 = icmp ugt i64 %b, 59
+  %or1 = or i1 %cmp1, %cmp2
+  %cmp3 = icmp ugt i64 %c, 59
+  %or2 = or i1 %or1, %cmp3
+  %not = xor i1 %or2, true
+  ret i1 %not
+}
+
+define i1 @test_invert_demorgan_or3(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_invert_demorgan_or3(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[A:%.*]], 178206
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[B:%.*]], -196608
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[TMP1]], -1506
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[B]], -917760
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i32 [[TMP2]], -716213
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[B]], -1114112
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp ult i32 [[TMP3]], -196112
+; CHECK-NEXT:    [[OR1_NOT2:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[OR2_NOT1:%.*]] = and i1 [[OR1_NOT2]], [[CMP3]]
+; CHECK-NEXT:    [[NOT:%.*]] = and i1 [[OR2_NOT1]], [[CMP4]]
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp1 = icmp eq i32 %a, 178206
+  %v1 = add i32 %b, -195102
+  %cmp2 = icmp ult i32 %v1, 1506
+  %v2 = add i32 %b, -201547
+  %cmp3 = icmp ult i32 %v2, 716213
+  %v3 = add i32 %b, -918000
+  %cmp4 = icmp ult i32 %v3, 196112
+  %or1 = or i1 %cmp1, %cmp2
+  %or2 = or i1 %or1, %cmp3
+  %or3 = or i1 %or2, %cmp4
+  %not = xor i1 %or3, true
+  ret i1 %not
+}
+
+define i1 @test_invert_demorgan_logical_or(i64 %x, i64 %y) {
+; CHECK-LABEL: @test_invert_demorgan_logical_or(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[X:%.*]], 27
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i64 [[Y:%.*]], 0
+; CHECK-NEXT:    [[SEL_NOT1:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 false
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    [[NOT:%.*]] = and i1 [[CMP3]], [[SEL_NOT1]]
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp1 = icmp eq i64 %x, 27
+  %cmp2 = icmp eq i64 %y, 0
+  %sel = select i1 %cmp1, i1 true, i1 %cmp2
+  %cmp3 = icmp eq i64 %x, 0
+  %or = or i1 %cmp3, %sel
+  %not = xor i1 %or, true
+  ret i1 %not
+}
+
+define i1 @test_invert_demorgan_and(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @test_invert_demorgan_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[B1:%.*]], 0
+; CHECK-NEXT:    [[AND_NOT1:%.*]] = or i1 [[CMP2]], [[CMP3]]
+; CHECK-NEXT:    [[MERGE:%.*]] = or i1 [[AND_NOT1]], [[COND:%.*]]
+; CHECK-NEXT:    br i1 [[MERGE]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @f1()
+; CHECK-NEXT:    unreachable
+; CHECK:       if.else:
+; CHECK-NEXT:    call void @f2()
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %cmp1 = icmp eq i32 %a, 0
+  %cmp2 = icmp ne i32 %b, 0
+  %and = and i1 %cmp1, %cmp2
+  %not = xor i1 %cond, true
+  %merge = and i1 %not, %and
+  br i1 %merge, label %if.then, label %if.else
+if.then:
+  call void @f1()
+  unreachable
+if.else:
+  call void @f2()
+  unreachable
+}
+
+define i64 @test_invert_demorgan_and2(i64 %x) {
+; CHECK-LABEL: @test_invert_demorgan_and2(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 0, [[X:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = or i64 [[TMP1]], -9223372036854775808
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add = add i64 %x, 9223372036854775807
+  %and = and i64 %add, 9223372036854775807
+  %sub = xor i64 %and, -1
+  ret i64 %sub
+}
+
+define i1 @test_invert_demorgan_and3(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_invert_demorgan_and3(
+; CHECK-NEXT:    [[ADD:%.*]] = sub i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[ADD]], 4095
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 4095
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %not = xor i32 %a, -1
+  %add = add i32 %b, %not
+  %and = and i32 %add, 4095
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+define i1 @test_invert_demorgan_logical_and(i64 %x, i64 %y) {
+; CHECK-LABEL: @test_invert_demorgan_logical_and(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[X:%.*]], 27
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i64 [[Y:%.*]], 0
+; CHECK-NEXT:    [[SEL_NOT1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP2]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    [[NOT:%.*]] = and i1 [[CMP3]], [[SEL_NOT1]]
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp1 = icmp eq i64 %x, 27
+  %cmp2 = icmp eq i64 %y, 0
+  %sel = select i1 %cmp1, i1 %cmp2, i1 false
+  %cmp3 = icmp eq i64 %x, 0
+  %or = or i1 %cmp3, %sel
+  %not = xor i1 %or, true
+  ret i1 %not
+}
+
+define i1 @test_invert_demorgan_and_multiuse(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @test_invert_demorgan_and_multiuse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    call void @use1(i1 [[CMP1]])
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[NOT:%.*]] = xor i1 [[COND:%.*]], true
+; CHECK-NEXT:    [[TMP0:%.*]] = and i1 [[CMP2]], [[NOT]]
+; CHECK-NEXT:    [[MERGE:%.*]] = and i1 [[TMP0]], [[CMP1]]
+; CHECK-NEXT:    br i1 [[MERGE]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @f1()
+; CHECK-NEXT:    unreachable
+; CHECK:       if.else:
+; CHECK-NEXT:    call void @f2()
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %cmp1 = icmp eq i32 %a, 0
+  call void @use1(i1 %cmp1)
+  %cmp2 = icmp ne i32 %b, 0
+  %and = and i1 %cmp1, %cmp2
+  %not = xor i1 %cond, true
+  %merge = and i1 %not, %and
+  br i1 %merge, label %if.then, label %if.else
+if.then:
+  call void @f1()
+  unreachable
+if.else:
+  call void @f2()
+  unreachable
+}
diff --git a/llvm/test/Transforms/InstCombine/pow-1.ll b/llvm/test/Transforms/InstCombine/pow-1.ll
index 7a72902126f84..5024d152bddf5 100644
--- a/llvm/test/Transforms/InstCombine/pow-1.ll
+++ b/llvm/test/Transforms/InstCombine/pow-1.ll
@@ -870,10 +870,30 @@ define float @test_simplify18(float %x) {
 ; CHECK-EXP10-NEXT:    [[__EXP10F:%.*]] = call float @__exp10f(float [[X]])
 ; CHECK-EXP10-NEXT:    ret float [[__EXP10F]]
 ;
-; CHECK-NO-EXP10-LABEL: define float @test_simplify18(
-; CHECK-NO-EXP10-SAME: float [[X:%.*]]) {
-; CHECK-NO-EXP10-NEXT:    [[RETVAL:%.*]] = call float @powf(float 1.000000e+01, float [[X]])
-; CHECK-NO-EXP10-NEXT:    ret float [[RETVAL]]
+; VC32-LABEL: define float @test_simplify18(
+; VC32-SAME: float [[X:%.*]]) {
+; VC32-NEXT:    [[RETVAL:%.*]] = call float @powf(float 1.000000e+01, float [[X]])
+; VC32-NEXT:    ret float [[RETVAL]]
+;
+; VC51-LABEL: define float @test_simplify18(
+; VC51-SAME: float [[X:%.*]]) {
+; VC51-NEXT:    [[RETVAL:%.*]] = call float @powf(float 1.000000e+01, float [[X]])
+; VC51-NEXT:    ret float [[RETVAL]]
+;
+; VC64-LABEL: define float @test_simplify18(
+; VC64-SAME: float [[X:%.*]]) {
+; VC64-NEXT:    [[EXP10F:%.*]] = call float @exp10f(float [[X]])
+; VC64-NEXT:    ret float [[EXP10F]]
+;
+; VC83-LABEL: define float @test_simplify18(
+; VC83-SAME: float [[X:%.*]]) {
+; VC83-NEXT:    [[EXP10F:%.*]] = call float @exp10f(float [[X]])
+; VC83-NEXT:    ret float [[EXP10F]]
+;
+; NOLIB-LABEL: define float @test_simplify18(
+; NOLIB-SAME: float [[X:%.*]]) {
+; NOLIB-NEXT:    [[RETVAL:%.*]] = call float @powf(float 1.000000e+01, float [[X]])
+; NOLIB-NEXT:    ret float [[RETVAL]]
 ;
   %retval = call float @powf(float 10.0, float %x)
   ret float %retval
@@ -885,11 +905,28 @@ define double @test_simplify19(double %x) {
 ; CHECK-EXP10-NEXT:    [[__EXP10:%.*]] = call double @__exp10(double [[X]])
 ; CHECK-EXP10-NEXT:    ret double [[__EXP10]]
 ;
-; CHECK-NO-EXP10-LABEL: define double @test_simplify19(
-; CHECK-NO-EXP10-SAME: double [[X:%.*]]) {
-; CHECK-NO-EXP10-NEXT:    [[RETVAL:%.*]] = call double @pow(double 1.000000e+01, double [[X]])
-; CHECK-NO-EXP10-NEXT:    ret double [[RETVAL]]
+; VC32-LABEL: define double @test_simplify19(
+; VC32-SAME: double [[X:%.*]]) {
+; VC32-NEXT:    [[EXP10:%.*]] = call double @exp10(double [[X]])
+; VC32-NEXT:    ret double [[EXP10]]
+;
+; VC19-LABEL: define double @test_simplify19(
+; VC19-SAME: double [[X:%.*]]) {
+; VC19-NEXT:    [[EXP10:%.*]] = call double @exp10(double [[X]])
+; VC19-NEXT:    ret double [[EXP10]]
+;
+; VC64-LABEL: define double @test_simplify19(
+; VC64-SAME: double [[X:%.*]]) {
+; VC64-NEXT:    [[EXP10:%.*]] = call double @exp10(double [[X]])
+; VC64-NEXT:    ret double [[EXP10]]
+;
+; NOLIB-LABEL: define double @test_simplify19(
+; NOLIB-SAME: double [[X:%.*]]) {
+; NOLIB-NEXT:    [[RETVAL:%.*]] = call double @pow(double 1.000000e+01, double [[X]])
+; NOLIB-NEXT:    ret double [[RETVAL]]
 ;
   %retval = call double @pow(double 10.0, double %x)
   ret double %retval
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-NO-EXP10: {{.*}}
diff --git a/llvm/test/Transforms/InstCombine/pow-exp.ll b/llvm/test/Transforms/InstCombine/pow-exp.ll
index 9d91ad2402eb1..177dd9885827f 100644
--- a/llvm/test/Transforms/InstCombine/pow-exp.ll
+++ b/llvm/test/Transforms/InstCombine/pow-exp.ll
@@ -136,12 +136,11 @@ define fp128 @powl_exp2l_not_fast(fp128 %x, fp128 %y) {
   ret fp128 %pow
 }
 
-; TODO: exp10() is not widely enabled by many targets yet.
-
 define float @powf_exp10f(float %x, float %y) {
-; CHECK-LABEL: @powf_exp10f(
-; CHECK-NEXT:    [[CALL:%.*]] = call fast float @exp10f(float [[X:%.*]]) #[[ATTR1:[0-9]+]]
-; CHECK-NEXT:    [[POW:%.*]] = call fast float @llvm.pow.f32(float [[CALL]], float [[Y:%.*]])
+; CHECK-LABEL: define float @powf_exp10f(
+; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]]) {
+; CHECK-NEXT:    [[CALL:%.*]] = call fast float @exp10f(float [[X]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    [[POW:%.*]] = call fast float @llvm.pow.f32(float [[CALL]], float [[Y]])
 ; CHECK-NEXT:    ret float [[POW]]
 ;
   %call = call fast float @exp10f(float %x) nounwind readnone
@@ -150,9 +149,10 @@ define float @powf_exp10f(float %x, float %y) {
 }
 
 define double @pow_exp10(double %x, double %y) {
-; CHECK-LABEL: @pow_exp10(
-; CHECK-NEXT:    [[CALL:%.*]] = call fast double @exp10(double [[X:%.*]]) #[[ATTR1]]
-; CHECK-NEXT:    [[POW:%.*]] = call fast double @llvm.pow.f64(double [[CALL]], double [[Y:%.*]])
+; CHECK-LABEL: define double @pow_exp10(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:    [[CALL:%.*]] = call fast double @exp10(double [[X]]) #[[ATTR1]]
+; CHECK-NEXT:    [[POW:%.*]] = call fast double @llvm.pow.f64(double [[CALL]], double [[Y]])
 ; CHECK-NEXT:    ret double [[POW]]
 ;
   %call = call fast double @exp10(double %x) nounwind readnone
@@ -161,9 +161,10 @@ define double @pow_exp10(double %x, double %y) {
 }
 
 define fp128 @pow_exp10l(fp128 %x, fp128 %y) {
-; CHECK-LABEL: @pow_exp10l(
-; CHECK-NEXT:    [[CALL:%.*]] = call fast fp128 @exp10l(fp128 [[X:%.*]]) #[[ATTR1]]
-; CHECK-NEXT:    [[POW:%.*]] = call fast fp128 @llvm.pow.f128(fp128 [[CALL]], fp128 [[Y:%.*]])
+; CHECK-LABEL: define fp128 @pow_exp10l(
+; CHECK-SAME: fp128 [[X:%.*]], fp128 [[Y:%.*]]) {
+; CHECK-NEXT:    [[CALL:%.*]] = call fast fp128 @exp10l(fp128 [[X]]) #[[ATTR1]]
+; CHECK-NEXT:    [[POW:%.*]] = call fast fp128 @llvm.pow.f128(fp128 [[CALL]], fp128 [[Y]])
 ; CHECK-NEXT:    ret fp128 [[POW]]
 ;
   %call = call fast fp128 @exp10l(fp128 %x) nounwind readnone
@@ -255,10 +256,10 @@ define double @pow_ok_base3(double %e) {
 }
 
 define double @pow_ok_ten_base(double %e) {
-; CHECK-LABEL: @pow_ok_ten_base(
-; CHECK-NEXT:    [[MUL:%.*]] = fmul nnan ninf afn double [[E:%.*]], 0x400A934F{{.*}}
-; CHECK-NEXT:    [[EXP2:%.*]] = tail call nnan ninf afn double @exp2(double [[MUL]])
-; CHECK-NEXT:    ret double [[EXP2]]
+; CHECK-LABEL: define double @pow_ok_ten_base(
+; CHECK-SAME: double [[E:%.*]]) {
+; CHECK-NEXT:    [[EXP10:%.*]] = tail call nnan ninf afn double @exp10(double [[E]])
+; CHECK-NEXT:    ret double [[EXP10]]
 ;
   %call = tail call afn nnan ninf double @pow(double 1.000000e+01, double %e)
   ret double %call
@@ -305,10 +306,10 @@ define float @powf_ok_base3(float %e) {
 }
 
 define float @powf_ok_ten_base(float %e) {
-; CHECK-LABEL: @powf_ok_ten_base(
-; CHECK-NEXT:    [[MUL:%.*]] = fmul nnan ninf afn float [[E:%.*]], 0x400A934{{.*}}
-; CHECK-NEXT:    [[EXP2F:%.*]] = tail call nnan ninf afn float @exp2f(float [[MUL]])
-; CHECK-NEXT:    ret float [[EXP2F]]
+; CHECK-LABEL: define float @powf_ok_ten_base(
+; CHECK-SAME: float [[E:%.*]]) {
+; CHECK-NEXT:    [[EXP10F:%.*]] = tail call nnan ninf afn float @exp10f(float [[E]])
+; CHECK-NEXT:    ret float [[EXP10F]]
 ;
   %call = tail call afn nnan ninf float @powf(float 1.000000e+01, float %e)
   ret float %call
diff --git a/llvm/test/Transforms/InstCombine/powi.ll b/llvm/test/Transforms/InstCombine/powi.ll
index 89efbb6f45361..43e34c889106e 100644
--- a/llvm/test/Transforms/InstCombine/powi.ll
+++ b/llvm/test/Transforms/InstCombine/powi.ll
@@ -125,22 +125,55 @@ entry:
   ret double %mul
 }
 
-define double @powi_fmul_powi_no_reassoc(double %x, i32 %y, i32 %z) {
-; CHECK-LABEL: @powi_fmul_powi_no_reassoc(
+; Negative test: Missing reassoc flag on fmul
+define double @powi_fmul_powi_no_reassoc1(double %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @powi_fmul_powi_no_reassoc1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
-; CHECK-NEXT:    [[P2:%.*]] = tail call double @llvm.powi.f64.i32(double [[X]], i32 [[Z:%.*]])
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P2:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X]], i32 [[Z:%.*]])
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[P2]], [[P1]]
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
   %mul = fmul double %p2, %p1
   ret double %mul
 }
 
+; Negative test: Missing reassoc flag on 2nd operand
+define double @powi_fmul_powi_no_reassoc2(double %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @powi_fmul_powi_no_reassoc2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P2:%.*]] = tail call double @llvm.powi.f64.i32(double [[X]], i32 [[Z:%.*]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P2]], [[P1]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+entry:
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %mul = fmul reassoc double %p2, %p1
+  ret double %mul
+}
 
+; Negative test: Missing reassoc flag on 1st operand
+define double @powi_fmul_powi_no_reassoc3(double %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @powi_fmul_powi_no_reassoc3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P2:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X]], i32 [[Z:%.*]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P2]], [[P1]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+entry:
+  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
+  %mul = fmul reassoc double %p2, %p1
+  ret double %mul
+}
+
+; All of the fmul and its operands should have the reassoc flags
 define double @powi_fmul_powi(double %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @powi_fmul_powi(
 ; CHECK-NEXT:  entry:
@@ -149,8 +182,8 @@ define double @powi_fmul_powi(double %x, i32 %y, i32 %z) {
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
   %mul = fmul reassoc double %p2, %p1
   ret double %mul
 }
@@ -163,8 +196,8 @@ define double @powi_fmul_powi_fast_on_fmul(double %x, i32 %y, i32 %z) {
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p1 = tail call fast double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call fast double @llvm.powi.f64.i32(double %x, i32 %z)
   %mul = fmul fast double %p2, %p1
   ret double %mul
 }
@@ -192,8 +225,23 @@ define double @powi_fmul_powi_same_power(double %x, i32 %y, i32 %z) {
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %mul = fmul reassoc double %p2, %p1
+  ret double %mul
+}
+
+define double @powi_fmul_powi_different_integer_types(double %x, i32 %y, i16 %z) {
+; CHECK-LABEL: @powi_fmul_powi_different_integer_types(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P2:%.*]] = tail call reassoc double @llvm.powi.f64.i16(double [[X]], i16 [[Z:%.*]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P2]], [[P1]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+entry:
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i16(double %x, i16 %z)
   %mul = fmul reassoc double %p2, %p1
   ret double %mul
 }
@@ -201,16 +249,16 @@ entry:
 define double @powi_fmul_powi_use_first(double %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @powi_fmul_powi_use_first(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 ; CHECK-NEXT:    tail call void @use(double [[P1]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[Y]], [[Z:%.*]]
 ; CHECK-NEXT:    [[MUL:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[X]], i32 [[TMP0]])
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
   tail call void @use(double %p1)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
   %mul = fmul reassoc double %p1, %p2
   ret double %mul
 }
@@ -218,16 +266,16 @@ entry:
 define double @powi_fmul_powi_use_second(double %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @powi_fmul_powi_use_second(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Z:%.*]])
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Z:%.*]])
 ; CHECK-NEXT:    tail call void @use(double [[P1]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[Y:%.*]], [[Z]]
 ; CHECK-NEXT:    [[MUL:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[X]], i32 [[TMP0]])
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
   tail call void @use(double %p1)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
   %mul = fmul reassoc double %p2, %p1
   ret double %mul
 }
@@ -333,11 +381,60 @@ define double @fdiv_pow_powi_negative(double %x) {
 ; Negative test: The 2nd powi argument is a variable
 define double @fdiv_pow_powi_negative_variable(double %x, i32 %y) {
 ; CHECK-LABEL: @fdiv_pow_powi_negative_variable(
-; CHECK-NEXT:    [[P1:%.*]] = call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P1:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc nnan double [[P1]], [[X]]
 ; CHECK-NEXT:    ret double [[DIV]]
 ;
-  %p1 = call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p1 = call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
   %div = fdiv reassoc nnan double %p1, %x
   ret double %div
 }
+
+; powi(X, Y) * X --> powi(X, Y+1)
+define double @powi_fmul_powi_x(double noundef %x) {
+; CHECK-LABEL: @powi_fmul_powi_x(
+; CHECK-NEXT:    [[MUL:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 4)
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 3)
+  %mul = fmul reassoc double %p1, %x
+  ret double %mul
+}
+
+; Negative test: Multi-use
+define double @powi_fmul_powi_x_multi_use(double noundef %x) {
+; CHECK-LABEL: @powi_fmul_powi_x_multi_use(
+; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 3)
+; CHECK-NEXT:    tail call void @use(double [[P1]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P1]], [[X]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 3)
+  tail call void @use(double %p1)
+  %mul = fmul reassoc double %p1, %x
+  ret double %mul
+}
+
+; Negative test: Miss fmf flag
+define double @powi_fmul_powi_x_missing_reassoc(double noundef %x) {
+; CHECK-LABEL: @powi_fmul_powi_x_missing_reassoc(
+; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 3)
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[P1]], [[X]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 3)
+  %mul = fmul double %p1, %x
+  ret double %mul
+}
+
+; Negative test: overflow
+define double @powi_fmul_powi_x_overflow(double noundef %x) {
+; CHECK-LABEL: @powi_fmul_powi_x_overflow(
+; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 2147483647)
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P1]], [[X]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 2147483647) ; INT_MAX
+  %mul = fmul reassoc double %p1, %x
+  ret double %mul
+}
diff --git a/llvm/test/Transforms/InstCombine/pr63791.ll b/llvm/test/Transforms/InstCombine/pr63791.ll
index 78cc1130fb33f..73a559f989261 100644
--- a/llvm/test/Transforms/InstCombine/pr63791.ll
+++ b/llvm/test/Transforms/InstCombine/pr63791.ll
@@ -15,7 +15,7 @@ define void @y() {
 ; CHECK-NEXT:    store i1 true, ptr poison, align 1
 ; CHECK-NEXT:    br i1 poison, label [[FOR_COND_I]], label [[FOR_COND5_PREHEADER_I]]
 ; CHECK:       for.cond5.preheader.i:
-; CHECK-NEXT:    br i1 false, label [[FOR_INC19_I:%.*]], label [[FOR_COND1_LOOPEXIT_I:%.*]]
+; CHECK-NEXT:    br i1 true, label [[FOR_COND1_LOOPEXIT_I:%.*]], label [[FOR_INC19_I:%.*]]
 ; CHECK:       for.inc19.i:
 ; CHECK-NEXT:    br i1 poison, label [[FOR_INC19_I]], label [[FOR_COND1_LOOPEXIT_I]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
index 496854c7d731a..77ff16a8b2e3d 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
@@ -1,8 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
-define float @select_fadd(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fadd(
+define float @select_maybe_nan_fadd(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_maybe_nan_fadd(
+; CHECK-NEXT:    [[C:%.*]] = fadd float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    ret float [[D]]
+;
+  %C = fadd float %A, %B
+  %D = select i1 %cond, float %C, float %A
+  ret float %D
+}
+
+define float @select_fpclass_fadd(i1 %cond, float nofpclass(nan) %A, float %B) {
+; CHECK-LABEL: @select_fpclass_fadd(
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fadd float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
@@ -12,41 +23,52 @@ define float @select_fadd(i1 %cond, float %A, float %B) {
   ret float %D
 }
 
-define float @select_fadd_swapped(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fadd_swapped(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
+define float @select_nnan_fadd(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fadd(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fadd float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fadd_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fadd_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
+define float @select_nnan_fadd_swapped(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fadd_swapped(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fadd float [[C]], [[A:%.*]]
+; CHECK-NEXT:    ret float [[D]]
+;
+  %C = fadd float %A, %B
+  %D = select nnan i1 %cond, float %A, float %C
+  ret float %D
+}
+
+define float @select_nnan_fadd_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fadd_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fadd fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd fast float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fadd_swapped_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fadd_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
+define float @select_nnan_fadd_swapped_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fadd_swapped_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fadd fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd fast float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define <4 x float> @select_nsz_fadd_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fadd_v4f32(
+define <4 x float> @select_nnan_nsz_fadd_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fadd_v4f32(
 ; CHECK-NEXT:    [[C:%.*]] = select nnan nsz <4 x i1> [[COND:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer
 ; CHECK-NEXT:    [[D:%.*]] = fadd nnan nsz <4 x float> [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret <4 x float> [[D]]
@@ -56,202 +78,202 @@ define <4 x float> @select_nsz_fadd_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x f
   ret <4 x float> %D
 }
 
-define <vscale x 4 x float> @select_nsz_fadd_nxv4f32(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fadd_nxv4f32(
+define <vscale x 4 x float> @select_nnan_nsz_fadd_nxv4f32(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fadd_nxv4f32(
 ; CHECK-NEXT:    [[C:%.*]] = select nnan nsz <vscale x 4 x i1> [[COND:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> zeroinitializer
 ; CHECK-NEXT:    [[D:%.*]] = fadd nnan nsz <vscale x 4 x float> [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[D]]
 ;
-  %C = fadd nsz nnan <vscale x 4 x float> %A, %B
-  %D = select nsz nnan <vscale x 4 x i1> %cond, <vscale x 4 x float> %C, <vscale x 4 x float> %A
+  %C = fadd nnan nsz <vscale x 4 x float> %A, %B
+  %D = select nnan nsz <vscale x 4 x i1> %cond, <vscale x 4 x float> %C, <vscale x 4 x float> %A
   ret <vscale x 4 x float> %D
 }
 
-define <vscale x 4 x float> @select_nsz_fadd_nxv4f32_swapops(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fadd_nxv4f32_swapops(
+define <vscale x 4 x float> @select_nnan_nsz_fadd_nxv4f32_swapops(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fadd_nxv4f32_swapops(
 ; CHECK-NEXT:    [[C:%.*]] = select fast <vscale x 4 x i1> [[COND:%.*]], <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fadd fast <vscale x 4 x float> [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[D]]
 ;
   %C = fadd fast <vscale x 4 x float> %A, %B
-  %D = select fast <vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %C
+  %D = select nnan fast <vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %C
   ret <vscale x 4 x float> %D
 }
 
-define float @select_fmul(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fmul(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+define float @select_nnan_fmul(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fmul(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fmul float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fmul_swapped(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fmul_swapped(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+define float @select_nnan_fmul_swapped(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fmul_swapped(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fmul float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define float @select_fmul_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fmul_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+define float @select_nnan_fmul_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fmul_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fmul fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul fast float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fmul_swapped_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fmul_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+define float @select_nnan_fmul_swapped_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fmul_swapped_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fmul fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul fast float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define float @select_fsub(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
+define float @select_nnan_fsub(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fsub float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fsub_swapped(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub_swapped(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
+define float @select_nnan_fsub_swapped(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub_swapped(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fsub float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define float @select_fsub_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
+define float @select_nnan_fsub_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fsub fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub fast float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fsub_swapped_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
+define float @select_nnan_fsub_swapped_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub_swapped_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fsub fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub fast float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define <4 x float> @select_nsz_fsub_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fsub_v4f32(
-; CHECK-NEXT:    [[C:%.*]] = select nsz <4 x i1> [[COND:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer
+define <4 x float> @select_nnan_nsz_fsub_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fsub_v4f32(
+; CHECK-NEXT:    [[C:%.*]] = select nnan nsz <4 x i1> [[COND:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer
 ; CHECK-NEXT:    [[D:%.*]] = fsub <4 x float> [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret <4 x float> [[D]]
 ;
   %C = fsub <4 x float> %A, %B
-  %D = select nsz <4 x i1> %cond, <4 x float> %C, <4 x float> %A
+  %D = select nnan nsz <4 x i1> %cond, <4 x float> %C, <4 x float> %A
   ret <4 x float> %D
 }
 
-define <vscale x 4 x float> @select_nsz_fsub_nxv4f32(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fsub_nxv4f32(
-; CHECK-NEXT:    [[C:%.*]] = select nsz <vscale x 4 x i1> [[COND:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> zeroinitializer
+define <vscale x 4 x float> @select_nnan_nsz_fsub_nxv4f32(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fsub_nxv4f32(
+; CHECK-NEXT:    [[C:%.*]] = select nnan nsz <vscale x 4 x i1> [[COND:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> zeroinitializer
 ; CHECK-NEXT:    [[D:%.*]] = fsub <vscale x 4 x float> [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[D]]
 ;
   %C = fsub <vscale x 4 x float> %A, %B
-  %D = select nsz <vscale x 4 x i1> %cond, <vscale x 4 x float> %C, <vscale x 4 x float> %A
+  %D = select nnan nsz <vscale x 4 x i1> %cond, <vscale x 4 x float> %C, <vscale x 4 x float> %A
   ret <vscale x 4 x float> %D
 }
 
 ; 'fsub' can only fold on the amount subtracted.
-define float @select_fsub_invalid(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub_invalid(
+define float @select_nnan_fsub_invalid(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub_invalid(
 ; CHECK-NEXT:    [[C:%.*]] = fsub float [[B:%.*]], [[A:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[D:%.*]] = select nnan i1 [[COND:%.*]], float [[C]], float [[A]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub float %B, %A
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fdiv(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+define float @select_nnan_fdiv(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fdiv float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fdiv_swapped(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv_swapped(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+define float @select_nnan_fdiv_swapped(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv_swapped(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fdiv float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define float @select_fdiv_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+define float @select_nnan_fdiv_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fdiv fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv fast float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fdiv_swapped_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+define float @select_nnan_fdiv_swapped_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv_swapped_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fdiv fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv fast float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
 ; 'fdiv' can only fold on the divisor amount.
-define float @select_fdiv_invalid(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv_invalid(
+define float @select_nnan_fdiv_invalid(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv_invalid(
 ; CHECK-NEXT:    [[C:%.*]] = fdiv float [[B:%.*]], [[A:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[D:%.*]] = select nnan i1 [[COND:%.*]], float [[C]], float [[A]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv float %B, %A
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index a84904106eced..278cabdff9ed3 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -2925,10 +2925,8 @@ define i8 @select_replacement_loop3(i32 noundef %x) {
 
 define i16 @select_replacement_loop4(i16 noundef %p_12) {
 ; CHECK-LABEL: @select_replacement_loop4(
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i16 [[P_12:%.*]], 2
-; CHECK-NEXT:    [[AND1:%.*]] = and i16 [[P_12]], 1
-; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[CMP1]], i16 [[AND1]], i16 0
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i16 [[AND2]], [[P_12]]
+; CHECK-NEXT:    [[AND1:%.*]] = and i16 [[P_12:%.*]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i16 [[P_12]], 2
 ; CHECK-NEXT:    [[AND3:%.*]] = select i1 [[CMP2]], i16 [[AND1]], i16 0
 ; CHECK-NEXT:    ret i16 [[AND3]]
 ;
@@ -3708,3 +3706,59 @@ define i32 @src_select_xxory_eq0_xorxy_y(i32 %x, i32 %y) {
   %cond = select i1 %xor0, i32 %xor, i32 %y
   ret i32 %cond
 }
+
+define i32 @sequence_select_with_same_cond_false(i1 %c1, i1 %c2){
+; CHECK-LABEL: @sequence_select_with_same_cond_false(
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C1:%.*]], i32 23, i32 45
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[C2:%.*]], i32 666, i32 [[S1]]
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C1]], i32 789, i32 [[S2]]
+; CHECK-NEXT:    ret i32 [[S3]]
+;
+  %s1 = select i1 %c1, i32 23, i32 45
+  %s2 = select i1 %c2, i32 666, i32 %s1
+  %s3 = select i1 %c1, i32 789, i32 %s2
+  ret i32 %s3
+}
+
+define i32 @sequence_select_with_same_cond_true(i1 %c1, i1 %c2){
+; CHECK-LABEL: @sequence_select_with_same_cond_true(
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C1:%.*]], i32 45, i32 23
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[C2:%.*]], i32 [[S1]], i32 666
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C1]], i32 [[S2]], i32 789
+; CHECK-NEXT:    ret i32 [[S3]]
+;
+  %s1 = select i1 %c1, i32 45, i32 23
+  %s2 = select i1 %c2, i32 %s1, i32 666
+  %s3 = select i1 %c1, i32 %s2, i32 789
+  ret i32 %s3
+}
+
+define double @sequence_select_with_same_cond_double(double %a, i1 %c1, i1 %c2, double %r1, double %r2){
+; CHECK-LABEL: @sequence_select_with_same_cond_double(
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C1:%.*]], double 1.000000e+00, double 0.000000e+00
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[C2:%.*]], double [[S1]], double 2.000000e+00
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C1]], double [[S2]], double 3.000000e+00
+; CHECK-NEXT:    ret double [[S3]]
+;
+  %s1 = select i1 %c1, double 1.0, double 0.0
+  %s2 = select i1 %c2, double %s1, double 2.0
+  %s3 = select i1 %c1, double %s2, double 3.0
+  ret double %s3
+}
+
+declare void @use32(i32)
+
+define i32 @sequence_select_with_same_cond_extra_use(i1 %c1, i1 %c2){
+; CHECK-LABEL: @sequence_select_with_same_cond_extra_use(
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C1:%.*]], i32 23, i32 45
+; CHECK-NEXT:    call void @use32(i32 [[S1]])
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[C2:%.*]], i32 666, i32 [[S1]]
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C1]], i32 789, i32 [[S2]]
+; CHECK-NEXT:    ret i32 [[S3]]
+;
+  %s1 = select i1 %c1, i32 23, i32 45
+  call void @use32(i32 %s1)
+  %s2 = select i1 %c2, i32 666, i32 %s1
+  %s3 = select i1 %c1, i32 789, i32 %s2
+  ret i32 %s3
+}
diff --git a/llvm/test/Transforms/InstCombine/select_meta.ll b/llvm/test/Transforms/InstCombine/select_meta.ll
index cd133101736cf..aa794e82e0fdc 100644
--- a/llvm/test/Transforms/InstCombine/select_meta.ll
+++ b/llvm/test/Transforms/InstCombine/select_meta.ll
@@ -360,23 +360,23 @@ define i128 @select_ashr(i1 %cond, i128 %x, i128 %y) {
 
 define double @select_fmul(i1 %cond, double %x, double %y) {
 ; CHECK-LABEL: @select_fmul(
-; CHECK-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], double [[Y:%.*]], double 1.000000e+00, !prof [[PROF0]], !unpredictable [[META2]]
+; CHECK-NEXT:    [[OP:%.*]] = select nnan i1 [[COND:%.*]], double [[Y:%.*]], double 1.000000e+00, !prof [[PROF0]], !unpredictable [[META2]]
 ; CHECK-NEXT:    [[RET:%.*]] = fmul double [[OP]], [[X:%.*]]
 ; CHECK-NEXT:    ret double [[RET]]
 ;
   %op = fmul double %x, %y
-  %ret = select i1 %cond, double %op, double %x, !prof !1, !unpredictable !3
+  %ret = select nnan i1 %cond, double %op, double %x, !prof !1, !unpredictable !3
   ret double %ret
 }
 
 define <2 x float> @select_fdiv(i1 %cond, <2 x float> %x, <2 x float> %y) {
 ; CHECK-LABEL: @select_fdiv(
-; CHECK-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], <2 x float> [[Y:%.*]], <2 x float> <float 1.000000e+00, float 1.000000e+00>, !prof [[PROF0]], !unpredictable [[META2]]
+; CHECK-NEXT:    [[OP:%.*]] = select nnan i1 [[COND:%.*]], <2 x float> [[Y:%.*]], <2 x float> <float 1.000000e+00, float 1.000000e+00>, !prof [[PROF0]], !unpredictable [[META2]]
 ; CHECK-NEXT:    [[RET:%.*]] = fdiv <2 x float> [[X:%.*]], [[OP]]
 ; CHECK-NEXT:    ret <2 x float> [[RET]]
 ;
   %op = fdiv <2 x float> %x, %y
-  %ret = select i1 %cond, <2 x float> %op, <2 x float> %x, !prof !1, !unpredictable !3
+  %ret = select nnan i1 %cond, <2 x float> %op, <2 x float> %x, !prof !1, !unpredictable !3
   ret <2 x float> %ret
 }
 
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index 62f32c2868371..bef7fc81a7d1f 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -1751,12 +1751,11 @@ define void @ashr_out_of_range_1(ptr %A) {
 ; CHECK-NEXT:    [[L:%.*]] = load i177, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[L_FROZEN:%.*]] = freeze i177 [[L]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i177 [[L_FROZEN]], -1
-; CHECK-NEXT:    [[B:%.*]] = select i1 [[TMP1]], i177 0, i177 [[L_FROZEN]]
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i177 [[B]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i177 [[L_FROZEN]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i177, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[G11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 -24
-; CHECK-NEXT:    [[C17:%.*]] = icmp sgt i177 [[B]], [[L_FROZEN]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[C17]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP1]] to i64
 ; CHECK-NEXT:    [[G62:%.*]] = getelementptr i177, ptr [[G11]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i177 [[L_FROZEN]], -1
 ; CHECK-NEXT:    [[B28:%.*]] = select i1 [[TMP5]], i177 0, i177 [[L_FROZEN]]
diff --git a/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll b/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll
index 2e1ff0a21a3de..461c9b0fb1e0c 100644
--- a/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll
@@ -117,11 +117,9 @@ define i64 @sext_diff_i1_xor_sub_1(i64 %a, i1 %b, i1 %c) {
 define i64 @sext_multi_uses(i64 %a, i1 %b, i64 %x) {
 ; CHECK-LABEL: define i64 @sext_multi_uses(
 ; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]], i64 [[X:%.*]]) {
-; CHECK-NEXT:    [[C:%.*]] = sext i1 [[B]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 0, [[A]]
-; CHECK-NEXT:    [[E:%.*]] = select i1 [[B]], i64 [[TMP1]], i64 [[A]]
-; CHECK-NEXT:    [[F:%.*]] = mul i64 [[C]], [[X]]
-; CHECK-NEXT:    [[R:%.*]] = add i64 [[F]], [[E]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[X]], [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 0, [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[B]], i64 [[TMP2]], i64 [[A]]
 ; CHECK-NEXT:    ret i64 [[R]]
 ;
   %c = sext i1 %b to i64
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
index 3851bd090aef9..b4afb7bd4a2b0 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
@@ -298,3 +298,11 @@ bb:
   %cmp = icmp eq ptr blockaddress(@blockaddr_no_cfi, %bb), no_cfi @func
   ret i1 %cmp
 }
+
+define i1 @global_no_cfi_dso_local_equivalent() {
+; CHECK-LABEL: @global_no_cfi_dso_local_equivalent(
+; CHECK-NEXT:    ret i1 icmp eq (ptr dso_local_equivalent @func, ptr no_cfi @func)
+;
+  %cmp = icmp eq ptr dso_local_equivalent @func, no_cfi @func
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
index b8244b1d508c6..5d17504c09df6 100644
--- a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
+++ b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -211,6 +211,50 @@ define double @fmul_nnan_ninf_nneg_n0.0_commute(i127 %x) {
   ret double %r
 }
 
+define float @src_mul_nzero_neg(float nofpclass(inf nan pzero psub pnorm) %f) {
+; CHECK-LABEL: @src_mul_nzero_neg(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %r = fmul float %f, -0.0
+  ret float %r
+}
+
+define <2 x float> @src_mul_zero_neg(<2 x float> nofpclass(inf nan pzero psub pnorm) %f) {
+; CHECK-LABEL: @src_mul_zero_neg(
+; CHECK-NEXT:    ret <2 x float> <float -0.000000e+00, float -0.000000e+00>
+;
+  %r = fmul <2 x float> <float 0.0, float 0.0>, %f
+  ret <2 x float> %r
+}
+
+define <2 x float> @src_mul_zero_and_nzero_neg(<2 x float> nofpclass(inf nan pzero psub pnorm) %f) {
+; CHECK-LABEL: @src_mul_zero_and_nzero_neg(
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float -0.000000e+00>
+;
+  %r = fmul <2 x float> <float -0.0, float 0.0>, %f
+  ret <2 x float> %r
+}
+
+
+define float @src_muladd_zero_neg(float nofpclass(inf nan pzero psub pnorm) %f, float %add) {
+; CHECK-LABEL: @src_muladd_zero_neg(
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.fmuladd.f32(float [[F:%.*]], float 0.000000e+00, float [[ADD:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %r = call float @llvm.fmuladd.f32(float %f, float 0.0, float %add)
+  ret float %r
+}
+
+define float @src_fma_nzero_neg(float nofpclass(inf nan pzero psub pnorm) %f, float %add) {
+; CHECK-LABEL: @src_fma_nzero_neg(
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.fma.f32(float -0.000000e+00, float [[F:%.*]], float [[ADD:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %r = call float @llvm.fma.f32(float -0.0, float %f, float %add)
+  ret float %r
+}
+
+
 ; Make sure we can infer %x can't be 0 based on assumes.
 define { float, float } @test_fmul_0_assumed_finite(float %x) {
 ; CHECK-LABEL: @test_fmul_0_assumed_finite(
diff --git a/llvm/test/Transforms/LICM/expr-reassociate-int.ll b/llvm/test/Transforms/LICM/expr-reassociate-int.ll
index 63548974fb318..70288731e31c6 100644
--- a/llvm/test/Transforms/LICM/expr-reassociate-int.ll
+++ b/llvm/test/Transforms/LICM/expr-reassociate-int.ll
@@ -23,7 +23,7 @@ define void @innermost_loop_1d_shouldhoist(i32 %i, i64 %d1, i64 %delta, ptr %cel
 ; CHECK-LABEL: define void @innermost_loop_1d_shouldhoist
 ; CHECK-SAME: (i32 [[I:%.*]], i64 [[D1:%.*]], i64 [[DELTA:%.*]], ptr [[CELLS:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[DELTA]], [[D1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i64 [[DELTA]], [[D1]]
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_J_1:%.*]], [[FOR_BODY:%.*]] ]
@@ -55,7 +55,7 @@ for.body:
   %idxprom.j.1 = zext i32 %add.j.1 to i64
   %arrayidx.j.1 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.1
   %cell.1 = load i64, ptr %arrayidx.j.1, align 8
-  %mul.1 = mul i64 %delta, %d1
+  %mul.1 = mul nsw nuw i64 %delta, %d1
   %mul.2 = mul i64 %mul.1, %cell.1
   %idxprom.j = zext i32 %j to i64
   %arrayidx.j = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j
@@ -130,8 +130,8 @@ define void @innermost_loop_2d(i32 %i, i64 %d1, i64 %d2, i64 %delta, ptr %cells)
 ; CONSTRAINED-NEXT:    [[IDXPROM_J:%.*]] = zext i32 [[J]] to i64
 ; CONSTRAINED-NEXT:    [[ARRAYIDX_J:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J]]
 ; CONSTRAINED-NEXT:    [[CELL_2:%.*]] = load i64, ptr [[ARRAYIDX_J]], align 8
-; CONSTRAINED-NEXT:    [[MUL_2:%.*]] = mul i64 [[CELL_2]], [[D2]]
-; CONSTRAINED-NEXT:    [[REASS_ADD:%.*]] = add i64 [[MUL_2]], [[MUL_1]]
+; CONSTRAINED-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i64 [[CELL_2]], [[D2]]
+; CONSTRAINED-NEXT:    [[REASS_ADD:%.*]] = add nuw nsw i64 [[MUL_2]], [[MUL_1]]
 ; CONSTRAINED-NEXT:    [[REASS_MUL:%.*]] = mul i64 [[REASS_ADD]], [[DELTA]]
 ; CONSTRAINED-NEXT:    store i64 [[REASS_MUL]], ptr [[ARRAYIDX_J]], align 8
 ; CONSTRAINED-NEXT:    br label [[FOR_COND]]
@@ -155,8 +155,8 @@ for.body:
   %idxprom.j = zext i32 %j to i64
   %arrayidx.j = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j
   %cell.2 = load i64, ptr %arrayidx.j, align 8
-  %mul.2 = mul i64 %cell.2, %d2
-  %reass.add = add i64 %mul.2, %mul.1
+  %mul.2 = mul nsw nuw i64 %cell.2, %d2
+  %reass.add = add nsw nuw i64 %mul.2, %mul.1
   %reass.mul = mul i64 %reass.add, %delta
   store i64 %reass.mul, ptr %arrayidx.j, align 8
   br label %for.cond
@@ -243,10 +243,10 @@ define void @innermost_loop_3d(i32 %i, i64 %d1, i64 %d2, i64 %d3, i64 %delta, pt
 ; CONSTRAINED-NEXT:    [[IDXPROM_J_2:%.*]] = zext i32 [[ADD_J_2]] to i64
 ; CONSTRAINED-NEXT:    [[ARRAYIDX_J_2:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_2]]
 ; CONSTRAINED-NEXT:    [[CELL_3:%.*]] = load i64, ptr [[ARRAYIDX_J_2]], align 8
-; CONSTRAINED-NEXT:    [[MUL_3:%.*]] = mul i64 [[CELL_3]], [[D3]]
-; CONSTRAINED-NEXT:    [[REASS_ADD:%.*]] = add i64 [[MUL_2]], [[MUL_1]]
-; CONSTRAINED-NEXT:    [[REASS_ADD1:%.*]] = add i64 [[REASS_ADD]], [[MUL_3]]
-; CONSTRAINED-NEXT:    [[REASS_MUL:%.*]] = mul i64 [[REASS_ADD1]], [[DELTA]]
+; CONSTRAINED-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i64 [[CELL_3]], [[D3]]
+; CONSTRAINED-NEXT:    [[REASS_ADD:%.*]] = add nuw nsw i64 [[MUL_2]], [[MUL_1]]
+; CONSTRAINED-NEXT:    [[REASS_ADD1:%.*]] = add nuw nsw i64 [[REASS_ADD]], [[MUL_3]]
+; CONSTRAINED-NEXT:    [[REASS_MUL:%.*]] = mul nuw nsw i64 [[REASS_ADD1]], [[DELTA]]
 ; CONSTRAINED-NEXT:    store i64 [[REASS_MUL]], ptr [[ARRAYIDX_J_2]], align 8
 ; CONSTRAINED-NEXT:    br label [[FOR_COND]]
 ; CONSTRAINED:       for.end:
@@ -274,10 +274,10 @@ for.body:
   %idxprom.j.2 = zext i32 %add.j.2 to i64
   %arrayidx.j.2 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.2
   %cell.3 = load i64, ptr %arrayidx.j.2, align 8
-  %mul.3 = mul i64 %cell.3, %d3
-  %reass.add = add i64 %mul.2, %mul.1
-  %reass.add1 = add i64 %reass.add, %mul.3
-  %reass.mul = mul i64 %reass.add1, %delta
+  %mul.3 = mul nsw nuw i64 %cell.3, %d3
+  %reass.add = add nsw nuw i64 %mul.2, %mul.1
+  %reass.add1 = add nsw nuw i64 %reass.add, %mul.3
+  %reass.mul = mul nsw nuw i64 %reass.add1, %delta
   store i64 %reass.mul, ptr %arrayidx.j.2, align 8
   br label %for.cond
 
@@ -362,3 +362,34 @@ for.body:
 for.end:
   ret void
 }
+
+; Make sure we drop poison flags on the mul in the loop.
+define i32 @pr85457(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @pr85457
+; CHECK-SAME: (i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[FACTOR_OP_MUL:%.*]] = mul i32 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[MUL0:%.*]] = mul i32 [[FACTOR_OP_MUL]], [[IV]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[MUL0]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
+  %iv.next = add nuw nsw i32 %iv, 1
+  %mul0 = mul nuw nsw i32 %x, %iv
+  %mul1 = mul nuw i32 %mul0, %y
+  %cmp = icmp slt i32 %mul1, 1
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopRotate/dbgvalue.ll b/llvm/test/Transforms/LoopRotate/dbgvalue.ll
index 92cc886bc81c1..9ecc31e1bd2d3 100644
--- a/llvm/test/Transforms/LoopRotate/dbgvalue.ll
+++ b/llvm/test/Transforms/LoopRotate/dbgvalue.ll
@@ -220,7 +220,7 @@ for.end:
 
 ; Test that dbg.value intrinsincs adjacent to the `icmp slt i32 0, 0` get
 ; rotated as expected. The icmp is loop-invariant and so gets hoisted to the
-; preheader via a different code path. This is more difficult for DPValue
+; preheader via a different code path. This is more difficult for DbgVariableRecord
 ; debug-info records to handle, because they have to get detached and moved
 ; somewhere else during rotation.
 define void @invariant_hoist() !dbg !70 {
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
index 938e1deab0b6c..6953834335669 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -45,8 +45,8 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 %
 ; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP9]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = fdiv fast <4 x float> [[TMP10]], [[TMP8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[DOTNOT8]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[TMP11]]
-; CHECK-NEXT:    [[PREDPHI]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP12]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast <4 x float> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT:    [[PREDPHI]] = select <4 x i1> [[DOTNOT8]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP12]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
index 14acb6f57aa0c..3f38abc75a583 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
@@ -29,7 +29,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[ARR]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 -3
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
index 3ba57821bc31b..873f6364f8281 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
@@ -850,8 +850,8 @@ define float @cond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK:       pred.load.continue6:
 ; CHECK-NEXT:    [[TMP24:%.*]] = phi <4 x float> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[TMP24]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP25]]
+; CHECK-NEXT:    [[TMP25:%.*]] = fadd fast <4 x float> [[TMP24]], [[VEC_PHI]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[TMP25]], <4 x float> [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP26]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
@@ -889,8 +889,8 @@ define float @cond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
 ; CHECK:       pred.load.continue14:
 ; CHECK-NEXT:    [[TMP46:%.*]] = phi <4 x float> [ [[TMP41]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP45]], [[PRED_LOAD_IF13]] ]
-; CHECK-NEXT:    [[TMP47:%.*]] = select <4 x i1> [[TMP26]], <4 x float> [[TMP46]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT:    [[PREDPHI15]] = fadd fast <4 x float> [[PREDPHI]], [[TMP47]]
+; CHECK-NEXT:    [[TMP47:%.*]] = fadd fast <4 x float> [[TMP46]], [[PREDPHI]]
+; CHECK-NEXT:    [[PREDPHI15]] = select <4 x i1> [[TMP26]], <4 x float> [[TMP47]], <4 x float> [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index b1c5ccbead64e..e79f0983a6378 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -65,7 +65,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
-; CHECK-NEXT:    br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ poison, [[DOTLR_PH]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
@@ -1173,7 +1173,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 257, i64 257, i64 257, i64 257>
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -1216,11 +1216,11 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP23]], <4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP24]])
-; CHECK-NEXT:    [[TMP26]] = call i32 @llvm.smin.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
+; CHECK-NEXT:    [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1228,7 +1228,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
 ;
 entry:
@@ -1260,7 +1260,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 257, i64 257, i64 257, i64 257>
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -1303,11 +1303,11 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP23]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP24]])
-; CHECK-NEXT:    [[TMP26]] = call i32 @llvm.umax.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
+; CHECK-NEXT:    [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1315,7 +1315,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
 ;
 entry:
@@ -1351,25 +1351,25 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP11]]
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP13]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[PREDPHI_V]]
-; CHECK-NEXT:    [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP9]]
+; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI3]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI3]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -1386,7 +1386,7 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi float [ poison, [[FOR_INC]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi float [ poison, [[FOR_INC]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[SUM_1_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index d85241167d0cd..204d021f06571 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -690,16 +690,16 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP5]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP9]]
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP11]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[PREDPHI_V]]
-; CHECK-NEXT:    [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]]
+; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
@@ -1354,9 +1354,8 @@ define i32 @predicated_or_dominates_reduction(ptr %b) {
 ; CHECK:       pred.load.continue6:
 ; CHECK-NEXT:    [[TMP43:%.*]] = phi <4 x i32> [ [[TMP37]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP42]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <4 x i32> [[TMP43]], zeroinitializer
-; CHECK-NEXT:    [[TMP45:%.*]] = select <4 x i1> [[TMP19]], <4 x i1> [[TMP44]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP46:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP47:%.*]] = or <4 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP47:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP44]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = bitcast <4 x i1> [[TMP47]] to i4
 ; CHECK-NEXT:    [[TMP49:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP48]]), !range [[RNG42:![0-9]+]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = zext nneg i4 [[TMP49]] to i32
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
index ba82bac6fad26..a47a38510eeeb 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -761,16 +761,16 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP5]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP9]]
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP11]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[PREDPHI_V]]
-; CHECK-NEXT:    [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]]
+; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll
index ef8665b796909..bdd0c6f728ae7 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll
@@ -6,15 +6,11 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 ; Tests to make sure no loads are introduced after a lifetime.end by multiply
 ; fusion.
 
-; FIXME: Currently the tests are mis-compiled, with loads being introduced after
-;       llvm.lifetime.end calls.
-
 define void @lifetime_for_first_arg_before_multiply(ptr noalias %B, ptr noalias %C) {
 ; CHECK-LABEL: @lifetime_for_first_arg_before_multiply(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
 ; CHECK-NEXT:    call void @init(ptr [[A]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
@@ -77,6 +73,7 @@ define void @lifetime_for_first_arg_before_multiply(ptr noalias %B, ptr noalias
 ; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
 ; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
 ; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -95,7 +92,6 @@ define void @lifetime_for_second_arg_before_multiply(ptr noalias %A, ptr noalias
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[B:%.*]] = alloca <4 x double>, align 32
 ; CHECK-NEXT:    call void @init(ptr [[B]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
@@ -158,6 +154,7 @@ define void @lifetime_for_second_arg_before_multiply(ptr noalias %A, ptr noalias
 ; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
 ; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
 ; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -177,7 +174,6 @@ define void @lifetime_for_first_arg_before_multiply_load_from_offset(ptr noalias
 ; CHECK-NEXT:    [[A:%.*]] = alloca <8 x double>, align 64
 ; CHECK-NEXT:    call void @init(ptr [[A]])
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr i8, ptr [[A]], i64 8
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[GEP_8]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
@@ -240,6 +236,7 @@ define void @lifetime_for_first_arg_before_multiply_load_from_offset(ptr noalias
 ; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
 ; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
 ; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -261,7 +258,6 @@ define void @lifetime_for_first_arg_before_multiply_lifetime_does_not_dominate(p
 ; CHECK-NEXT:    call void @init(ptr [[A]])
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
@@ -352,7 +348,6 @@ define void @lifetime_for_second_arg_before_multiply_lifetime_does_not_dominate(
 ; CHECK-NEXT:    call void @init(ptr [[B]])
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
@@ -441,10 +436,9 @@ define void @lifetime_for_ptr_first_arg_before_multiply(ptr noalias %A, ptr noal
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A:%.*]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
 ; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
@@ -528,15 +522,104 @@ define void @lifetime_for_both_ptr_args_before_multiply(ptr noalias %A, ptr noal
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B:%.*]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A:%.*]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
 ; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = load <4 x double>, ptr %A, align 8
+  %b = load <4 x double>, ptr %B, align 8
+  br i1 %c.0, label %then, label %exit
+
+then:
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  br label %exit
+
+exit:
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  ret void
+}
+
+define void @multiple_unrelated_lifetimes(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @multiple_unrelated_lifetimes(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOC_1:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ALLOC_2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_2]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
 ; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
@@ -597,13 +680,17 @@ define void @lifetime_for_both_ptr_args_before_multiply(ptr noalias %A, ptr noal
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %alloc.1 = alloca i32
+  %alloc.2 = alloca i32
   %a = load <4 x double>, ptr %A, align 8
   %b = load <4 x double>, ptr %B, align 8
   br i1 %c.0, label %then, label %exit
 
 then:
   call void @llvm.lifetime.end(i64 -1, ptr %B)
+  call void @llvm.lifetime.end(i64 -1, ptr %alloc.1)
   call void @llvm.lifetime.end(i64 -1, ptr %A)
+  call void @llvm.lifetime.end(i64 -1, ptr %alloc.2)
   br label %exit
 
 exit:
@@ -618,7 +705,6 @@ define void @lifetime_for_ptr_select_before_multiply(ptr noalias %A, ptr noalias
 ; CHECK-NEXT:    [[P:%.*]] = select i1 [[C_0:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[P]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[P]], i64 0
@@ -701,6 +787,374 @@ exit:
   ret void
 }
 
+define void @lifetimes_for_args_in_different_blocks(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @lifetimes_for_args_in_different_blocks(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT:    call void @init(ptr [[A]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %A = alloca <4 x double>
+  call void @init(ptr %A)
+  br i1 %c.0, label %then, label %exit
+
+then:
+  %a = load <4 x double>, ptr %A, align 8
+  %b = load <4 x double>, ptr %B, align 8
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  br label %exit
+
+exit:
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  ret void
+}
+
+define void @lifetimes_for_args_in_different_blocks2(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @lifetimes_for_args_in_different_blocks2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT:    call void @init(ptr [[A]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B:%.*]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %A = alloca <4 x double>
+  call void @init(ptr %A)
+  br i1 %c.0, label %then, label %exit
+
+then:
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  br label %exit
+
+exit:
+  %a = load <4 x double>, ptr %A, align 8
+  %b = load <4 x double>, ptr %B, align 8
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  ret void
+}
+
+define void @lifetimes_for_args_load0_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @lifetimes_for_args_load0_in_different_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT:    call void @init(ptr [[A]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %A = alloca <4 x double>
+  call void @init(ptr %A)
+  %a = load <4 x double>, ptr %A, align 8
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  br i1 %c.0, label %then, label %exit
+
+then:
+  %b = load <4 x double>, ptr %B, align 8
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  br label %exit
+
+exit:
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  ret void
+}
+
+define void @lifetimes_for_args_load1_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @lifetimes_for_args_load1_in_different_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT:    call void @init(ptr [[A]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %A = alloca <4 x double>
+  call void @init(ptr %A)
+  %b = load <4 x double>, ptr %B, align 8
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  br i1 %c.0, label %then, label %exit
+
+then:
+  %a = load <4 x double>, ptr %A, align 8
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  br label %exit
+
+exit:
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  ret void
+}
+
 declare void @init(ptr)
 declare void @llvm.lifetime.end(i64, ptr)
 
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-DominatedLoop.ll b/llvm/test/Transforms/NewGVN/2007-07-25-DominatedLoop.ll
index 978f061ac4fd8..6f0ef197338c2 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-DominatedLoop.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-DominatedLoop.ll
@@ -1,86 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%struct.PerlInterpreter = type { i8 }
+  %struct.PerlInterpreter = type { i8 }
 @PL_sv_count = external global i32		; <ptr> [#uses=2]
 
 define void @perl_destruct(ptr %sv_interp) {
 entry:
-	br i1 false, label %cond_next25, label %cond_true16
+  br i1 false, label %cond_next25, label %cond_true16
 
 cond_true16:		; preds = %entry
-	ret void
+  ret void
 
 cond_next25:		; preds = %entry
-	br i1 false, label %cond_next33, label %cond_true32
+  br i1 false, label %cond_next33, label %cond_true32
 
 cond_true32:		; preds = %cond_next25
-	ret void
+  ret void
 
 cond_next33:		; preds = %cond_next25
-	br i1 false, label %cond_next61, label %cond_true.i46
+  br i1 false, label %cond_next61, label %cond_true.i46
 
 cond_true.i46:		; preds = %cond_next33
-	ret void
+  ret void
 
 cond_next61:		; preds = %cond_next33
-	br i1 false, label %cond_next69, label %cond_true66
+  br i1 false, label %cond_next69, label %cond_true66
 
 cond_true66:		; preds = %cond_next61
-	ret void
+  ret void
 
 cond_next69:		; preds = %cond_next61
-	br i1 false, label %Perl_safefree.exit52, label %cond_true.i50
+  br i1 false, label %Perl_safefree.exit52, label %cond_true.i50
 
 cond_true.i50:		; preds = %cond_next69
-	ret void
+  ret void
 
 Perl_safefree.exit52:		; preds = %cond_next69
-	br i1 false, label %cond_next80, label %cond_true77
+  br i1 false, label %cond_next80, label %cond_true77
 
 cond_true77:		; preds = %Perl_safefree.exit52
-	ret void
+  ret void
 
 cond_next80:		; preds = %Perl_safefree.exit52
-	br i1 false, label %Perl_safefree.exit56, label %cond_true.i54
+  br i1 false, label %Perl_safefree.exit56, label %cond_true.i54
 
 cond_true.i54:		; preds = %cond_next80
-	ret void
+  ret void
 
 Perl_safefree.exit56:		; preds = %cond_next80
-	br i1 false, label %Perl_safefree.exit60, label %cond_true.i58
+  br i1 false, label %Perl_safefree.exit60, label %cond_true.i58
 
 cond_true.i58:		; preds = %Perl_safefree.exit56
-	ret void
+  ret void
 
 Perl_safefree.exit60:		; preds = %Perl_safefree.exit56
-	br i1 false, label %Perl_safefree.exit64, label %cond_true.i62
+  br i1 false, label %Perl_safefree.exit64, label %cond_true.i62
 
 cond_true.i62:		; preds = %Perl_safefree.exit60
-	ret void
+  ret void
 
 Perl_safefree.exit64:		; preds = %Perl_safefree.exit60
-	br i1 false, label %Perl_safefree.exit68, label %cond_true.i66
+  br i1 false, label %Perl_safefree.exit68, label %cond_true.i66
 
 cond_true.i66:		; preds = %Perl_safefree.exit64
-	ret void
+  ret void
 
 Perl_safefree.exit68:		; preds = %Perl_safefree.exit64
-	br i1 false, label %cond_next150, label %cond_true23.i
+  br i1 false, label %cond_next150, label %cond_true23.i
 
 cond_true23.i:		; preds = %Perl_safefree.exit68
-	ret void
+  ret void
 
 cond_next150:		; preds = %Perl_safefree.exit68
-	%tmp16092 = load i32, ptr @PL_sv_count, align 4		; <i32> [#uses=0]
-	br label %cond_next165
+  %tmp16092 = load i32, ptr @PL_sv_count, align 4		; <i32> [#uses=0]
+  br label %cond_next165
 
 bb157:		; preds = %cond_next165
-	%tmp158 = load i32, ptr @PL_sv_count, align 4		; <i32> [#uses=0]
-	br label %cond_next165
+  %tmp158 = load i32, ptr @PL_sv_count, align 4		; <i32> [#uses=0]
+  br label %cond_next165
 
 cond_next165:		; preds = %bb157, %cond_next150
-	br i1 false, label %bb171, label %bb157
+  br i1 false, label %bb171, label %bb157
 
 bb171:		; preds = %cond_next165
-	ret void
+  ret void
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-InfiniteLoop.ll b/llvm/test/Transforms/NewGVN/2007-07-25-InfiniteLoop.ll
index abb6fbe5030f7..5202a2bc6466b 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-InfiniteLoop.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-InfiniteLoop.ll
@@ -1,15 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
-	%struct.INT2 = type { i32, i32 }
+  %struct.INT2 = type { i32, i32 }
 @blkshifts = external global ptr		; <ptr> [#uses=2]
 
 define i32 @xcompact() {
+; CHECK-LABEL: define i32 @xcompact() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store ptr null, ptr @blkshifts, align 4
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    br label [[BB]]
+;
 entry:
-	store ptr null, ptr @blkshifts, align 4
-	br label %bb
+  store ptr null, ptr @blkshifts, align 4
+  br label %bb
 
 bb:		; preds = %bb, %entry
-	%tmp10 = load ptr, ptr @blkshifts, align 4		; <ptr> [#uses=0]
-; CHECK-NOT:  %tmp10
-	br label %bb
+  %tmp10 = load ptr, ptr @blkshifts, align 4		; <ptr> [#uses=0]
+  br label %bb
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-Loop.ll b/llvm/test/Transforms/NewGVN/2007-07-25-Loop.ll
index 336f390459b94..2ee599c11d419 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-Loop.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-Loop.ll
@@ -1,15 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%struct.s_segment_inf = type { float, i32, i16, i16, float, float, i32, float, float }
+  %struct.s_segment_inf = type { float, i32, i16, i16, float, float, i32, float, float }
 
 define void @print_arch(ptr %arch_file, i32 %route_type, i64 %det_routing_arch.0.0, i64 %det_routing_arch.0.1, i64 %det_routing_arch.0.2, i64 %det_routing_arch.0.3, i64 %det_routing_arch.0.4, ptr %segment_inf, i64 %timing_inf.0.0, i64 %timing_inf.0.1, i64 %timing_inf.0.2, i64 %timing_inf.0.3, i64 %timing_inf.0.4, i32 %timing_inf.1) {
 entry:
-	br i1 false, label %bb278, label %bb344
+  br i1 false, label %bb278, label %bb344
 
 bb278:		; preds = %bb278, %entry
-	br i1 false, label %bb278, label %bb344
+  br i1 false, label %bb278, label %bb344
 
 bb344:		; preds = %bb278, %entry
-	%tmp38758 = load i16, ptr null, align 2		; <i16> [#uses=0]
-	ret void
+  %tmp38758 = load i16, ptr null, align 2		; <i16> [#uses=0]
+  ret void
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-NestedLoop.ll b/llvm/test/Transforms/NewGVN/2007-07-25-NestedLoop.ll
index c46f2b7630aca..e7461c2f32bf8 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-NestedLoop.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-NestedLoop.ll
@@ -1,38 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%struct.TypHeader = type { i32, ptr, [3 x i8], i8 }
+  %struct.TypHeader = type { i32, ptr, [3 x i8], i8 }
 
 define ptr @LtRec(ptr %hdL, ptr %hdR) {
 entry:
-	br i1 false, label %bb556.preheader, label %bb534.preheader
+  br i1 false, label %bb556.preheader, label %bb534.preheader
 
 bb534.preheader:		; preds = %entry
-	ret ptr null
+  ret ptr null
 
 bb556.preheader:		; preds = %entry
-	%tmp56119 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
-	%tmp56220 = load i32, ptr %tmp56119		; <i32> [#uses=0]
-	br i1 false, label %bb.nph23, label %bb675.preheader
+  %tmp56119 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
+  %tmp56220 = load i32, ptr %tmp56119		; <i32> [#uses=0]
+  br i1 false, label %bb.nph23, label %bb675.preheader
 
 bb.nph23:		; preds = %bb556.preheader
-	ret ptr null
+  ret ptr null
 
 bb656:		; preds = %bb675.outer, %bb656
-	%tmp678 = load i32, ptr %tmp677		; <i32> [#uses=0]
-	br i1 false, label %bb684, label %bb656
+  %tmp678 = load i32, ptr %tmp677		; <i32> [#uses=0]
+  br i1 false, label %bb684, label %bb656
 
 bb684:		; preds = %bb675.outer, %bb656
-	br i1 false, label %bb924.preheader, label %bb675.outer
+  br i1 false, label %bb924.preheader, label %bb675.outer
 
 bb675.outer:		; preds = %bb675.preheader, %bb684
-	%tmp67812 = load i32, ptr %tmp67711		; <i32> [#uses=0]
-	br i1 false, label %bb684, label %bb656
+  %tmp67812 = load i32, ptr %tmp67711		; <i32> [#uses=0]
+  br i1 false, label %bb684, label %bb656
 
 bb675.preheader:		; preds = %bb556.preheader
-	%tmp67711 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
-	%tmp677 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
-	br label %bb675.outer
+  %tmp67711 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
+  %tmp677 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
+  br label %bb675.outer
 
 bb924.preheader:		; preds = %bb684
-	ret ptr null
+  ret ptr null
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-SinglePredecessor.ll b/llvm/test/Transforms/NewGVN/2007-07-25-SinglePredecessor.ll
index 0b0597f44ee7e..6fafce3047569 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-SinglePredecessor.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-SinglePredecessor.ll
@@ -1,29 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%struct.ggBRDF = type { ptr }
-	%struct.ggBox3 = type { %struct.ggPoint3, %struct.ggPoint3 }
-	%struct.ggMaterialRecord = type { %struct.ggPoint2, %struct.ggBox3, %struct.ggBox3, %struct.ggSpectrum, %struct.ggSpectrum, %struct.ggSpectrum, ptr, i32, i32, i32, i32 }
-	%struct.ggONB3 = type { %struct.ggPoint3, %struct.ggPoint3, %struct.ggPoint3 }
-	%struct.ggPoint2 = type { [2 x double] }
-	%struct.ggPoint3 = type { [3 x double] }
-	%struct.ggSpectrum = type { [8 x float] }
-	%struct.mrViewingHitRecord = type { double, %struct.ggPoint3, %struct.ggONB3, %struct.ggPoint2, double, %struct.ggSpectrum, %struct.ggSpectrum, i32, i32, i32, i32 }
-	%struct.mrXEllipticalCylinder = type { %struct.ggBRDF, float, float, float, float, float, float }
+  %struct.ggBRDF = type { ptr }
+  %struct.ggBox3 = type { %struct.ggPoint3, %struct.ggPoint3 }
+  %struct.ggMaterialRecord = type { %struct.ggPoint2, %struct.ggBox3, %struct.ggBox3, %struct.ggSpectrum, %struct.ggSpectrum, %struct.ggSpectrum, ptr, i32, i32, i32, i32 }
+  %struct.ggONB3 = type { %struct.ggPoint3, %struct.ggPoint3, %struct.ggPoint3 }
+  %struct.ggPoint2 = type { [2 x double] }
+  %struct.ggPoint3 = type { [3 x double] }
+  %struct.ggSpectrum = type { [8 x float] }
+  %struct.mrViewingHitRecord = type { double, %struct.ggPoint3, %struct.ggONB3, %struct.ggPoint2, double, %struct.ggSpectrum, %struct.ggSpectrum, i32, i32, i32, i32 }
+  %struct.mrXEllipticalCylinder = type { %struct.ggBRDF, float, float, float, float, float, float }
 
 define i32 @_ZNK21mrZEllipticalCylinder10viewingHitERK6ggRay3dddR18mrViewingHitRecordR16ggMaterialRecord(ptr %this, ptr %ray, double %unnamed_arg, double %tmin, double %tmax, ptr %VHR, ptr %unnamed_arg2) {
 entry:
-	%tmp80.i = getelementptr %struct.mrViewingHitRecord, ptr %VHR, i32 0, i32 1, i32 0, i32 0		; <ptr> [#uses=1]
-	store double 0.000000e+00, ptr %tmp80.i
-	br i1 false, label %return, label %cond_next.i
+  %tmp80.i = getelementptr %struct.mrViewingHitRecord, ptr %VHR, i32 0, i32 1, i32 0, i32 0		; <ptr> [#uses=1]
+  store double 0.000000e+00, ptr %tmp80.i
+  br i1 false, label %return, label %cond_next.i
 
 cond_next.i:		; preds = %entry
-	br i1 false, label %return, label %cond_true
+  br i1 false, label %return, label %cond_true
 
 cond_true:		; preds = %cond_next.i
-	%tmp3.i8 = getelementptr %struct.mrViewingHitRecord, ptr %VHR, i32 0, i32 1, i32 0, i32 0		; <ptr> [#uses=1]
-	%tmp46 = load double, ptr %tmp3.i8		; <double> [#uses=0]
-	ret i32 1
+  %tmp3.i8 = getelementptr %struct.mrViewingHitRecord, ptr %VHR, i32 0, i32 1, i32 0, i32 0		; <ptr> [#uses=1]
+  %tmp46 = load double, ptr %tmp3.i8		; <double> [#uses=0]
+  ret i32 1
 
 return:		; preds = %cond_next.i, %entry
-	ret i32 0
+  ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-26-NonRedundant.ll b/llvm/test/Transforms/NewGVN/2007-07-26-NonRedundant.ll
index 8d3bfcd1367bf..a64901e71b05a 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-26-NonRedundant.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-26-NonRedundant.ll
@@ -1,16 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
 @bsLive = external global i32		; <ptr> [#uses=2]
 
 define i32 @bsR(i32 %n) {
 entry:
-	br i1 false, label %cond_next, label %bb19
+  br i1 false, label %cond_next, label %bb19
 
 cond_next:		; preds = %entry
-	store i32 0, ptr @bsLive, align 4
-	br label %bb19
+  store i32 0, ptr @bsLive, align 4
+  br label %bb19
 
 bb19:		; preds = %cond_next, %entry
-	%tmp29 = load i32, ptr @bsLive, align 4		; <i32> [#uses=0]
-	ret i32 0
+  %tmp29 = load i32, ptr @bsLive, align 4		; <i32> [#uses=0]
+  ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll b/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll
index 22d64324c2352..46f9b84a7e378 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll
@@ -20,7 +20,7 @@ define i32 @reload(ptr %first, i32 %global, ptr %dumpfile) {
 ; CHECK:       cond_next2943:
 ; CHECK-NEXT:    br i1 false, label [[BB2982_PREHEADER:%.*]], label [[BB2928]]
 ; CHECK:       bb2982.preheader:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    ret i32 poison
 ;
 cond_next2835.1:		; preds = %cond_next2861
diff --git a/llvm/test/Transforms/NewGVN/2007-07-30-PredIDom.ll b/llvm/test/Transforms/NewGVN/2007-07-30-PredIDom.ll
index 59da31c5e33e0..c70846054caf5 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-30-PredIDom.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-30-PredIDom.ll
@@ -1,274 +1,275 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%"struct.Block::$_16" = type { i32 }
-	%struct.Exp = type { ptr, i32, i32, i32, ptr, ptr, %"struct.Exp::$_10", %"struct.Block::$_16", %"struct.Exp::$_12" }
-	%"struct.Exp::$_10" = type { ptr }
-	%"struct.Exp::$_12" = type { ptr }
-	%struct.Exp_ = type { i32, i32, i32, i32, ptr }
-	%struct.Id = type { ptr, i32, i32, i32, %"struct.Id::$_13" }
-	%"struct.Id::$_13" = type { double }
+  %"struct.Block::$_16" = type { i32 }
+  %struct.Exp = type { ptr, i32, i32, i32, ptr, ptr, %"struct.Exp::$_10", %"struct.Block::$_16", %"struct.Exp::$_12" }
+  %"struct.Exp::$_10" = type { ptr }
+  %"struct.Exp::$_12" = type { ptr }
+  %struct.Exp_ = type { i32, i32, i32, i32, ptr }
+  %struct.Id = type { ptr, i32, i32, i32, %"struct.Id::$_13" }
+  %"struct.Id::$_13" = type { double }
 
 define ptr @_ZN3Exp8toStringEj(ptr %this, i32 %nextpc) {
 entry:
-	switch i32 0, label %bb970 [
-		 i32 1, label %bb
-		 i32 2, label %bb39
-		 i32 3, label %bb195
-		 i32 4, label %bb270
-		 i32 5, label %bb418
-		 i32 6, label %bb633
-		 i32 7, label %bb810
-		 i32 8, label %bb882
-		 i32 9, label %bb925
-	]
+  switch i32 0, label %bb970 [
+  i32 1, label %bb
+  i32 2, label %bb39
+  i32 3, label %bb195
+  i32 4, label %bb270
+  i32 5, label %bb418
+  i32 6, label %bb633
+  i32 7, label %bb810
+  i32 8, label %bb882
+  i32 9, label %bb925
+  ]
 
 bb:		; preds = %entry
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb39:		; preds = %entry
-	br i1 false, label %cond_true, label %cond_false132
+  br i1 false, label %cond_true, label %cond_false132
 
 cond_true:		; preds = %bb39
-	br i1 false, label %cond_true73, label %cond_false
+  br i1 false, label %cond_true73, label %cond_false
 
 cond_true73:		; preds = %cond_true
-	br i1 false, label %cond_true108, label %cond_next
+  br i1 false, label %cond_true108, label %cond_next
 
 cond_true108:		; preds = %cond_true73
-	br label %cond_next
+  br label %cond_next
 
 cond_next:		; preds = %cond_true108, %cond_true73
-	br label %cond_next131
+  br label %cond_next131
 
 cond_false:		; preds = %cond_true
-	br label %cond_next131
+  br label %cond_next131
 
 cond_next131:		; preds = %cond_false, %cond_next
-	br label %cond_next141
+  br label %cond_next141
 
 cond_false132:		; preds = %bb39
-	br label %cond_next141
+  br label %cond_next141
 
 cond_next141:		; preds = %cond_false132, %cond_next131
-	br i1 false, label %cond_true169, label %cond_false175
+  br i1 false, label %cond_true169, label %cond_false175
 
 cond_true169:		; preds = %cond_next141
-	br label %cond_next181
+  br label %cond_next181
 
 cond_false175:		; preds = %cond_next141
-	br label %cond_next181
+  br label %cond_next181
 
 cond_next181:		; preds = %cond_false175, %cond_true169
-	br i1 false, label %cond_true189, label %cond_next191
+  br i1 false, label %cond_true189, label %cond_next191
 
 cond_true189:		; preds = %cond_next181
-	br label %cond_next191
+  br label %cond_next191
 
 cond_next191:		; preds = %cond_true189, %cond_next181
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb195:		; preds = %entry
-	br i1 false, label %cond_true248, label %cond_false250
+  br i1 false, label %cond_true248, label %cond_false250
 
 cond_true248:		; preds = %bb195
-	br label %cond_next252
+  br label %cond_next252
 
 cond_false250:		; preds = %bb195
-	br label %cond_next252
+  br label %cond_next252
 
 cond_next252:		; preds = %cond_false250, %cond_true248
-	br i1 false, label %cond_true265, label %cond_next267
+  br i1 false, label %cond_true265, label %cond_next267
 
 cond_true265:		; preds = %cond_next252
-	br label %cond_next267
+  br label %cond_next267
 
 cond_next267:		; preds = %cond_true265, %cond_next252
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb270:		; preds = %entry
-	br i1 false, label %cond_true338, label %cond_false340
+  br i1 false, label %cond_true338, label %cond_false340
 
 cond_true338:		; preds = %bb270
-	br label %cond_next342
+  br label %cond_next342
 
 cond_false340:		; preds = %bb270
-	br label %cond_next342
+  br label %cond_next342
 
 cond_next342:		; preds = %cond_false340, %cond_true338
-	br i1 false, label %cond_true362, label %cond_false364
+  br i1 false, label %cond_true362, label %cond_false364
 
 cond_true362:		; preds = %cond_next342
-	br label %cond_next366
+  br label %cond_next366
 
 cond_false364:		; preds = %cond_next342
-	br label %cond_next366
+  br label %cond_next366
 
 cond_next366:		; preds = %cond_false364, %cond_true362
-	br i1 false, label %cond_true393, label %cond_next395
+  br i1 false, label %cond_true393, label %cond_next395
 
 cond_true393:		; preds = %cond_next366
-	br label %cond_next395
+  br label %cond_next395
 
 cond_next395:		; preds = %cond_true393, %cond_next366
-	br i1 false, label %cond_true406, label %cond_next408
+  br i1 false, label %cond_true406, label %cond_next408
 
 cond_true406:		; preds = %cond_next395
-	br label %cond_next408
+  br label %cond_next408
 
 cond_next408:		; preds = %cond_true406, %cond_next395
-	br i1 false, label %cond_true413, label %cond_next415
+  br i1 false, label %cond_true413, label %cond_next415
 
 cond_true413:		; preds = %cond_next408
-	br label %cond_next415
+  br label %cond_next415
 
 cond_next415:		; preds = %cond_true413, %cond_next408
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb418:		; preds = %entry
-	br i1 false, label %cond_true512, label %cond_false514
+  br i1 false, label %cond_true512, label %cond_false514
 
 cond_true512:		; preds = %bb418
-	br label %cond_next516
+  br label %cond_next516
 
 cond_false514:		; preds = %bb418
-	br label %cond_next516
+  br label %cond_next516
 
 cond_next516:		; preds = %cond_false514, %cond_true512
-	br i1 false, label %cond_true536, label %cond_false538
+  br i1 false, label %cond_true536, label %cond_false538
 
 cond_true536:		; preds = %cond_next516
-	br label %cond_next540
+  br label %cond_next540
 
 cond_false538:		; preds = %cond_next516
-	br label %cond_next540
+  br label %cond_next540
 
 cond_next540:		; preds = %cond_false538, %cond_true536
-	br i1 false, label %cond_true560, label %cond_false562
+  br i1 false, label %cond_true560, label %cond_false562
 
 cond_true560:		; preds = %cond_next540
-	br label %cond_next564
+  br label %cond_next564
 
 cond_false562:		; preds = %cond_next540
-	br label %cond_next564
+  br label %cond_next564
 
 cond_next564:		; preds = %cond_false562, %cond_true560
-	br i1 false, label %cond_true597, label %cond_next599
+  br i1 false, label %cond_true597, label %cond_next599
 
 cond_true597:		; preds = %cond_next564
-	br label %cond_next599
+  br label %cond_next599
 
 cond_next599:		; preds = %cond_true597, %cond_next564
-	br i1 false, label %cond_true614, label %cond_next616
+  br i1 false, label %cond_true614, label %cond_next616
 
 cond_true614:		; preds = %cond_next599
-	br label %cond_next616
+  br label %cond_next616
 
 cond_next616:		; preds = %cond_true614, %cond_next599
-	br i1 false, label %cond_true621, label %cond_next623
+  br i1 false, label %cond_true621, label %cond_next623
 
 cond_true621:		; preds = %cond_next616
-	br label %cond_next623
+  br label %cond_next623
 
 cond_next623:		; preds = %cond_true621, %cond_next616
-	br i1 false, label %cond_true628, label %cond_next630
+  br i1 false, label %cond_true628, label %cond_next630
 
 cond_true628:		; preds = %cond_next623
-	br label %cond_next630
+  br label %cond_next630
 
 cond_next630:		; preds = %cond_true628, %cond_next623
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb633:		; preds = %entry
-	br i1 false, label %cond_true667, label %cond_next669
+  br i1 false, label %cond_true667, label %cond_next669
 
 cond_true667:		; preds = %bb633
-	br label %cond_next669
+  br label %cond_next669
 
 cond_next669:		; preds = %cond_true667, %bb633
-	br i1 false, label %cond_true678, label %cond_next791
+  br i1 false, label %cond_true678, label %cond_next791
 
 cond_true678:		; preds = %cond_next669
-	br label %bb735
+  br label %bb735
 
 bb679:		; preds = %bb735
-	br i1 false, label %cond_true729, label %cond_next731
+  br i1 false, label %cond_true729, label %cond_next731
 
 cond_true729:		; preds = %bb679
-	br label %cond_next731
+  br label %cond_next731
 
 cond_next731:		; preds = %cond_true729, %bb679
-	br label %bb735
+  br label %bb735
 
 bb735:		; preds = %cond_next731, %cond_true678
-	br i1 false, label %bb679, label %bb743
+  br i1 false, label %bb679, label %bb743
 
 bb743:		; preds = %bb735
-	br i1 false, label %cond_true788, label %cond_next790
+  br i1 false, label %cond_true788, label %cond_next790
 
 cond_true788:		; preds = %bb743
-	br label %cond_next790
+  br label %cond_next790
 
 cond_next790:		; preds = %cond_true788, %bb743
-	br label %cond_next791
+  br label %cond_next791
 
 cond_next791:		; preds = %cond_next790, %cond_next669
-	br i1 false, label %cond_true805, label %cond_next807
+  br i1 false, label %cond_true805, label %cond_next807
 
 cond_true805:		; preds = %cond_next791
-	br label %cond_next807
+  br label %cond_next807
 
 cond_next807:		; preds = %cond_true805, %cond_next791
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb810:		; preds = %entry
-	br i1 false, label %cond_true870, label %cond_next872
+  br i1 false, label %cond_true870, label %cond_next872
 
 cond_true870:		; preds = %bb810
-	br label %cond_next872
+  br label %cond_next872
 
 cond_next872:		; preds = %cond_true870, %bb810
-	br i1 false, label %cond_true877, label %cond_next879
+  br i1 false, label %cond_true877, label %cond_next879
 
 cond_true877:		; preds = %cond_next872
-	br label %cond_next879
+  br label %cond_next879
 
 cond_next879:		; preds = %cond_true877, %cond_next872
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb882:		; preds = %entry
-	br i1 false, label %cond_true920, label %cond_next922
+  br i1 false, label %cond_true920, label %cond_next922
 
 cond_true920:		; preds = %bb882
-	br label %cond_next922
+  br label %cond_next922
 
 cond_next922:		; preds = %cond_true920, %bb882
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb925:		; preds = %entry
-	br i1 false, label %cond_true965, label %cond_next967
+  br i1 false, label %cond_true965, label %cond_next967
 
 cond_true965:		; preds = %bb925
-	br label %cond_next967
+  br label %cond_next967
 
 cond_next967:		; preds = %cond_true965, %bb925
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb970:		; preds = %entry
-	unreachable
-		; No predecessors!
-	store ptr null, ptr null
-	br label %return
+  unreachable
+  ; No predecessors!
+  store ptr null, ptr null
+  br label %return
 
 return:		; preds = %0, %cond_next967, %cond_next922, %cond_next879, %cond_next807, %cond_next630, %cond_next415, %cond_next267, %cond_next191, %bb
-	%retval980 = load ptr, ptr null		; <ptr> [#uses=1]
-	ret ptr %retval980
+  %retval980 = load ptr, ptr null		; <ptr> [#uses=1]
+  ret ptr %retval980
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-31-RedundantPhi.ll b/llvm/test/Transforms/NewGVN/2007-07-31-RedundantPhi.ll
index 934fffc986f81..5411bcc7d406a 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-31-RedundantPhi.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-31-RedundantPhi.ll
@@ -1,23 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 @img_width = external global i16		; <ptr> [#uses=2]
 
 define i32 @smpUMHEXBipredIntegerPelBlockMotionSearch(ptr %cur_pic, i16 signext  %ref, i32 %list, i32 %pic_pix_x, i32 %pic_pix_y, i32 %blocktype, i16 signext  %pred_mv_x1, i16 signext  %pred_mv_y1, i16 signext  %pred_mv_x2, i16 signext  %pred_mv_y2, ptr %mv_x, ptr %mv_y, ptr %s_mv_x, ptr %s_mv_y, i32 %search_range, i32 %min_mcost, i32 %lambda_factor) {
+; CHECK-LABEL: define i32 @smpUMHEXBipredIntegerPelBlockMotionSearch(
+; CHECK-SAME: ptr [[CUR_PIC:%.*]], i16 signext [[REF:%.*]], i32 [[LIST:%.*]], i32 [[PIC_PIX_X:%.*]], i32 [[PIC_PIX_Y:%.*]], i32 [[BLOCKTYPE:%.*]], i16 signext [[PRED_MV_X1:%.*]], i16 signext [[PRED_MV_Y1:%.*]], i16 signext [[PRED_MV_X2:%.*]], i16 signext [[PRED_MV_Y2:%.*]], ptr [[MV_X:%.*]], ptr [[MV_Y:%.*]], ptr [[S_MV_X:%.*]], ptr [[S_MV_Y:%.*]], i32 [[SEARCH_RANGE:%.*]], i32 [[MIN_MCOST:%.*]], i32 [[LAMBDA_FACTOR:%.*]]) {
+; CHECK-NEXT:  cond_next143:
+; CHECK-NEXT:    store i16 0, ptr @img_width, align 2
+; CHECK-NEXT:    br i1 false, label [[COND_NEXT449:%.*]], label [[COND_FALSE434:%.*]]
+; CHECK:       cond_false434:
+; CHECK-NEXT:    br label [[COND_NEXT449]]
+; CHECK:       cond_next449:
+; CHECK-NEXT:    br i1 false, label [[COND_NEXT698:%.*]], label [[COND_FALSE470:%.*]]
+; CHECK:       cond_false470:
+; CHECK-NEXT:    br label [[COND_NEXT698]]
+; CHECK:       cond_next698:
+; CHECK-NEXT:    ret i32 0
+;
 cond_next143:		; preds = %entry
-	store i16 0, ptr @img_width, align 2
-	br i1 false, label %cond_next449, label %cond_false434
+  store i16 0, ptr @img_width, align 2
+  br i1 false, label %cond_next449, label %cond_false434
 
 cond_false434:		; preds = %cond_true415
-	br label %cond_next449
+  br label %cond_next449
 
 cond_next449:		; preds = %cond_false434, %cond_true415
-	br i1 false, label %cond_next698, label %cond_false470
+  br i1 false, label %cond_next698, label %cond_false470
 
 cond_false470:		; preds = %cond_next449
-	br label %cond_next698
+  br label %cond_next698
 
 cond_next698:		; preds = %cond_true492
-	%tmp701 = load i16, ptr @img_width, align 2		; <i16> [#uses=0]
-; CHECK-NOT: %tmp701 =
-	ret i32 0
+  %tmp701 = load i16, ptr @img_width, align 2		; <i16> [#uses=0]
+  ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-02-13-NewPHI.ll b/llvm/test/Transforms/NewGVN/2008-02-13-NewPHI.ll
index b2440fbced41f..49d6de7d7a0f6 100644
--- a/llvm/test/Transforms/NewGVN/2008-02-13-NewPHI.ll
+++ b/llvm/test/Transforms/NewGVN/2008-02-13-NewPHI.ll
@@ -1,22 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn
 ; PR2032
 
 define i32 @sscal(i32 %n, double %sa1, ptr %sx, i32 %incx) {
 entry:
-	%sx_addr = alloca ptr		; <ptr> [#uses=3]
-	store ptr %sx, ptr %sx_addr, align 4
-	br label %bb33
+  %sx_addr = alloca ptr		; <ptr> [#uses=3]
+  store ptr %sx, ptr %sx_addr, align 4
+  br label %bb33
 
 bb:		; preds = %bb33
-	%tmp27 = load ptr, ptr %sx_addr, align 4		; <ptr> [#uses=1]
-	store float 0.000000e+00, ptr %tmp27, align 4
-	store ptr null, ptr %sx_addr, align 4
-	br label %bb33
+  %tmp27 = load ptr, ptr %sx_addr, align 4		; <ptr> [#uses=1]
+  store float 0.000000e+00, ptr %tmp27, align 4
+  store ptr null, ptr %sx_addr, align 4
+  br label %bb33
 
 bb33:		; preds = %bb, %entry
-	br i1 false, label %bb, label %return
+  br i1 false, label %bb, label %return
 
 return:		; preds = %bb33
-	%retval59 = load i32, ptr null, align 4		; <i32> [#uses=1]
-	ret i32 %retval59
+  %retval59 = load i32, ptr null, align 4		; <i32> [#uses=1]
+  ret i32 %retval59
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-07-02-Unreachable.ll b/llvm/test/Transforms/NewGVN/2008-07-02-Unreachable.ll
index 0c1891e46d267..cf591d7e2a79a 100644
--- a/llvm/test/Transforms/NewGVN/2008-07-02-Unreachable.ll
+++ b/llvm/test/Transforms/NewGVN/2008-07-02-Unreachable.ll
@@ -1,36 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 ; PR2503
 
 @g_3 = external global i8		; <ptr> [#uses=2]
 
 define i8 @func_1(i32 %x, i32 %y) nounwind  {
+; CHECK-LABEL: define i8 @func_1(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IFELSE:%.*]], label [[IFTHEN:%.*]]
+; CHECK:       ifthen:
+; CHECK-NEXT:    br label [[IFEND:%.*]]
+; CHECK:       ifelse:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr @g_3, align 1
+; CHECK-NEXT:    store i8 [[TMP3]], ptr [[A]], align 1
+; CHECK-NEXT:    br label [[AFTERFOR:%.*]]
+; CHECK:       forcond:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br i1 false, label [[AFTERFOR]], label [[FORBODY:%.*]]
+; CHECK:       forbody:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[FORINC:%.*]]
+; CHECK:       forinc:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[FORCOND:%.*]]
+; CHECK:       afterfor:
+; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK:       ifend:
+; CHECK-NEXT:    ret i8 0
+;
 entry:
   %A = alloca i8
-    %cmp = icmp eq i32 %x, %y
-	br i1 %cmp, label %ifelse, label %ifthen
+  %cmp = icmp eq i32 %x, %y
+  br i1 %cmp, label %ifelse, label %ifthen
 
 ifthen:		; preds = %entry
-	br label %ifend
+  br label %ifend
 
 ifelse:		; preds = %entry
-	%tmp3 = load i8, ptr @g_3		; <i8> [#uses=0]
-        store i8 %tmp3, ptr %A
-	br label %afterfor
+  %tmp3 = load i8, ptr @g_3		; <i8> [#uses=0]
+  store i8 %tmp3, ptr %A
+  br label %afterfor
 
 forcond:		; preds = %forinc
-	br i1 false, label %afterfor, label %forbody
+  br i1 false, label %afterfor, label %forbody
 
 forbody:		; preds = %forcond
-	br label %forinc
+  br label %forinc
 
 forinc:		; preds = %forbody
-	br label %forcond
+  br label %forcond
 
 afterfor:		; preds = %forcond, %forcond.thread
-	%tmp10 = load i8, ptr @g_3		; <i8> [#uses=0]
-	ret i8 %tmp10
-; CHECK: ret i8 %tmp3
+  %tmp10 = load i8, ptr @g_3		; <i8> [#uses=0]
+  ret i8 %tmp10
 
 ifend:		; preds = %afterfor, %ifthen
-	ret i8 0
+  ret i8 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-12-09-SelfRemove.ll b/llvm/test/Transforms/NewGVN/2008-12-09-SelfRemove.ll
index a2e252d065d2f..95cb229f17f58 100644
--- a/llvm/test/Transforms/NewGVN/2008-12-09-SelfRemove.ll
+++ b/llvm/test/Transforms/NewGVN/2008-12-09-SelfRemove.ll
@@ -1,38 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin9.5"
-	%struct.anon = type { ptr, i32 }
-	%struct.d_print_info = type { i32, ptr, i32, i32, ptr, ptr, i32 }
-	%struct.d_print_mod = type { ptr, ptr, i32, ptr }
-	%struct.d_print_template = type { ptr, ptr }
-	%struct.demangle_component = type { i32, { %struct.anon } }
+  %struct.anon = type { ptr, i32 }
+  %struct.d_print_info = type { i32, ptr, i32, i32, ptr, ptr, i32 }
+  %struct.d_print_mod = type { ptr, ptr, i32, ptr }
+  %struct.d_print_template = type { ptr, ptr }
+  %struct.demangle_component = type { i32, { %struct.anon } }
 
 define void @d_print_mod_list(ptr %dpi, ptr %mods, i32 %suffix) nounwind {
+; CHECK-LABEL: define void @d_print_mod_list(
+; CHECK-SAME: ptr [[DPI:%.*]], ptr [[MODS:%.*]], i32 [[SUFFIX:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT_D_PRINT_INFO:%.*]], ptr [[DPI]], i32 0, i32 1
+; CHECK-NEXT:    br i1 false, label [[RETURN:%.*]], label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    br label [[BB21:%.*]]
+; CHECK:       bb21:
+; CHECK-NEXT:    br label [[BB21]]
+; CHECK:       return:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    ret void
+;
 entry:
-	%0 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1		; <ptr> [#uses=1]
-	br i1 false, label %return, label %bb
+  %0 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1		; <ptr> [#uses=1]
+  br i1 false, label %return, label %bb
 
 bb:		; preds = %entry
-	%1 = load ptr, ptr %0, align 4		; <ptr> [#uses=0]
-	%2 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1		; <ptr> [#uses=0]
-	br label %bb21
+  %1 = load ptr, ptr %0, align 4		; <ptr> [#uses=0]
+  %2 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1		; <ptr> [#uses=0]
+  br label %bb21
 
 bb21:		; preds = %bb21, %bb
-	br label %bb21
+  br label %bb21
 
 return:		; preds = %entry
-	ret void
+  ret void
 }
 
-; CHECK: define void @d_print_mod_list(ptr %dpi, ptr %mods, i32 %suffix) #0 {
-; CHECK: entry:
-; CHECK:   %0 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1
-; CHECK:   br i1 false, label %return, label %bb
-; CHECK: bb:
-; CHECK:   br label %bb21
-; CHECK: bb21:
-; CHECK:   br label %bb21
-; CHECK: return:
-; CHECK:   ret void
-; CHECK: }
diff --git a/llvm/test/Transforms/NewGVN/2008-12-12-RLE-Crash.ll b/llvm/test/Transforms/NewGVN/2008-12-12-RLE-Crash.ll
index bb51f72752a25..3df35a1593283 100644
--- a/llvm/test/Transforms/NewGVN/2008-12-12-RLE-Crash.ll
+++ b/llvm/test/Transforms/NewGVN/2008-12-12-RLE-Crash.ll
@@ -1,35 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
 
 define i32 @main(i32 %argc, ptr %argv) nounwind {
 entry:
-	br label %bb84
+  br label %bb84
 
 bb41:		; preds = %bb82
-	%tmp = load i8, ptr %opt.0, align 1		; <i8> [#uses=0]
-	%tmp1 = getelementptr i8, ptr %opt.0, i32 1		; <ptr> [#uses=2]
-	switch i32 0, label %bb81 [
-		i32 102, label %bb82
-		i32 110, label %bb79
-		i32 118, label %bb80
-	]
+  %tmp = load i8, ptr %opt.0, align 1		; <i8> [#uses=0]
+  %tmp1 = getelementptr i8, ptr %opt.0, i32 1		; <ptr> [#uses=2]
+  switch i32 0, label %bb81 [
+  i32 102, label %bb82
+  i32 110, label %bb79
+  i32 118, label %bb80
+  ]
 
 bb79:		; preds = %bb41
-	br label %bb82
+  br label %bb82
 
 bb80:		; preds = %bb41
-	ret i32 0
+  ret i32 0
 
 bb81:		; preds = %bb41
-	ret i32 1
+  ret i32 1
 
 bb82:		; preds = %bb84, %bb79, %bb41
-	%opt.0 = phi ptr [ %tmp3, %bb84 ], [ %tmp1, %bb79 ], [ %tmp1, %bb41 ]		; <ptr> [#uses=3]
-	%tmp2 = load i8, ptr %opt.0, align 1		; <i8> [#uses=0]
-	br i1 false, label %bb84, label %bb41
+  %opt.0 = phi ptr [ %tmp3, %bb84 ], [ %tmp1, %bb79 ], [ %tmp1, %bb41 ]		; <ptr> [#uses=3]
+  %tmp2 = load i8, ptr %opt.0, align 1		; <i8> [#uses=0]
+  br i1 false, label %bb84, label %bb41
 
 bb84:		; preds = %bb82, %entry
-	%tmp3 = getelementptr i8, ptr null, i32 1		; <ptr> [#uses=1]
-	br label %bb82
+  %tmp3 = getelementptr i8, ptr null, i32 1		; <ptr> [#uses=1]
+  br label %bb82
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-12-14-rle-reanalyze.ll b/llvm/test/Transforms/NewGVN/2008-12-14-rle-reanalyze.ll
index 38d1240fd6a29..821355664a7f0 100644
--- a/llvm/test/Transforms/NewGVN/2008-12-14-rle-reanalyze.ll
+++ b/llvm/test/Transforms/NewGVN/2008-12-14-rle-reanalyze.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
@@ -5,14 +6,14 @@ target triple = "i386-apple-darwin7"
 
 define i32 @Quiesce(i32 %alpha, i32 %beta, i32 %wtm, i32 %ply) nounwind {
 entry:
-	br label %bb22
+  br label %bb22
 
 bb22:		; preds = %bb23, %bb22, %entry
-	br i1 false, label %bb23, label %bb22
+  br i1 false, label %bb23, label %bb22
 
 bb23:		; preds = %bb23, %bb22
-	%sortv.233 = phi ptr [ @sort_value, %bb22 ], [ %sortv.2, %bb23 ]		; <ptr> [#uses=1]
-	%0 = load i32, ptr %sortv.233, align 4		; <i32> [#uses=0]
-	%sortv.2 = getelementptr [256 x i32], ptr @sort_value, i32 0, i32 0		; <ptr> [#uses=1]
-	br i1 false, label %bb23, label %bb22
+  %sortv.233 = phi ptr [ @sort_value, %bb22 ], [ %sortv.2, %bb23 ]		; <ptr> [#uses=1]
+  %0 = load i32, ptr %sortv.233, align 4		; <i32> [#uses=0]
+  %sortv.2 = getelementptr [256 x i32], ptr @sort_value, i32 0, i32 0		; <ptr> [#uses=1]
+  br i1 false, label %bb23, label %bb22
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-12-15-CacheVisited.ll b/llvm/test/Transforms/NewGVN/2008-12-15-CacheVisited.ll
index 0b1761da0c33a..ff9ecce88b458 100644
--- a/llvm/test/Transforms/NewGVN/2008-12-15-CacheVisited.ll
+++ b/llvm/test/Transforms/NewGVN/2008-12-15-CacheVisited.ll
@@ -1,28 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 ; Cached results must be added to and verified against the visited sets.
 ; PR3217
 
 define fastcc void @gen_field_die(ptr %decl) nounwind {
 entry:
-	br i1 false, label %bb203, label %bb202
+  br i1 false, label %bb203, label %bb202
 
 bb202:		; preds = %entry
-	unreachable
+  unreachable
 
 bb203:		; preds = %entry
-	%tmp = getelementptr i32, ptr %decl, i32 1		; <ptr> [#uses=1]
-	%tmp1 = load i32, ptr %tmp, align 4		; <i32> [#uses=0]
-	br i1 false, label %bb207, label %bb204
+  %tmp = getelementptr i32, ptr %decl, i32 1		; <ptr> [#uses=1]
+  %tmp1 = load i32, ptr %tmp, align 4		; <i32> [#uses=0]
+  br i1 false, label %bb207, label %bb204
 
 bb204:		; preds = %bb203
-	%tmp2 = getelementptr i32, ptr %decl, i32 1		; <ptr> [#uses=1]
-	br label %bb208
+  %tmp2 = getelementptr i32, ptr %decl, i32 1		; <ptr> [#uses=1]
+  br label %bb208
 
 bb207:		; preds = %bb203
-	br label %bb208
+  br label %bb208
 
 bb208:		; preds = %bb207, %bb204
-	%iftmp.1374.0.in = phi ptr [ null, %bb207 ], [ %tmp2, %bb204 ]		; <ptr> [#uses=1]
-	%iftmp.1374.0 = load i32, ptr %iftmp.1374.0.in		; <i32> [#uses=0]
-	unreachable
+  %iftmp.1374.0.in = phi ptr [ null, %bb207 ], [ %tmp2, %bb204 ]		; <ptr> [#uses=1]
+  %iftmp.1374.0 = load i32, ptr %iftmp.1374.0.in		; <i32> [#uses=0]
+  unreachable
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-01-21-SortInvalidation.ll b/llvm/test/Transforms/NewGVN/2009-01-21-SortInvalidation.ll
index 8631cbd82f5d7..1f0a32a874584 100644
--- a/llvm/test/Transforms/NewGVN/2009-01-21-SortInvalidation.ll
+++ b/llvm/test/Transforms/NewGVN/2009-01-21-SortInvalidation.ll
@@ -1,55 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 ; PR3358
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
-	%struct.re_pattern_buffer = type { ptr, i64, i64, i64, ptr, ptr, i64, i8 }
-	%struct.re_registers = type { i32, ptr, ptr }
+  %struct.re_pattern_buffer = type { ptr, i64, i64, i64, ptr, ptr, i64, i8 }
+  %struct.re_registers = type { i32, ptr, ptr }
 
 define fastcc i32 @byte_re_match_2_internal(ptr nocapture %bufp, ptr %string1, i32 %size1, ptr %string2, i32 %size2, i32 %pos, ptr %regs, i32 %stop) nounwind {
 entry:
-	br label %bb159
+  br label %bb159
 
 succeed_label:		; preds = %bb159
-	ret i32 0
+  ret i32 0
 
 bb159:		; preds = %bb664, %bb554, %bb159, %bb159, %bb159, %entry
-	%d.0 = phi ptr [ null, %entry ], [ %d.0, %bb159 ], [ %d.0, %bb554 ], [ %d.0, %bb159 ], [ %d.0, %bb159 ], [ %d.12, %bb664 ]		; <ptr> [#uses=5]
-	switch i32 0, label %bb661 [
-		i32 0, label %bb159
-		i32 1, label %succeed_label
-		i32 13, label %bb159
-		i32 14, label %bb159
-		i32 16, label %bb411
-		i32 24, label %bb622
-		i32 28, label %bb543
-	]
+  %d.0 = phi ptr [ null, %entry ], [ %d.0, %bb159 ], [ %d.0, %bb554 ], [ %d.0, %bb159 ], [ %d.0, %bb159 ], [ %d.12, %bb664 ]		; <ptr> [#uses=5]
+  switch i32 0, label %bb661 [
+  i32 0, label %bb159
+  i32 1, label %succeed_label
+  i32 13, label %bb159
+  i32 14, label %bb159
+  i32 16, label %bb411
+  i32 24, label %bb622
+  i32 28, label %bb543
+  ]
 
 bb411:		; preds = %bb411, %bb159
-	br label %bb411
+  br label %bb411
 
 bb543:		; preds = %bb159
-	br i1 false, label %bb549, label %bb550
+  br i1 false, label %bb549, label %bb550
 
 bb549:		; preds = %bb543
-	br label %bb554
+  br label %bb554
 
 bb550:		; preds = %bb543
-	br i1 false, label %bb554, label %bb552
+  br i1 false, label %bb554, label %bb552
 
 bb552:		; preds = %bb550
-	%0 = load i8, ptr %d.0, align 8		; <i8> [#uses=0]
-	br label %bb554
+  %0 = load i8, ptr %d.0, align 8		; <i8> [#uses=0]
+  br label %bb554
 
 bb554:		; preds = %bb552, %bb550, %bb549
-	br i1 false, label %bb159, label %bb661
+  br i1 false, label %bb159, label %bb661
 
 bb622:		; preds = %bb622, %bb159
-	br label %bb622
+  br label %bb622
 
 bb661:		; preds = %bb554, %bb159
-	%d.12 = select i1 false, ptr null, ptr null		; <ptr> [#uses=1]
-	br label %bb664
+  %d.12 = select i1 false, ptr null, ptr null		; <ptr> [#uses=1]
+  br label %bb664
 
 bb664:		; preds = %bb664, %bb661
-	br i1 false, label %bb159, label %bb664
+  br i1 false, label %bb159, label %bb664
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-01-22-SortInvalidation.ll b/llvm/test/Transforms/NewGVN/2009-01-22-SortInvalidation.ll
index d8871700df6d8..25f73dc8aa19d 100644
--- a/llvm/test/Transforms/NewGVN/2009-01-22-SortInvalidation.ll
+++ b/llvm/test/Transforms/NewGVN/2009-01-22-SortInvalidation.ll
@@ -1,100 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
-	%struct..4sPragmaType = type { ptr, i32 }
-	%struct.AggInfo = type { i8, i8, i32, ptr, i32, ptr, i32, i32, i32, ptr, i32, i32 }
-	%struct.AggInfo_col = type { ptr, i32, i32, i32, i32, ptr }
-	%struct.AggInfo_func = type { ptr, ptr, i32, i32 }
-	%struct.AuxData = type { ptr, ptr }
-	%struct.Bitvec = type { i32, i32, i32, { [125 x i32] } }
-	%struct.BtCursor = type { ptr, ptr, ptr, ptr, ptr, ptr, i32, ptr, i32, %struct.CellInfo, i8, i8, ptr, i64, i32, i8, ptr }
-	%struct.BtLock = type { ptr, i32, i8, ptr }
-	%struct.BtShared = type { ptr, ptr, ptr, ptr, i8, i8, i8, i8, i8, i8, i8, i8, i32, i16, i16, i32, i32, i32, i32, i8, i32, ptr, ptr, ptr, %struct.BusyHandler, i32, ptr, ptr, ptr }
-	%struct.Btree = type { ptr, ptr, i8, i8, i8, i32, ptr, ptr }
-	%struct.BtreeMutexArray = type { i32, [11 x ptr] }
-	%struct.BusyHandler = type { ptr, ptr, i32 }
-	%struct.CellInfo = type { ptr, i64, i32, i32, i16, i16, i16, i16 }
-	%struct.CollSeq = type { ptr, i8, i8, ptr, ptr, ptr }
-	%struct.Column = type { ptr, ptr, ptr, ptr, i8, i8, i8, i8 }
-	%struct.Context = type { i64, i32, %struct.Fifo }
-	%struct.CountCtx = type { i64 }
-	%struct.Cursor = type { ptr, i32, i64, i64, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i64, ptr, i32, ptr, i64, ptr, ptr, i32, i64, ptr, ptr, i32, i32, ptr, ptr, ptr }
-	%struct.Db = type { ptr, ptr, i8, i8, ptr, ptr, ptr }
-	%struct.Expr = type { i8, i8, i16, ptr, ptr, ptr, ptr, %struct..4sPragmaType, %struct..4sPragmaType, i32, i32, ptr, i32, i32, ptr, ptr, i32 }
-	%struct.ExprList = type { i32, i32, i32, ptr }
-	%struct.ExprList_item = type { ptr, ptr, i8, i8, i8 }
-	%struct.FKey = type { ptr, ptr, ptr, ptr, i32, ptr, i8, i8, i8, i8 }
-	%struct.Fifo = type { i32, ptr, ptr }
-	%struct.FifoPage = type { i32, i32, i32, ptr, [1 x i64] }
-	%struct.FuncDef = type { i16, i8, i8, i8, ptr, ptr, ptr, ptr, ptr, [1 x i8] }
-	%struct.Hash = type { i8, i8, i32, i32, ptr, ptr }
-	%struct.HashElem = type { ptr, ptr, ptr, ptr, i32 }
-	%struct.IdList = type { ptr, i32, i32 }
-	%struct.Index = type { ptr, i32, ptr, ptr, ptr, i32, i8, i8, ptr, ptr, ptr, ptr, ptr }
-	%struct.KeyInfo = type { ptr, i8, i8, i8, i32, ptr, [1 x ptr] }
-	%struct.Mem = type { %struct.CountCtx, double, ptr, ptr, i32, i16, i8, i8, ptr }
-	%struct.MemPage = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i16, i16, i16, i16, i16, i16, [5 x %struct._OvflCell], ptr, ptr, ptr, i32, ptr }
-	%struct.Module = type { ptr, ptr, ptr, ptr }
-	%struct.Op = type { i8, i8, i8, i8, i32, i32, i32, { i32 } }
-	%struct.Pager = type { ptr, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, %struct.PagerLruList, ptr, ptr, ptr, i64, i64, i64, i64, i64, i32, ptr, ptr, i32, ptr, ptr, [16 x i8] }
-	%struct.PagerLruLink = type { ptr, ptr }
-	%struct.PagerLruList = type { ptr, ptr, ptr }
-	%struct.Parse = type { ptr, i32, ptr, ptr, i8, i8, i8, i8, i8, i8, i8, [8 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [12 x i32], i32, ptr, i32, i32, i32, i32, i32, ptr, i8, %struct..4sPragmaType, %struct..4sPragmaType, %struct..4sPragmaType, ptr, ptr, ptr, ptr, ptr, ptr, %struct..4sPragmaType, i8, ptr, i32 }
-	%struct.PgHdr = type { ptr, i32, ptr, ptr, %struct.PagerLruLink, ptr, i8, i8, i8, i8, i8, i16, ptr, ptr, ptr }
-	%struct.Schema = type { i32, %struct.Hash, %struct.Hash, %struct.Hash, %struct.Hash, ptr, i8, i8, i16, i32, ptr }
-	%struct.Select = type { ptr, i8, i8, i8, i8, i8, i8, i8, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, [3 x i32] }
-	%struct.SrcList = type { i16, i16, [1 x %struct.SrcList_item] }
-	%struct.SrcList_item = type { ptr, ptr, ptr, ptr, ptr, i8, i8, i32, ptr, ptr, i64 }
-	%struct.Table = type { ptr, i32, ptr, i32, ptr, i32, ptr, i32, ptr, ptr, ptr, ptr, i32, i8, i8, i8, i8, i8, i8, i8, ptr, ptr, i32, ptr, ptr }
-	%struct.TableLock = type { i32, i32, i8, ptr }
-	%struct.Trigger = type { ptr, ptr, i8, i8, ptr, ptr, %struct..4sPragmaType, ptr, ptr, ptr, ptr }
-	%struct.TriggerStack = type { ptr, i32, i32, i32, i32, i32, i32, ptr, ptr }
-	%struct.TriggerStep = type { i32, i32, ptr, ptr, %struct..4sPragmaType, ptr, ptr, ptr, ptr, ptr }
-	%struct.Vdbe = type { ptr, ptr, ptr, i32, i32, ptr, i32, i32, ptr, ptr, ptr, i32, ptr, i32, ptr, ptr, i32, i32, i32, ptr, i32, i32, %struct.Fifo, i32, i32, ptr, i32, i32, i32, i32, i32, [25 x i32], i32, i32, ptr, ptr, ptr, i8, i8, i8, i8, i8, i8, i32, i64, i32, %struct.BtreeMutexArray, i32, ptr, i32 }
-	%struct.VdbeFunc = type { ptr, i32, [1 x %struct.AuxData] }
-	%struct._OvflCell = type { ptr, i16 }
-	%struct._ht = type { i32, ptr }
-	%struct.anon = type { double }
-	%struct.sColMap = type { i32, ptr }
-	%struct.sqlite3 = type { ptr, i32, ptr, i32, i32, i32, i32, i8, i8, i8, i8, i32, ptr, i64, i64, i32, i32, i32, ptr, %struct.sqlite3InitInfo, i32, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, %struct.anon, ptr, ptr, ptr, ptr, i32, %struct.Hash, ptr, ptr, i32, %struct.Hash, %struct.Hash, %struct.BusyHandler, i32, [2 x %struct.Db], i8 }
-	%struct.sqlite3InitInfo = type { i32, i32, i8 }
-	%struct.sqlite3_context = type { ptr, ptr, %struct.Mem, ptr, i32, ptr }
-	%struct.sqlite3_file = type { ptr }
-	%struct.sqlite3_index_constraint = type { i32, i8, i8, i32 }
-	%struct.sqlite3_index_constraint_usage = type { i32, i8 }
-	%struct.sqlite3_index_info = type { i32, ptr, i32, ptr, ptr, i32, ptr, i32, i32, double }
-	%struct.sqlite3_io_methods = type { i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
-	%struct.sqlite3_module = type { i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
-	%struct.sqlite3_mutex = type opaque
-	%struct.sqlite3_vfs = type { i32, i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
-	%struct.sqlite3_vtab = type { ptr, i32, ptr }
-	%struct.sqlite3_vtab_cursor = type { ptr }
+  %struct..4sPragmaType = type { ptr, i32 }
+  %struct.AggInfo = type { i8, i8, i32, ptr, i32, ptr, i32, i32, i32, ptr, i32, i32 }
+  %struct.AggInfo_col = type { ptr, i32, i32, i32, i32, ptr }
+  %struct.AggInfo_func = type { ptr, ptr, i32, i32 }
+  %struct.AuxData = type { ptr, ptr }
+  %struct.Bitvec = type { i32, i32, i32, { [125 x i32] } }
+  %struct.BtCursor = type { ptr, ptr, ptr, ptr, ptr, ptr, i32, ptr, i32, %struct.CellInfo, i8, i8, ptr, i64, i32, i8, ptr }
+  %struct.BtLock = type { ptr, i32, i8, ptr }
+  %struct.BtShared = type { ptr, ptr, ptr, ptr, i8, i8, i8, i8, i8, i8, i8, i8, i32, i16, i16, i32, i32, i32, i32, i8, i32, ptr, ptr, ptr, %struct.BusyHandler, i32, ptr, ptr, ptr }
+  %struct.Btree = type { ptr, ptr, i8, i8, i8, i32, ptr, ptr }
+  %struct.BtreeMutexArray = type { i32, [11 x ptr] }
+  %struct.BusyHandler = type { ptr, ptr, i32 }
+  %struct.CellInfo = type { ptr, i64, i32, i32, i16, i16, i16, i16 }
+  %struct.CollSeq = type { ptr, i8, i8, ptr, ptr, ptr }
+  %struct.Column = type { ptr, ptr, ptr, ptr, i8, i8, i8, i8 }
+  %struct.Context = type { i64, i32, %struct.Fifo }
+  %struct.CountCtx = type { i64 }
+  %struct.Cursor = type { ptr, i32, i64, i64, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i64, ptr, i32, ptr, i64, ptr, ptr, i32, i64, ptr, ptr, i32, i32, ptr, ptr, ptr }
+  %struct.Db = type { ptr, ptr, i8, i8, ptr, ptr, ptr }
+  %struct.Expr = type { i8, i8, i16, ptr, ptr, ptr, ptr, %struct..4sPragmaType, %struct..4sPragmaType, i32, i32, ptr, i32, i32, ptr, ptr, i32 }
+  %struct.ExprList = type { i32, i32, i32, ptr }
+  %struct.ExprList_item = type { ptr, ptr, i8, i8, i8 }
+  %struct.FKey = type { ptr, ptr, ptr, ptr, i32, ptr, i8, i8, i8, i8 }
+  %struct.Fifo = type { i32, ptr, ptr }
+  %struct.FifoPage = type { i32, i32, i32, ptr, [1 x i64] }
+  %struct.FuncDef = type { i16, i8, i8, i8, ptr, ptr, ptr, ptr, ptr, [1 x i8] }
+  %struct.Hash = type { i8, i8, i32, i32, ptr, ptr }
+  %struct.HashElem = type { ptr, ptr, ptr, ptr, i32 }
+  %struct.IdList = type { ptr, i32, i32 }
+  %struct.Index = type { ptr, i32, ptr, ptr, ptr, i32, i8, i8, ptr, ptr, ptr, ptr, ptr }
+  %struct.KeyInfo = type { ptr, i8, i8, i8, i32, ptr, [1 x ptr] }
+  %struct.Mem = type { %struct.CountCtx, double, ptr, ptr, i32, i16, i8, i8, ptr }
+  %struct.MemPage = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i16, i16, i16, i16, i16, i16, [5 x %struct._OvflCell], ptr, ptr, ptr, i32, ptr }
+  %struct.Module = type { ptr, ptr, ptr, ptr }
+  %struct.Op = type { i8, i8, i8, i8, i32, i32, i32, { i32 } }
+  %struct.Pager = type { ptr, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, %struct.PagerLruList, ptr, ptr, ptr, i64, i64, i64, i64, i64, i32, ptr, ptr, i32, ptr, ptr, [16 x i8] }
+  %struct.PagerLruLink = type { ptr, ptr }
+  %struct.PagerLruList = type { ptr, ptr, ptr }
+  %struct.Parse = type { ptr, i32, ptr, ptr, i8, i8, i8, i8, i8, i8, i8, [8 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [12 x i32], i32, ptr, i32, i32, i32, i32, i32, ptr, i8, %struct..4sPragmaType, %struct..4sPragmaType, %struct..4sPragmaType, ptr, ptr, ptr, ptr, ptr, ptr, %struct..4sPragmaType, i8, ptr, i32 }
+  %struct.PgHdr = type { ptr, i32, ptr, ptr, %struct.PagerLruLink, ptr, i8, i8, i8, i8, i8, i16, ptr, ptr, ptr }
+  %struct.Schema = type { i32, %struct.Hash, %struct.Hash, %struct.Hash, %struct.Hash, ptr, i8, i8, i16, i32, ptr }
+  %struct.Select = type { ptr, i8, i8, i8, i8, i8, i8, i8, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, [3 x i32] }
+  %struct.SrcList = type { i16, i16, [1 x %struct.SrcList_item] }
+  %struct.SrcList_item = type { ptr, ptr, ptr, ptr, ptr, i8, i8, i32, ptr, ptr, i64 }
+  %struct.Table = type { ptr, i32, ptr, i32, ptr, i32, ptr, i32, ptr, ptr, ptr, ptr, i32, i8, i8, i8, i8, i8, i8, i8, ptr, ptr, i32, ptr, ptr }
+  %struct.TableLock = type { i32, i32, i8, ptr }
+  %struct.Trigger = type { ptr, ptr, i8, i8, ptr, ptr, %struct..4sPragmaType, ptr, ptr, ptr, ptr }
+  %struct.TriggerStack = type { ptr, i32, i32, i32, i32, i32, i32, ptr, ptr }
+  %struct.TriggerStep = type { i32, i32, ptr, ptr, %struct..4sPragmaType, ptr, ptr, ptr, ptr, ptr }
+  %struct.Vdbe = type { ptr, ptr, ptr, i32, i32, ptr, i32, i32, ptr, ptr, ptr, i32, ptr, i32, ptr, ptr, i32, i32, i32, ptr, i32, i32, %struct.Fifo, i32, i32, ptr, i32, i32, i32, i32, i32, [25 x i32], i32, i32, ptr, ptr, ptr, i8, i8, i8, i8, i8, i8, i32, i64, i32, %struct.BtreeMutexArray, i32, ptr, i32 }
+  %struct.VdbeFunc = type { ptr, i32, [1 x %struct.AuxData] }
+  %struct._OvflCell = type { ptr, i16 }
+  %struct._ht = type { i32, ptr }
+  %struct.anon = type { double }
+  %struct.sColMap = type { i32, ptr }
+  %struct.sqlite3 = type { ptr, i32, ptr, i32, i32, i32, i32, i8, i8, i8, i8, i32, ptr, i64, i64, i32, i32, i32, ptr, %struct.sqlite3InitInfo, i32, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, %struct.anon, ptr, ptr, ptr, ptr, i32, %struct.Hash, ptr, ptr, i32, %struct.Hash, %struct.Hash, %struct.BusyHandler, i32, [2 x %struct.Db], i8 }
+  %struct.sqlite3InitInfo = type { i32, i32, i8 }
+  %struct.sqlite3_context = type { ptr, ptr, %struct.Mem, ptr, i32, ptr }
+  %struct.sqlite3_file = type { ptr }
+  %struct.sqlite3_index_constraint = type { i32, i8, i8, i32 }
+  %struct.sqlite3_index_constraint_usage = type { i32, i8 }
+  %struct.sqlite3_index_info = type { i32, ptr, i32, ptr, ptr, i32, ptr, i32, i32, double }
+  %struct.sqlite3_io_methods = type { i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
+  %struct.sqlite3_module = type { i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
+  %struct.sqlite3_mutex = type opaque
+  %struct.sqlite3_vfs = type { i32, i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
+  %struct.sqlite3_vtab = type { ptr, i32, ptr }
+  %struct.sqlite3_vtab_cursor = type { ptr }
 
 define fastcc void @sqlite3Insert(ptr %pParse, ptr %pTabList, ptr %pList, ptr %pSelect, ptr %pColumn, i32 %onError) nounwind {
 entry:
-	br i1 false, label %bb54, label %bb69.loopexit
+  br i1 false, label %bb54, label %bb69.loopexit
 
 bb54:		; preds = %entry
-	br label %bb69.loopexit
+  br label %bb69.loopexit
 
 bb59:		; preds = %bb63.preheader
-	%0 = load ptr, ptr %3, align 4		; <ptr> [#uses=0]
-	br label %bb65
+  %0 = load ptr, ptr %3, align 4		; <ptr> [#uses=0]
+  br label %bb65
 
 bb65:		; preds = %bb63.preheader, %bb59
-	%1 = load ptr, ptr %4, align 4		; <ptr> [#uses=0]
-	br i1 false, label %bb67, label %bb63.preheader
+  %1 = load ptr, ptr %4, align 4		; <ptr> [#uses=0]
+  br i1 false, label %bb67, label %bb63.preheader
 
 bb67:		; preds = %bb65
-	%2 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=0]
-	unreachable
+  %2 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=0]
+  unreachable
 
 bb69.loopexit:		; preds = %bb54, %entry
-	%3 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=1]
-	%4 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=1]
-	br label %bb63.preheader
+  %3 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=1]
+  %4 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=1]
+  br label %bb63.preheader
 
 bb63.preheader:		; preds = %bb69.loopexit, %bb65
-	br i1 false, label %bb59, label %bb65
+  br i1 false, label %bb59, label %bb65
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-03-10-PREOnVoid.ll b/llvm/test/Transforms/NewGVN/2009-03-10-PREOnVoid.ll
index 6aa79e0433d48..83177e5498fe8 100644
--- a/llvm/test/Transforms/NewGVN/2009-03-10-PREOnVoid.ll
+++ b/llvm/test/Transforms/NewGVN/2009-03-10-PREOnVoid.ll
@@ -1,21 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -disable-output
 ; PR3775
 
 ; ModuleID = 'bugpoint-reduced-simplified.bc'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
 target triple = "i386-pc-linux-gnu"
-	%llvm.dbg.anchor.type = type { i32, i32 }
-	%"struct.__gnu_cxx::hash<ptr>" = type <{ i8 }>
-	%struct.__sched_param = type { i32 }
-	%struct._pthread_descr_struct = type opaque
-	%struct.pthread_attr_t = type { i32, i32, %struct.__sched_param, i32, i32, i32, i32, ptr, i32 }
-	%struct.pthread_mutex_t = type { i32, i32, ptr, i32, %llvm.dbg.anchor.type }
-	%"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >" = type { %"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >::_Rb_tree_impl<std::less<ptr>,false>" }
-	%"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >::_Rb_tree_impl<std::less<ptr>,false>" = type { %"struct.__gnu_cxx::hash<ptr>", %"struct.std::_Rb_tree_node_base", i32 }
-	%"struct.std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >" = type { ptr }
-	%"struct.std::_Rb_tree_node_base" = type { i32, ptr, ptr, ptr }
-	%"struct.std::pair<std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,bool>" = type { %"struct.std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >", i8 }
-	%"struct.std::pair<ptr const,ptr>" = type { ptr, ptr }
+  %llvm.dbg.anchor.type = type { i32, i32 }
+  %"struct.__gnu_cxx::hash<ptr>" = type <{ i8 }>
+  %struct.__sched_param = type { i32 }
+  %struct._pthread_descr_struct = type opaque
+  %struct.pthread_attr_t = type { i32, i32, %struct.__sched_param, i32, i32, i32, i32, ptr, i32 }
+  %struct.pthread_mutex_t = type { i32, i32, ptr, i32, %llvm.dbg.anchor.type }
+  %"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >" = type { %"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >::_Rb_tree_impl<std::less<ptr>,false>" }
+  %"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >::_Rb_tree_impl<std::less<ptr>,false>" = type { %"struct.__gnu_cxx::hash<ptr>", %"struct.std::_Rb_tree_node_base", i32 }
+  %"struct.std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >" = type { ptr }
+  %"struct.std::_Rb_tree_node_base" = type { i32, ptr, ptr, ptr }
+  %"struct.std::pair<std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,bool>" = type { %"struct.std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >", i8 }
+  %"struct.std::pair<ptr const,ptr>" = type { ptr, ptr }
 
 @_ZL20__gthrw_pthread_oncePiPFvvE = weak alias i32 (ptr, ptr), ptr @pthread_once		; <ptr> [#uses=0]
 @_ZL27__gthrw_pthread_getspecificj = weak alias ptr (i32), ptr @pthread_getspecific		; <ptr> [#uses=0]
@@ -36,75 +37,75 @@ declare fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind readnone
 
 define fastcc void @_ZNSt8_Rb_treeIPvSt4pairIKS0_S0_ESt10_Select1stIS3_ESt4lessIS0_ESaIS3_EE16_M_insert_uniqueERKS3_(ptr noalias nocapture sret(%"struct.std::pair<std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,bool>") %agg.result, ptr %this, ptr %__v) nounwind {
 entry:
-	br i1 false, label %bb7, label %bb
+  br i1 false, label %bb7, label %bb
 
 bb:		; preds = %bb, %entry
-	br i1 false, label %bb5, label %bb
+  br i1 false, label %bb5, label %bb
 
 bb5:		; preds = %bb
-	call fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind
-	br i1 false, label %bb11, label %bb7
+  call fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind
+  br i1 false, label %bb11, label %bb7
 
 bb7:		; preds = %bb5, %entry
-	br label %bb11
+  br label %bb11
 
 bb11:		; preds = %bb7, %bb5
-	call fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind
-	unreachable
+  call fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind
+  unreachable
 }
 
 define i32 @pthread_once(ptr, ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define ptr @pthread_getspecific(i32) {
-       ret ptr null
+  ret ptr null
 }
 
 define i32 @pthread_setspecific(i32, ptr) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_create(ptr, ptr, ptr, ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_cancel(i32) {
-      ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutex_lock(ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutex_trylock(ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutex_unlock(ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutex_init(ptr, ptr) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_key_create(ptr, ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_key_delete(i32) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutexattr_init(ptr) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutexattr_settype(ptr, i32) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutexattr_destroy(ptr) {
-       ret i32 0
+  ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-07-13-MemDepSortFail.ll b/llvm/test/Transforms/NewGVN/2009-07-13-MemDepSortFail.ll
index 24ad1852780cd..9694ae47e977c 100644
--- a/llvm/test/Transforms/NewGVN/2009-07-13-MemDepSortFail.ll
+++ b/llvm/test/Transforms/NewGVN/2009-07-13-MemDepSortFail.ll
@@ -1,67 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 ; PR4256
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
 target triple = "i386-pc-linux-gnu"
-	%llvm.dbg.anchor.type = type { i32, i32 }
-	%struct.cset = type { ptr, i8, i8, i32, ptr }
-	%struct.lmat = type { ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr }
-	%struct.re_guts = type { ptr, ptr, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, i32, i32, i32, i32, [1 x i8] }
+  %llvm.dbg.anchor.type = type { i32, i32 }
+  %struct.cset = type { ptr, i8, i8, i32, ptr }
+  %struct.lmat = type { ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr }
+  %struct.re_guts = type { ptr, ptr, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, i32, i32, i32, i32, [1 x i8] }
 
 define ptr @lbackref(ptr %m, ptr %start, ptr %stop, i32 %startst, i32 %stopst, i32 %lev, i32 %rec) nounwind {
 entry:
-	br label %bb63
+  br label %bb63
 
 bb:		; preds = %bb63
-	switch i32 0, label %bb62 [
-		i32 268435456, label %bb2
-		i32 805306368, label %bb9
-		i32 -1610612736, label %bb51
-	]
+  switch i32 0, label %bb62 [
+  i32 268435456, label %bb2
+  i32 805306368, label %bb9
+  i32 -1610612736, label %bb51
+  ]
 
 bb2:		; preds = %bb
-	br label %bb62
+  br label %bb62
 
 bb9:		; preds = %bb
-	%0 = load i8, ptr %sp.1, align 1		; <i8> [#uses=0]
-	br label %bb62
+  %0 = load i8, ptr %sp.1, align 1		; <i8> [#uses=0]
+  br label %bb62
 
 bb51:		; preds = %bb
-	%1 = load i8, ptr %sp.1, align 1		; <i8> [#uses=0]
-	ret ptr null
+  %1 = load i8, ptr %sp.1, align 1		; <i8> [#uses=0]
+  ret ptr null
 
 bb62:		; preds = %bb9, %bb2, %bb
-	br label %bb63
+  br label %bb63
 
 bb63:		; preds = %bb84, %bb69, %bb62, %entry
-	%sp.1 = phi ptr [ null, %bb62 ], [ %sp.1.lcssa, %bb84 ], [ %start, %entry ], [ %sp.1.lcssa, %bb69 ]		; <ptr> [#uses=3]
-	br i1 false, label %bb, label %bb65
+  %sp.1 = phi ptr [ null, %bb62 ], [ %sp.1.lcssa, %bb84 ], [ %start, %entry ], [ %sp.1.lcssa, %bb69 ]		; <ptr> [#uses=3]
+  br i1 false, label %bb, label %bb65
 
 bb65:		; preds = %bb63
-	%sp.1.lcssa = phi ptr [ %sp.1, %bb63 ]		; <ptr> [#uses=4]
-	br i1 false, label %bb66, label %bb69
+  %sp.1.lcssa = phi ptr [ %sp.1, %bb63 ]		; <ptr> [#uses=4]
+  br i1 false, label %bb66, label %bb69
 
 bb66:		; preds = %bb65
-	ret ptr null
+  ret ptr null
 
 bb69:		; preds = %bb65
-	switch i32 0, label %bb108.loopexit2.loopexit.loopexit [
-		i32 1342177280, label %bb63
-		i32 1476395008, label %bb84
-		i32 1879048192, label %bb104
-		i32 2013265920, label %bb93
-	]
+  switch i32 0, label %bb108.loopexit2.loopexit.loopexit [
+  i32 1342177280, label %bb63
+  i32 1476395008, label %bb84
+  i32 1879048192, label %bb104
+  i32 2013265920, label %bb93
+  ]
 
 bb84:		; preds = %bb69
-	%2 = tail call ptr @lbackref(ptr %m, ptr %sp.1.lcssa, ptr %stop, i32 0, i32 %stopst, i32 0, i32 0) nounwind		; <ptr> [#uses=0]
-	br label %bb63
+  %2 = tail call ptr @lbackref(ptr %m, ptr %sp.1.lcssa, ptr %stop, i32 0, i32 %stopst, i32 0, i32 0) nounwind		; <ptr> [#uses=0]
+  br label %bb63
 
 bb93:		; preds = %bb69
-	ret ptr null
+  ret ptr null
 
 bb104:		; preds = %bb69
-	%sp.1.lcssa.lcssa33 = phi ptr [ %sp.1.lcssa, %bb69 ]		; <ptr> [#uses=0]
-	unreachable
+  %sp.1.lcssa.lcssa33 = phi ptr [ %sp.1.lcssa, %bb69 ]		; <ptr> [#uses=0]
+  unreachable
 
 bb108.loopexit2.loopexit.loopexit:		; preds = %bb69
-	ret ptr null
+  ret ptr null
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll b/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll
index 3eda7ca1b812c..c49f651437cbb 100644
--- a/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll
+++ b/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll
@@ -1,14 +1,19 @@
-; Test to make sure malloc's bitcast does not block detection of a store 
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; Test to make sure malloc's bitcast does not block detection of a store
 ; to aliased memory; GVN should not optimize away the load in this program.
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 define i64 @test() {
+; CHECK-LABEL: define i64 @test() {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call ptr @malloc(i64 mul (i64 ptrtoint (ptr getelementptr (i64, ptr null, i64 1) to i64), i64 4))
+; CHECK-NEXT:    store i8 42, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[Y:%.*]] = load i64, ptr [[TMP1]], align 4
+; CHECK-NEXT:    ret i64 [[Y]]
+;
   %1 = tail call ptr @malloc(i64 mul (i64 4, i64 ptrtoint (ptr getelementptr (i64, ptr null, i64 1) to i64))) ; <ptr> [#uses=2]
   store i8 42, ptr %1
   %Y = load i64, ptr %1                               ; <i64> [#uses=1]
   ret i64 %Y
-; CHECK: %Y = load i64, ptr %1
-; CHECK: ret i64 %Y
 }
 
 declare noalias ptr @malloc(i64)
diff --git a/llvm/test/Transforms/NewGVN/2010-03-31-RedundantPHIs.ll b/llvm/test/Transforms/NewGVN/2010-03-31-RedundantPHIs.ll
index 321f3cff89a1e..c6fc7b99cdf8d 100644
--- a/llvm/test/Transforms/NewGVN/2010-03-31-RedundantPHIs.ll
+++ b/llvm/test/Transforms/NewGVN/2010-03-31-RedundantPHIs.ll
@@ -1,9 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 ; CHECK-NOT: load
 ; CHECK-NOT: phi
 
 define ptr @cat(ptr %s1, ...) nounwind {
+; CHECK-LABEL: define ptr @cat(
+; CHECK-SAME: ptr [[S1:%.*]], ...) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[BB:%.*]], label [[BB3:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    unreachable
+; CHECK:       bb3:
+; CHECK-NEXT:    store ptr undef, ptr undef, align 4
+; CHECK-NEXT:    br i1 undef, label [[BB5:%.*]], label [[BB6:%.*]]
+; CHECK:       bb5:
+; CHECK-NEXT:    unreachable
+; CHECK:       bb6:
+; CHECK-NEXT:    br label [[BB12:%.*]]
+; CHECK:       bb8:
+; CHECK-NEXT:    br i1 undef, label [[BB9:%.*]], label [[BB10:%.*]]
+; CHECK:       bb9:
+; CHECK-NEXT:    br label [[BB11:%.*]]
+; CHECK:       bb10:
+; CHECK-NEXT:    br label [[BB11]]
+; CHECK:       bb11:
+; CHECK-NEXT:    br label [[BB12]]
+; CHECK:       bb12:
+; CHECK-NEXT:    br i1 undef, label [[BB8:%.*]], label [[BB13:%.*]]
+; CHECK:       bb13:
+; CHECK-NEXT:    ret ptr undef
+;
 entry:
   br i1 undef, label %bb, label %bb3
 
diff --git a/llvm/test/Transforms/NewGVN/2010-05-08-OneBit.ll b/llvm/test/Transforms/NewGVN/2010-05-08-OneBit.ll
index 0d2d45a5e0d9f..0a121ffab761f 100644
--- a/llvm/test/Transforms/NewGVN/2010-05-08-OneBit.ll
+++ b/llvm/test/Transforms/NewGVN/2010-05-08-OneBit.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn
 ; PR7052
 
@@ -12,7 +13,7 @@ entry:
 
 l117.i.i:                                         ; preds = %entry
   invoke fastcc void @foo()
-          to label %.noexc5 unwind label %landing_pad
+  to label %.noexc5 unwind label %landing_pad
 
 .noexc5:                                          ; preds = %l117.i.i
   unreachable
@@ -22,7 +23,7 @@ k121.i.i:                                         ; preds = %entry
 
 l129.i.i:                                         ; preds = %k121.i.i
   invoke fastcc void @foo()
-          to label %.noexc7 unwind label %landing_pad
+  to label %.noexc7 unwind label %landing_pad
 
 .noexc7:                                          ; preds = %l129.i.i
   unreachable
@@ -34,7 +35,7 @@ k133.i.i:                                         ; preds = %k121.i.i
 
 l147.i.i:                                         ; preds = %k133.i.i
   invoke fastcc void @foo()
-          to label %.noexc10 unwind label %landing_pad
+  to label %.noexc10 unwind label %landing_pad
 
 .noexc10:                                         ; preds = %l147.i.i
   unreachable
@@ -44,10 +45,10 @@ k151.i.i:                                         ; preds = %k133.i.i
 
 landing_pad:                                      ; preds = %l147.i.i, %l129.i.i, %l117.i.i
   %exn = landingpad {ptr, i32}
-            cleanup
+  cleanup
   switch i32 undef, label %fin [
-    i32 1, label %catch1
-    i32 2, label %catch
+  i32 1, label %catch1
+  i32 2, label %catch
   ]
 
 fin:                                              ; preds = %landing_pad
diff --git a/llvm/test/Transforms/NewGVN/2010-11-13-Simplify.ll b/llvm/test/Transforms/NewGVN/2010-11-13-Simplify.ll
index b06570b0ebf8d..3d12783ee53cc 100644
--- a/llvm/test/Transforms/NewGVN/2010-11-13-Simplify.ll
+++ b/llvm/test/Transforms/NewGVN/2010-11-13-Simplify.ll
@@ -1,9 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 declare i32 @foo(i32) readnone
 
 define i1 @bar() {
-; CHECK-LABEL: @bar(
+; CHECK-LABEL: define i1 @bar() {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(i32 0) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    [[X:%.*]] = call i32 @foo(i32 [[A]]) #[[ATTR0]]
+; CHECK-NEXT:    ret i1 true
+;
   %a = call i32 @foo (i32 0) readnone
   %b = call i32 @foo (i32 0) readnone
   %c = and i32 %a, %b
@@ -11,5 +16,4 @@ define i1 @bar() {
   %y = call i32 @foo (i32 %c) readnone
   %z = icmp eq i32 %x, %y
   ret i1 %z
-; CHECK: ret i1 true
-} 
+}
diff --git a/llvm/test/Transforms/NewGVN/2011-04-27-phioperands.ll b/llvm/test/Transforms/NewGVN/2011-04-27-phioperands.ll
index 3e8a5d84405ea..c039422be84ed 100644
--- a/llvm/test/Transforms/NewGVN/2011-04-27-phioperands.ll
+++ b/llvm/test/Transforms/NewGVN/2011-04-27-phioperands.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
@@ -64,10 +65,10 @@ doemit.exit76.i:
 
 "<bb 64>.i":
   switch i32 undef, label %"<bb 5>" [
-    i32 42, label %"<L54>.i"
-    i32 43, label %"<L55>.i"
-    i32 63, label %"<L56>.i"
-    i32 123, label %"<bb 5>.i258.i"
+  i32 42, label %"<L54>.i"
+  i32 43, label %"<L55>.i"
+  i32 63, label %"<L56>.i"
+  i32 123, label %"<bb 5>.i258.i"
   ]
 
 "<L54>.i":
@@ -93,14 +94,14 @@ doemit.exit127.i:
 
 "<bb 5>":
   switch i32 undef, label %"<L39>.i" [
-    i32 36, label %"<L19>.i"
-    i32 94, label %"<L18>.i"
-    i32 124, label %"<L98>.i"
-    i32 42, label %"<L99>.i"
-    i32 43, label %"<L99>.i"
-    i32 46, label %"<L24>.i"
-    i32 63, label %"<L99>.i"
-    i32 91, label %"<L28>.i"
-    i32 92, label %"<L29>.i"
+  i32 36, label %"<L19>.i"
+  i32 94, label %"<L18>.i"
+  i32 124, label %"<L98>.i"
+  i32 42, label %"<L99>.i"
+  i32 43, label %"<L99>.i"
+  i32 46, label %"<L24>.i"
+  i32 63, label %"<L99>.i"
+  i32 91, label %"<L28>.i"
+  i32 92, label %"<L29>.i"
   ]
 }
diff --git a/llvm/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll b/llvm/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll
index c547e8fac5e58..444385d81ab41 100644
--- a/llvm/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll
+++ b/llvm/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll
@@ -1,9 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 ;
 
 %0 = type { i64, i1 }
 
 define i64 @test1(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test1(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[UADD_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[UADD_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %uadd = tail call %0 @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %uadd.0 = extractvalue %0 %uadd, 0
@@ -12,11 +25,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test1(
-; CHECK-NOT: add1
-; CHECK: ret
 
 define i64 @test2(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test2(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[USUB_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[USUB_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %usub = tail call %0 @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %usub.0 = extractvalue %0 %usub, 0
@@ -25,11 +47,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test2(
-; CHECK-NOT: sub1
-; CHECK: ret
 
 define i64 @test3(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test3(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[UMUL_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[UMUL_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %umul = tail call %0 @llvm.umul.with.overflow.i64(i64 %a, i64 %b)
   %umul.0 = extractvalue %0 %umul, 0
@@ -38,11 +69,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test3(
-; CHECK-NOT: mul1
-; CHECK: ret
 
 define i64 @test4(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test4(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[SADD_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[SADD_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %sadd = tail call %0 @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
   %sadd.0 = extractvalue %0 %sadd, 0
@@ -51,11 +91,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test4(
-; CHECK-NOT: add1
-; CHECK: ret
 
 define i64 @test5(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test5(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[SSUB_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[SSUB_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %ssub = tail call %0 @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
   %ssub.0 = extractvalue %0 %ssub, 0
@@ -64,11 +113,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test5(
-; CHECK-NOT: sub1
-; CHECK: ret
 
 define i64 @test6(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test6(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[SMUL_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[SMUL_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %smul = tail call %0 @llvm.smul.with.overflow.i64(i64 %a, i64 %b)
   %smul.0 = extractvalue %0 %smul, 0
@@ -77,9 +135,6 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test6(
-; CHECK-NOT: mul1
-; CHECK: ret
 
 declare void @exit(i32) noreturn
 declare %0 @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll b/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
index 46e3c2830920c..675e7da26a105 100644
--- a/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
+++ b/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 %struct.__fundamental_type_info_pseudo = type { %struct.__type_info_pseudo }
 %struct.__type_info_pseudo = type { ptr, ptr }
@@ -18,26 +19,70 @@ declare void @__cxa_end_catch()
 declare i32 @__gxx_personality_v0(i32, i64, ptr, ptr)
 
 define void @_Z3foov() uwtable personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: define void @_Z3foov(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] personality ptr @__gxx_personality_v0 {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    invoke void @_Z4barv()
+; CHECK-NEXT:            to label [[RETURN:%.*]] unwind label [[LPAD:%.*]]
+; CHECK:       lpad:
+; CHECK-NEXT:    [[TMP0:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT:            catch ptr @_ZTIi
+; CHECK-NEXT:            catch ptr @_ZTIb
+; CHECK-NEXT:            catch ptr @_ZTIi
+; CHECK-NEXT:            catch ptr @_ZTIb
+; CHECK-NEXT:    [[EXC_PTR2_I:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0
+; CHECK-NEXT:    [[FILTER3_I:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1
+; CHECK-NEXT:    [[TYPEID_I:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[FILTER3_I]], [[TYPEID_I]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[PPAD:%.*]], label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    [[TYPEID1_I:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[FILTER3_I]], [[TYPEID1_I]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PPAD2:%.*]], label [[NEXT2:%.*]]
+; CHECK:       ppad:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXC_PTR2_I]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    tail call void @__cxa_end_catch() #[[ATTR1]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       ppad2:
+; CHECK-NEXT:    [[D_2073_5_I:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXC_PTR2_I]]) #[[ATTR1]]
+; CHECK-NEXT:    tail call void @__cxa_end_catch() #[[ATTR1]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       next2:
+; CHECK-NEXT:    call void @_Z7cleanupv()
+; CHECK-NEXT:    br i1 [[TMP1]], label [[PPAD3:%.*]], label [[NEXT3:%.*]]
+; CHECK:       next3:
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PPAD4:%.*]], label [[UNWIND:%.*]]
+; CHECK:       unwind:
+; CHECK-NEXT:    resume { ptr, i32 } [[TMP0]]
+; CHECK:       ppad3:
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXC_PTR2_I]]) #[[ATTR1]]
+; CHECK-NEXT:    tail call void @__cxa_end_catch() #[[ATTR1]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       ppad4:
+; CHECK-NEXT:    [[D_2080_5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXC_PTR2_I]]) #[[ATTR1]]
+; CHECK-NEXT:    tail call void @__cxa_end_catch() #[[ATTR1]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
 entry:
   invoke void @_Z4barv()
-          to label %return unwind label %lpad
+  to label %return unwind label %lpad
 
 lpad:                                             ; preds = %entry
   %0 = landingpad { ptr, i32 }
-          catch ptr @_ZTIi
-          catch ptr @_ZTIb
-          catch ptr @_ZTIi
-          catch ptr @_ZTIb
+  catch ptr @_ZTIi
+  catch ptr @_ZTIb
+  catch ptr @_ZTIi
+  catch ptr @_ZTIb
   %exc_ptr2.i = extractvalue { ptr, i32 } %0, 0
   %filter3.i = extractvalue { ptr, i32 } %0, 1
   %typeid.i = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
-; CHECK: call i32 @llvm.eh.typeid.for
   %1 = icmp eq i32 %filter3.i, %typeid.i
   br i1 %1, label %ppad, label %next
 
 next:                                             ; preds = %lpad
   %typeid1.i = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
-; CHECK: call i32 @llvm.eh.typeid.for
   %2 = icmp eq i32 %filter3.i, %typeid1.i
   br i1 %2, label %ppad2, label %next2
 
@@ -54,7 +99,6 @@ ppad2:                                            ; preds = %next
 next2:                                            ; preds = %next
   call void @_Z7cleanupv()
   %typeid = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
-; CHECK-NOT: call i32 @llvm.eh.typeid.for
   %4 = icmp eq i32 %filter3.i, %typeid
   br i1 %4, label %ppad3, label %next3
 
diff --git a/llvm/test/Transforms/NewGVN/2012-05-22-PreCrash.ll b/llvm/test/Transforms/NewGVN/2012-05-22-PreCrash.ll
index 787a3ba70325e..1357f2b809f74 100644
--- a/llvm/test/Transforms/NewGVN/2012-05-22-PreCrash.ll
+++ b/llvm/test/Transforms/NewGVN/2012-05-22-PreCrash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn
 ; PR12858
 
diff --git a/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll b/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll
index 2fb275d7aaf4d..7b3f33bd4dbea 100644
--- a/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll
+++ b/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; XFAIL: *
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
@@ -14,6 +15,25 @@ declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x
 ; CHECK: llvm.masked.scatter
 ; CHECK: llvm.masked.gather
 define spir_kernel void @test(<2 x ptr> %in1, <2 x ptr> %in2, ptr %out) {
+; CHECK-LABEL: define spir_kernel void @test(
+; CHECK-SAME: <2 x ptr> [[IN1:%.*]], <2 x ptr> [[IN2:%.*]], ptr [[OUT:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP_1:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP_I:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP_0]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = insertelement <2 x ptr> [[TMP_I]], ptr [[TMP_1]], i32 1
+; CHECK-NEXT:    [[IN1_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[IN1]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; CHECK-NEXT:    [[IN2_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[IN2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN1_V]], <2 x ptr> [[TMP]], i32 1, <2 x i1> <i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN2_V]], <2 x ptr> [[TMP]], i32 1, <2 x i1> <i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP_V_1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; CHECK-NEXT:    [[TMP_V_1_0:%.*]] = extractelement <2 x i32> [[TMP_V_1]], i32 0
+; CHECK-NEXT:    [[TMP_V_1_1:%.*]] = extractelement <2 x i32> [[TMP_V_1]], i32 1
+; CHECK-NEXT:    store i32 [[TMP_V_1_0]], ptr [[OUT]], align 4
+; CHECK-NEXT:    [[OUT_1:%.*]] = getelementptr i32, ptr [[OUT]], i32 1
+; CHECK-NEXT:    store i32 [[TMP_V_1_1]], ptr [[OUT_1]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   ; Just some temporary storage
   %tmp.0 = alloca i32
diff --git a/llvm/test/Transforms/NewGVN/MemdepMiscompile.ll b/llvm/test/Transforms/NewGVN/MemdepMiscompile.ll
index f2d1827d87b0d..a3f1f4d8182fb 100644
--- a/llvm/test/Transforms/NewGVN/MemdepMiscompile.ll
+++ b/llvm/test/Transforms/NewGVN/MemdepMiscompile.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-macosx10.7.0"
@@ -7,14 +8,38 @@ target triple = "x86_64-apple-macosx10.7.0"
 ; Make sure we do not replace load %shouldExit in while.cond.backedge
 ; with a phi node where the value from while.body is 0.
 define i32 @test() nounwind ssp {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHOULDEXIT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TASKSIDLE:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, ptr [[SHOULDEXIT]], align 4
+; CHECK-NEXT:    store i32 0, ptr [[TASKSIDLE]], align 4
+; CHECK-NEXT:    call void @CTestInitialize(ptr [[TASKSIDLE]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SHOULDEXIT]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label [[WHILE_BODY_LR_PH:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       while.body.lr.ph:
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    call void @RunInMode(i32 100) #[[ATTR1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TASKSIDLE]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_COND_BACKEDGE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 0, ptr [[TASKSIDLE]], align 4
+; CHECK-NEXT:    call void @TimerCreate(ptr [[SHOULDEXIT]]) #[[ATTR1]]
+; CHECK-NEXT:    br label [[WHILE_COND_BACKEDGE]]
+; CHECK:       while.cond.backedge:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SHOULDEXIT]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_COND_WHILE_END_CRIT_EDGE:%.*]]
+; CHECK:       while.cond.while.end_crit_edge:
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
-; CHECK: test()
-; CHECK: while.body:
-; CHECK: call void @RunInMode
-; CHECK: br i1 %tobool, label %while.cond.backedge, label %if.then
-; CHECK: while.cond.backedge:
-; CHECK: load i32, ptr %shouldExit
-; CHECK: br i1 %cmp, label %while.body
   %shouldExit = alloca i32, align 4
   %tasksIdle = alloca i32, align 4
   store i32 0, ptr %shouldExit, align 4
diff --git a/llvm/test/Transforms/NewGVN/addrspacecast.ll b/llvm/test/Transforms/NewGVN/addrspacecast.ll
index fea8a2f3f8586..394db28425d98 100644
--- a/llvm/test/Transforms/NewGVN/addrspacecast.ll
+++ b/llvm/test/Transforms/NewGVN/addrspacecast.ll
@@ -7,7 +7,7 @@ define ptr addrspace(1) @addrspacecast(ptr %ptr) {
 ; CHECK-NEXT:    [[Z1:%.*]] = addrspacecast ptr [[PTR:%.*]] to ptr addrspace(1)
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    store ptr addrspace(1) [[Z1]], ptr undef
+; CHECK-NEXT:    store ptr addrspace(1) [[Z1]], ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) [[Z1]]
 ;
 block1:
@@ -29,7 +29,7 @@ define ptr addrspace(1) @addrspacecast_different_result_types(ptr %ptr) {
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
 ; CHECK-NEXT:    [[Z2:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
-; CHECK-NEXT:    store ptr addrspace(2) [[Z1]], ptr undef
+; CHECK-NEXT:    store ptr addrspace(2) [[Z1]], ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) [[Z2]]
 ;
 block1:
@@ -48,7 +48,7 @@ define ptr addrspace(1) @addrspacecast_simplify(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    [[CAST0:%.*]] = addrspacecast ptr addrspace(1) [[PTR:%.*]] to ptr
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    store ptr addrspace(1) [[PTR]], ptr undef
+; CHECK-NEXT:    store ptr addrspace(1) [[PTR]], ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) [[PTR]]
 ;
 block1:
@@ -70,7 +70,7 @@ define ptr addrspace(1) @addrspacecast_constant() {
 ; CHECK-NEXT:    store ptr undef, ptr @h, align 4
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    store ptr addrspace(1) undef, ptr undef
+; CHECK-NEXT:    store ptr addrspace(1) undef, ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) undef
 ;
 block1:
@@ -88,11 +88,11 @@ block2:
 define ptr addrspace(1) @addrspacecast_leader(ptr %arg.ptr) {
 ; CHECK-LABEL: @addrspacecast_leader(
 ; CHECK-NEXT:  block1:
-; CHECK-NEXT:    [[LOAD0:%.*]] = load ptr, ptr [[ARG_PTR:%.*]]
+; CHECK-NEXT:    [[LOAD0:%.*]] = load ptr, ptr [[ARG_PTR:%.*]], align 8
 ; CHECK-NEXT:    [[Z1:%.*]] = addrspacecast ptr [[LOAD0]] to ptr addrspace(1)
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    store ptr addrspace(1) [[Z1]], ptr undef
+; CHECK-NEXT:    store ptr addrspace(1) [[Z1]], ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) [[Z1]]
 ;
 block1:
diff --git a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
index baef8b51fbc15..53190466963a1 100644
--- a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
+++ b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
@@ -245,7 +245,7 @@ bb23:                                             ; preds = %bb4
 define i8 @irreducible_memoryphi(ptr noalias %arg, ptr noalias %arg2) {
 ; CHECK-LABEL: @irreducible_memoryphi(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    store i8 0, ptr [[ARG:%.*]]
+; CHECK-NEXT:    store i8 0, ptr [[ARG:%.*]], align 1
 ; CHECK-NEXT:    br i1 undef, label [[BB2:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB2]]
diff --git a/llvm/test/Transforms/NewGVN/basic-undef-test.ll b/llvm/test/Transforms/NewGVN/basic-undef-test.ll
index 5b731fc5f9fa0..148c0227adf26 100644
--- a/llvm/test/Transforms/NewGVN/basic-undef-test.ll
+++ b/llvm/test/Transforms/NewGVN/basic-undef-test.ll
@@ -1,15 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 ; ModuleID = 'test3.ll'
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 define i32 @main(ptr %foo)  {
+; CHECK-LABEL: define i32 @main(
+; CHECK-SAME: ptr [[FOO:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[FOO]], align 4
+; CHECK-NEXT:    store i32 5, ptr undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
 entry:
-; CHECK: load i32, ptr %foo, align 4
   %0 = load i32, ptr %foo, align 4
   store i32 5, ptr undef, align 4
-; CHECK-NOT: load i32, ptr %foo, align 4
   %1 = load i32, ptr %foo, align 4
-; CHECK: add i32 %0, %0
   %2 = add i32 %0, %1
   ret i32 %2
 }
diff --git a/llvm/test/Transforms/NewGVN/br-identical.ll b/llvm/test/Transforms/NewGVN/br-identical.ll
index c99838525aa2d..23f43b01f443b 100644
--- a/llvm/test/Transforms/NewGVN/br-identical.ll
+++ b/llvm/test/Transforms/NewGVN/br-identical.ll
@@ -1,8 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S -o - %s | FileCheck %s
 
 ; If a branch has two identical successors, we cannot declare either dead.
 
 define void @widget(i1 %p) {
+; CHECK-LABEL: define void @widget(
+; CHECK-SAME: i1 [[P:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[T1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[T2:%.*]], [[BB7:%.*]] ]
+; CHECK-NEXT:    [[T2]] = add i64 [[T1]], 1
+; CHECK-NEXT:    [[T3:%.*]] = icmp ult i64 0, [[T2]]
+; CHECK-NEXT:    br i1 [[T3]], label [[BB3:%.*]], label [[BB4:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[T4:%.*]] = call i64 @f()
+; CHECK-NEXT:    br label [[BB4]]
+; CHECK:       bb4:
+; CHECK-NEXT:    br i1 [[P]], label [[BB5:%.*]], label [[BB6:%.*]]
+; CHECK:       bb5:
+; CHECK-NEXT:    br i1 true, label [[BB7]], label [[BB7]]
+; CHECK:       bb6:
+; CHECK-NEXT:    br i1 true, label [[BB7]], label [[BB7]]
+; CHECK:       bb7:
+; CHECK-NEXT:    br i1 [[P]], label [[BB2]], label [[BB8:%.*]]
+; CHECK:       bb8:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %bb2
 
@@ -17,7 +41,6 @@ bb3:
   br label %bb4
 
 bb4:
-  ; CHECK-NOT: phi {{.*}} undef
   %foo = phi i64 [ %t4, %bb3 ], [ 0, %bb2 ]
   br i1 %p, label %bb5, label %bb6
 
diff --git a/llvm/test/Transforms/NewGVN/calloc-load-removal.ll b/llvm/test/Transforms/NewGVN/calloc-load-removal.ll
index a8a1e66d97d95..608f739e175fe 100644
--- a/llvm/test/Transforms/NewGVN/calloc-load-removal.ll
+++ b/llvm/test/Transforms/NewGVN/calloc-load-removal.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn < %s | FileCheck %s
 ; Check that loads from calloc are recognized as being zero.
 
@@ -5,14 +6,15 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; Function Attrs: nounwind uwtable
 define i32 @test1() {
+; CHECK-LABEL: define i32 @test1() {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call noalias ptr @calloc(i64 1, i64 4)
+; CHECK-NEXT:    ret i32 0
+;
   %1 = tail call noalias ptr @calloc(i64 1, i64 4)
   ; This load is trivially constant zero
   %2 = load i32, ptr %1, align 4
   ret i32 %2
 
-; CHECK-LABEL: @test1(
-; CHECK-NOT: %2 = load i32, ptr %1, align 4
-; CHECK: ret i32 0
 }
 
 declare noalias ptr @calloc(i64, i64) mustprogress nofree nounwind willreturn allockind("alloc,zeroed") allocsize(0,1) "alloc-family"="malloc"
diff --git a/llvm/test/Transforms/NewGVN/calls-readonly.ll b/llvm/test/Transforms/NewGVN/calls-readonly.ll
index 68d74c1aeda7f..49f5d3aa68540 100644
--- a/llvm/test/Transforms/NewGVN/calls-readonly.ll
+++ b/llvm/test/Transforms/NewGVN/calls-readonly.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 ; Should delete the second call to strlen even though the intervening strchr call exists.
 
@@ -5,6 +6,22 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 target triple = "i386-apple-darwin7"
 
 define ptr @test(ptr %P, ptr %Q, i32 %x, i32 %y) nounwind readonly {
+; CHECK-LABEL: define ptr @test(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @strlen(ptr [[P]])
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB:%.*]], label [[BB1:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[X_ADDR_0:%.*]] = phi i32 [ [[TMP2]], [[BB]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr [[Q]], i32 97)
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X_ADDR_0]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 [[X_ADDR_0]]
+; CHECK-NEXT:    ret ptr [[TMP5]]
+;
 entry:
   %0 = tail call i32 @strlen(ptr %P)              ; <i32> [#uses=2]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
@@ -24,21 +41,6 @@ bb1:                                              ; preds = %bb, %entry
   ret ptr %6
 }
 
-; CHECK: define ptr @test(ptr %P, ptr %Q, i32 %x, i32 %y) #0 {
-; CHECK: entry:
-; CHECK-NEXT:   %0 = tail call i32 @strlen(ptr %P)
-; CHECK-NEXT:   %1 = icmp eq i32 %0, 0
-; CHECK-NEXT:   br i1 %1, label %bb, label %bb1
-; CHECK: bb:
-; CHECK-NEXT:   %2 = sdiv i32 %x, %y
-; CHECK-NEXT:   br label %bb1
-; CHECK: bb1:
-; CHECK-NEXT:   %x_addr.0 = phi i32 [ %2, %bb ], [ %x, %entry ]
-; CHECK-NEXT:   %3 = tail call ptr @strchr(ptr %Q, i32 97)
-; CHECK-NEXT:   %4 = add i32 %x_addr.0, %0
-; CHECK-NEXT:   %5 = getelementptr i8, ptr %3, i32 %x_addr.0
-; CHECK-NEXT:   ret ptr %5
-; CHECK: }
 
 declare i32 @strlen(ptr) nounwind readonly
 
diff --git a/llvm/test/Transforms/NewGVN/completeness.ll b/llvm/test/Transforms/NewGVN/completeness.ll
index d968c785ceff0..4841e2e958b28 100644
--- a/llvm/test/Transforms/NewGVN/completeness.ll
+++ b/llvm/test/Transforms/NewGVN/completeness.ll
@@ -6,9 +6,12 @@ define i32 @test1(i32, ptr) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]]
-; CHECK:         br label [[TMP6:%.*]]
-; CHECK:         br label [[TMP6]]
-; CHECK:         [[PHIOFOPS:%.*]] = phi i32 [ 105, [[TMP5]] ], [ 75, [[TMP4]] ]
+; CHECK:       4:
+; CHECK-NEXT:    br label [[TMP6:%.*]]
+; CHECK:       5:
+; CHECK-NEXT:    br label [[TMP6]]
+; CHECK:       6:
+; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i32 [ 105, [[TMP5]] ], [ 75, [[TMP4]] ]
 ; CHECK-NEXT:    [[DOT0:%.*]] = phi i32 [ 5, [[TMP4]] ], [ 7, [[TMP5]] ]
 ; CHECK-NEXT:    ret i32 [[PHIOFOPS]]
 ;
@@ -31,9 +34,12 @@ define i32 @test1b(i32, ptr) {
 ; CHECK-LABEL: @test1b(
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]]
-; CHECK:         br label [[TMP6:%.*]]
-; CHECK:         br label [[TMP6]]
-; CHECK:         [[PHIOFOPS1:%.*]] = phi i32 [ 105, [[TMP5]] ], [ 75, [[TMP4]] ]
+; CHECK:       4:
+; CHECK-NEXT:    br label [[TMP6:%.*]]
+; CHECK:       5:
+; CHECK-NEXT:    br label [[TMP6]]
+; CHECK:       6:
+; CHECK-NEXT:    [[PHIOFOPS1:%.*]] = phi i32 [ 105, [[TMP5]] ], [ 75, [[TMP4]] ]
 ; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i32 [ 1575, [[TMP5]] ], [ 1125, [[TMP4]] ]
 ; CHECK-NEXT:    [[DOT0:%.*]] = phi i32 [ 5, [[TMP4]] ], [ 7, [[TMP5]] ]
 ; CHECK-NEXT:    ret i32 [[PHIOFOPS]]
@@ -58,9 +64,12 @@ define i32 @test2(i32) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; CHECK:         br label [[TMP5:%.*]]
-; CHECK:         br label [[TMP5]]
-; CHECK:         [[DOT01:%.*]] = phi i32 [ 3, [[TMP3]] ], [ 2, [[TMP4]] ]
+; CHECK:       3:
+; CHECK-NEXT:    br label [[TMP5:%.*]]
+; CHECK:       4:
+; CHECK-NEXT:    br label [[TMP5]]
+; CHECK:       5:
+; CHECK-NEXT:    [[DOT01:%.*]] = phi i32 [ 3, [[TMP3]] ], [ 2, [[TMP4]] ]
 ; CHECK-NEXT:    [[DOT0:%.*]] = phi i32 [ 2, [[TMP3]] ], [ 3, [[TMP4]] ]
 ; CHECK-NEXT:    ret i32 5
 ;
@@ -158,9 +167,12 @@ define i32 @test4(i32, ptr, ptr noalias, ptr noalias) {
 ; CHECK-NEXT:    store i32 7, ptr [[TMP3:%.*]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK:         br label [[TMP8:%.*]]
-; CHECK:         br label [[TMP8]]
-; CHECK:         [[DOT01:%.*]] = phi i32 [ 5, [[TMP6]] ], [ 7, [[TMP7]] ]
+; CHECK:       6:
+; CHECK-NEXT:    br label [[TMP8:%.*]]
+; CHECK:       7:
+; CHECK-NEXT:    br label [[TMP8]]
+; CHECK:       8:
+; CHECK-NEXT:    [[DOT01:%.*]] = phi i32 [ 5, [[TMP6]] ], [ 7, [[TMP7]] ]
 ; CHECK-NEXT:    [[DOT0:%.*]] = phi ptr [ [[TMP2]], [[TMP6]] ], [ [[TMP3]], [[TMP7]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOT0]], align 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[TMP9]], 15
@@ -287,19 +299,19 @@ bb28:                                             ; preds = %bb27, %bb
 define i8 @test6(ptr %addr) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:  entry-block:
-; CHECK-NEXT:    br label %main-loop
+; CHECK-NEXT:    br label [[MAIN_LOOP:%.*]]
 ; CHECK:       main-loop:
-; CHECK-NEXT:    [[PHIOFOPS1:%.*]] = phi i1 [ true, %entry-block ], [ false, [[CORE:%.*]] ]
-; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i1 [ false, %entry-block ], [ true, [[CORE]] ]
-; CHECK-NEXT:    [[PHI:%.*]] = phi i8 [ 0, %entry-block ], [ 1, [[CORE]] ]
-; CHECK-NEXT:    store volatile i8 0, ptr [[ADDR:%.*]]
-; CHECK-NEXT:    br i1 [[PHIOFOPS1]], label %busy-wait-phi-0, label [[EXIT:%.*]]
+; CHECK-NEXT:    [[PHIOFOPS1:%.*]] = phi i1 [ true, [[ENTRY_BLOCK:%.*]] ], [ false, [[CORE:%.*]] ]
+; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i1 [ false, [[ENTRY_BLOCK]] ], [ true, [[CORE]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i8 [ 0, [[ENTRY_BLOCK]] ], [ 1, [[CORE]] ]
+; CHECK-NEXT:    store volatile i8 0, ptr [[ADDR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[PHIOFOPS1]], label [[BUSY_WAIT_PHI_0:%.*]], label [[EXIT:%.*]]
 ; CHECK:       busy-wait-phi-0:
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i8, ptr [[ADDR]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i8, ptr [[ADDR]], align 1
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq i8 [[LOAD]], 0
-; CHECK-NEXT:    br i1 [[ICMP]], label %busy-wait-phi-0, label [[CORE]]
+; CHECK-NEXT:    br i1 [[ICMP]], label [[BUSY_WAIT_PHI_0]], label [[CORE]]
 ; CHECK:       core:
-; CHECK-NEXT:    br i1 [[PHIOFOPS]], label [[TRAP:%.*]], label %main-loop
+; CHECK-NEXT:    br i1 [[PHIOFOPS]], label [[TRAP:%.*]], label [[MAIN_LOOP]]
 ; CHECK:       trap:
 ; CHECK-NEXT:    ret i8 1
 ; CHECK:       exit:
@@ -507,13 +519,13 @@ declare ptr @wombat()
 define void @test12(ptr %p) {
 ; CHECK-LABEL: @test12(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr %p
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP]], 0
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[BB2:%.*]], label [[BB8:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    br i1 true, label [[BB6:%.*]], label [[BB7]]
+; CHECK-NEXT:    br i1 true, label [[BB6:%.*]], label [[BB7:%.*]]
 ; CHECK:       bb6:
 ; CHECK-NEXT:    br label [[BB7]]
 ; CHECK:       bb7:
@@ -551,7 +563,7 @@ define void @test13() {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP:%.*]] = load i8, ptr null
+; CHECK-NEXT:    [[TMP:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i8 [ [[TMP]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3]] ]
@@ -560,7 +572,7 @@ define void @test13() {
 ; CHECK-NEXT:    [[TMP6]] = getelementptr i8, ptr [[TMP4]], i64 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext i8 [[PHIOFOPS]] to i32
 ; CHECK-NEXT:    [[TMP9]] = mul i32 [[TMP5]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10]] = load i8, ptr [[TMP6]]
+; CHECK-NEXT:    [[TMP10]] = load i8, ptr [[TMP6]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[BB12:%.*]], label [[BB3]]
 ; CHECK:       bb12:
diff --git a/llvm/test/Transforms/NewGVN/cond_br.ll b/llvm/test/Transforms/NewGVN/cond_br.ll
index 3dbeb394c7cfa..930e5b30c0888 100644
--- a/llvm/test/Transforms/NewGVN/cond_br.ll
+++ b/llvm/test/Transforms/NewGVN/cond_br.ll
@@ -1,12 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 @y = external global i32
 @z = external global i32
 
 ; Function Attrs: nounwind ssp uwtable
 define void @foo(i32 %x) {
-; CHECK: @foo(i32 %x)
-; CHECK: %.pre = load i32, ptr @y
-; CHECK: call void @bar(i32 %.pre)
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, ptr @y, align 4
+; CHECK-NEXT:    br i1 false, label [[IF_THEN:%.*]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]]
+; CHECK:       entry.if.end_crit_edge:
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    tail call void @bar(i32 [[DOTPRE]])
+; CHECK-NEXT:    ret void
+;
 
   %t = sub i32 %x, %x
   %.pre = load i32, ptr @y, align 4
@@ -28,9 +39,21 @@ if.end:                                           ; preds = %entry.if.end_crit_e
 }
 
 define void @foo2(i32 %x) {
-; CHECK: @foo2(i32 %x)
-; CHECK: %.pre = load i32, ptr @y
-; CHECK: tail call void @bar(i32 %.pre)
+; CHECK-LABEL: define void @foo2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, ptr @y, align 4
+; CHECK-NEXT:    br i1 false, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    store i32 1, ptr @z, align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    tail call void @bar(i32 [[DOTPRE]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %t = sub i32 %x, %x
   %.pre = load i32, ptr @y, align 4
diff --git a/llvm/test/Transforms/NewGVN/condprop.ll b/llvm/test/Transforms/NewGVN/condprop.ll
index e685dfedab109..d97fd380c730b 100644
--- a/llvm/test/Transforms/NewGVN/condprop.ll
+++ b/llvm/test/Transforms/NewGVN/condprop.ll
@@ -134,11 +134,11 @@ define void @test4(i1 %b, i32 %x) {
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]]
 ; CHECK:       sw:
 ; CHECK-NEXT:    switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 0, label [[CASE0:%.*]]
-; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
-; CHECK-NEXT:    i32 2, label [[CASE0]]
-; CHECK-NEXT:    i32 3, label [[CASE3]]
-; CHECK-NEXT:    i32 4, label [[DEFAULT]]
+; CHECK-NEXT:      i32 0, label [[CASE0:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE0]]
+; CHECK-NEXT:      i32 3, label [[CASE3]]
+; CHECK-NEXT:      i32 4, label [[DEFAULT]]
 ; CHECK-NEXT:    ]
 ; CHECK:       default:
 ; CHECK-NEXT:    call void @bar(i32 [[X]])
diff --git a/llvm/test/Transforms/NewGVN/crash-no-aa.ll b/llvm/test/Transforms/NewGVN/crash-no-aa.ll
index 55e2bcb00fb75..30f2e379a3760 100644
--- a/llvm/test/Transforms/NewGVN/crash-no-aa.ll
+++ b/llvm/test/Transforms/NewGVN/crash-no-aa.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -disable-basic-aa -passes=newgvn -S < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/llvm/test/Transforms/NewGVN/crash-usecounts.ll b/llvm/test/Transforms/NewGVN/crash-usecounts.ll
index 5527beabbf342..5cae740fd484c 100644
--- a/llvm/test/Transforms/NewGVN/crash-usecounts.ll
+++ b/llvm/test/Transforms/NewGVN/crash-usecounts.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 define void @test(i1 %arg, i1 %arg1) {
diff --git a/llvm/test/Transforms/NewGVN/crash.ll b/llvm/test/Transforms/NewGVN/crash.ll
index c886bd384eee2..26eaa766a0543 100644
--- a/llvm/test/Transforms/NewGVN/crash.ll
+++ b/llvm/test/Transforms/NewGVN/crash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 ; PR5631
@@ -106,7 +107,7 @@ if.then21.i:
   ret ptr undef
 
 do.body36.i:
-  %ivar38.i = load i64, ptr @g 
+  %ivar38.i = load i64, ptr @g
   %add.ptr39.sum.i = add i64 %ivar38.i, 8
   %tmp40.i = getelementptr inbounds i8, ptr %tmp18.i, i64 %add.ptr39.sum.i
   %tmp41.i = load i64, ptr %tmp40.i
@@ -132,14 +133,14 @@ declare i32 @foo2()
 define i32 @test4() {
 entry:
   ret i32 0
-  
+
 dead:
   %P2 = getelementptr i32, ptr %P2, i32 52
   %Q2 = getelementptr i32, ptr %Q2, i32 52
   store i32 4, ptr %P2
   %A = load i32, ptr %Q2
   br i1 true, label %dead, label %dead2
-  
+
 dead2:
   ret i32 %A
 }
diff --git a/llvm/test/Transforms/NewGVN/cyclic-phi-handling.ll b/llvm/test/Transforms/NewGVN/cyclic-phi-handling.ll
index 4a2f0b972c9fe..dc150799c849c 100644
--- a/llvm/test/Transforms/NewGVN/cyclic-phi-handling.ll
+++ b/llvm/test/Transforms/NewGVN/cyclic-phi-handling.ll
@@ -5,15 +5,15 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @foo(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ %arg1, %bb ], [ [[TMP:%.*]]4, %bb7 ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ %arg, %bb ], [ [[TMP]], %bb7 ]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 %arg2(i32 [[TMP4]], i32 [[TMP]])
+; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ [[ARG1:%.*]], [[BB:%.*]] ], [ [[TMP4:%.*]], [[BB7:%.*]] ]
+; CHECK-NEXT:    [[TMP4]] = phi i32 [ [[ARG:%.*]], [[BB]] ], [ [[TMP]], [[BB7]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 [[ARG2:%.*]](i32 [[TMP4]], i32 [[TMP]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %bb7, label %bb8
+; CHECK-NEXT:    br i1 [[TMP6]], label [[BB7]], label [[BB8:%.*]]
 ; CHECK:       bb7:
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb8:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/NewGVN/dbg-redundant-load.ll b/llvm/test/Transforms/NewGVN/dbg-redundant-load.ll
index 01d95aebdf2d4..cd2eca0de6d14 100644
--- a/llvm/test/Transforms/NewGVN/dbg-redundant-load.ll
+++ b/llvm/test/Transforms/NewGVN/dbg-redundant-load.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 ; Check that the redundant load from %if.then is removed.
@@ -6,15 +7,22 @@
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; CHECK: @test_redundant_load(
-; CHECK-LABEL: entry:
-; CHECK-NEXT: load i32, ptr %Y, align 4, !dbg ![[LOC:[0-9]+]]
-; CHECK-LABEL: if.then:
-; CHECK-NOT: load
-; CHECK-LABEL: if.end:
-; CHECK: ![[LOC]] = !DILocation(line: 3, scope: !{{.*}})
 
 define i32 @test_redundant_load(i32 %X, ptr %Y) !dbg !6 {
+; CHECK-LABEL: define i32 @test_redundant_load(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[Y:%.*]]) !dbg [[DBG6:![0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X]], -1, !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !dbg [[DBG9]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP0]], !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT:    call void @foo(), !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br label [[IF_END]], !dbg [[DBG12:![0-9]+]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[RESULT_0:%.*]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RESULT_0]], !dbg [[DBG13:![0-9]+]]
+;
 entry:
   %0 = load i32, ptr %Y, align 4, !dbg !8
   %cmp = icmp sgt i32 %X, -1, !dbg !9
@@ -50,3 +58,16 @@ declare void @foo()
 !11 = !DILocation(line: 7, scope: !6)
 !12 = !DILocation(line: 8, scope: !6)
 !13 = !DILocation(line: 10, scope: !6)
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: [[META2:![0-9]+]])
+; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: "")
+; CHECK: [[META2]] = !{}
+; CHECK: [[DBG6]] = distinct !DISubprogram(name: "test_redundant_load", scope: [[META1]], file: [[META1]], line: 2, type: [[META7:![0-9]+]], scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
+; CHECK: [[META7]] = !DISubroutineType(types: [[META2]])
+; CHECK: [[DBG8]] = !DILocation(line: 3, scope: [[DBG6]])
+; CHECK: [[DBG9]] = !DILocation(line: 5, scope: [[DBG6]])
+; CHECK: [[DBG10]] = !DILocation(line: 6, scope: [[DBG6]])
+; CHECK: [[DBG11]] = !DILocation(line: 7, scope: [[DBG6]])
+; CHECK: [[DBG12]] = !DILocation(line: 8, scope: [[DBG6]])
+; CHECK: [[DBG13]] = !DILocation(line: 10, scope: [[DBG6]])
+;.
diff --git a/llvm/test/Transforms/NewGVN/edge.ll b/llvm/test/Transforms/NewGVN/edge.ll
index 8699c85c9ed73..143e52cd139c5 100644
--- a/llvm/test/Transforms/NewGVN/edge.ll
+++ b/llvm/test/Transforms/NewGVN/edge.ll
@@ -1,7 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 define i32 @f1(i32 %x) {
-  ; CHECK-LABEL: define i32 @f1(
+; CHECK-LABEL: define i32 @f1(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB2:%.*]], label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    ret i32 [[X]]
+;
 bb0:
   %cmp = icmp eq i32 %x, 0
   br i1 %cmp, label %bb2, label %bb1
@@ -11,12 +21,19 @@ bb2:
   %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ]
   %foo = add i32 %cond, %x
   ret i32 %foo
-  ; CHECK: bb2:
-  ; CHECK: ret i32 %x
 }
 
 define i32 @f2(i32 %x) {
-  ; CHECK-LABEL: define i32 @f2(
+; CHECK-LABEL: define i32 @f2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    ret i32 [[X]]
+;
 bb0:
   %cmp = icmp ne i32 %x, 0
   br i1 %cmp, label %bb1, label %bb2
@@ -26,12 +43,20 @@ bb2:
   %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ]
   %foo = add i32 %cond, %x
   ret i32 %foo
-  ; CHECK: bb2:
-  ; CHECK: ret i32 %x
 }
 
 define i32 @f3(i32 %x) {
-  ; CHECK-LABEL: define i32 @f3(
+; CHECK-LABEL: define i32 @f3(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    switch i32 [[X]], label [[BB1:%.*]] [
+; CHECK-NEXT:      i32 0, label [[BB2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    ret i32 [[X]]
+;
 bb0:
   switch i32 %x, label %bb1 [ i32 0, label %bb2]
 bb1:
@@ -40,13 +65,21 @@ bb2:
   %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ]
   %foo = add i32 %cond, %x
   ret i32 %foo
-  ; CHECK: bb2:
-  ; CHECK: ret i32 %x
 }
 
 declare void @g(i1)
 define void @f4(ptr %x)  {
 ; CHECK-LABEL: define void @f4(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[Y:%.*]] = icmp eq ptr null, [[X]]
+; CHECK-NEXT:    br i1 [[Y]], label [[BB2:%.*]], label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    call void @g(i1 [[Y]])
+; CHECK-NEXT:    ret void
+;
 bb0:
   %y = icmp eq ptr null, %x
   br i1 %y, label %bb2, label %bb1
@@ -55,11 +88,22 @@ bb1:
 bb2:
   %zed = icmp eq ptr null, %x
   call void @g(i1 %zed)
-; CHECK: call void @g(i1 %y)
   ret void
 }
 
 define double @fcmp_oeq_not_zero(double %x, double %y) {
+; CHECK-LABEL: define double @fcmp_oeq_not_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y]], 2.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], 2.000000e+00
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
   %cmp = fcmp oeq double %y, 2.0
   br i1 %cmp, label %if, label %return
@@ -72,11 +116,21 @@ return:
   %retval = phi double [ %div, %if ], [ %x, %entry ]
   ret double %retval
 
-; CHECK-LABEL: define double @fcmp_oeq_not_zero(
-; CHECK: %div = fdiv double %x, 2.0
 }
 
 define double @fcmp_une_not_zero(double %x, double %y) {
+; CHECK-LABEL: define double @fcmp_une_not_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y]], 2.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], 2.000000e+00
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
   %cmp = fcmp une double %y, 2.0
   br i1 %cmp, label %return, label %else
@@ -89,14 +143,24 @@ return:
   %retval = phi double [ %div, %else ], [ %x, %entry ]
   ret double %retval
 
-; CHECK-LABEL: define double @fcmp_une_not_zero(
-; CHECK: %div = fdiv double %x, 2.0
 }
 
-; PR22376 - We can't propagate zero constants because -0.0 
+; PR22376 - We can't propagate zero constants because -0.0
 ; compares equal to 0.0. If %y is -0.0 in this test case,
 ; we would produce the wrong sign on the infinity return value.
 define double @fcmp_oeq_zero(double %x, double %y) {
+; CHECK-LABEL: define double @fcmp_oeq_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y]], 0.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], [[Y]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
   %cmp = fcmp oeq double %y, 0.0
   br i1 %cmp, label %if, label %return
@@ -109,11 +173,21 @@ return:
   %retval = phi double [ %div, %if ], [ %x, %entry ]
   ret double %retval
 
-; CHECK-LABEL: define double @fcmp_oeq_zero(
-; CHECK: %div = fdiv double %x, %y
 }
 
 define double @fcmp_une_zero(double %x, double %y) {
+; CHECK-LABEL: define double @fcmp_une_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y]], -0.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], [[Y]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
   %cmp = fcmp une double %y, -0.0
   br i1 %cmp, label %return, label %else
@@ -126,45 +200,65 @@ return:
   %retval = phi double [ %div, %else ], [ %x, %entry ]
   ret double %retval
 
-; CHECK-LABEL: define double @fcmp_une_zero(
-; CHECK: %div = fdiv double %x, %y
 }
 
 ; We also cannot propagate a value if it's not a constant.
 ; This is because the value could be 0.0 or -0.0.
 
 define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2) {
+; CHECK-LABEL: define double @fcmp_oeq_maybe_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]], double [[Z1:%.*]], double [[Z2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Z:%.*]] = fadd double [[Z1]], [[Z2]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y]], [[Z]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], [[Z]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
- %z = fadd double %z1, %z2
- %cmp = fcmp oeq double %y, %z
- br i1 %cmp, label %if, label %return
+  %z = fadd double %z1, %z2
+  %cmp = fcmp oeq double %y, %z
+  br i1 %cmp, label %if, label %return
 
 if:
- %div = fdiv double %x, %z
- br label %return
+  %div = fdiv double %x, %z
+  br label %return
 
 return:
- %retval = phi double [ %div, %if ], [ %x, %entry ]
- ret double %retval
+  %retval = phi double [ %div, %if ], [ %x, %entry ]
+  ret double %retval
 
-; CHECK-LABEL: define double @fcmp_oeq_maybe_zero(
-; CHECK: %div = fdiv double %x, %z
 }
 
 define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2) {
+; CHECK-LABEL: define double @fcmp_une_maybe_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]], double [[Z1:%.*]], double [[Z2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Z:%.*]] = fadd double [[Z1]], [[Z2]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y]], [[Z]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], [[Z]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
- %z = fadd double %z1, %z2
- %cmp = fcmp une double %y, %z
- br i1 %cmp, label %return, label %else
+  %z = fadd double %z1, %z2
+  %cmp = fcmp une double %y, %z
+  br i1 %cmp, label %return, label %else
 
 else:
- %div = fdiv double %x, %z
- br label %return
+  %div = fdiv double %x, %z
+  br label %return
 
 return:
- %retval = phi double [ %div, %else ], [ %x, %entry ]
- ret double %retval
+  %retval = phi double [ %div, %else ], [ %x, %entry ]
+  ret double %retval
 
-; CHECK-LABEL: define double @fcmp_une_maybe_zero(
-; CHECK: %div = fdiv double %x, %z
 }
diff --git a/llvm/test/Transforms/NewGVN/eliminate-callsite-inline.ll b/llvm/test/Transforms/NewGVN/eliminate-callsite-inline.ll
index 748485c03c18a..6cf543840cfe9 100644
--- a/llvm/test/Transforms/NewGVN/eliminate-callsite-inline.ll
+++ b/llvm/test/Transforms/NewGVN/eliminate-callsite-inline.ll
@@ -1,15 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=inline,newgvn -S < %s | FileCheck %s
 
-; CHECK-LABEL: @f2()
-; CHECK-NEXT:    ret void
 
 define void @f2() {
+; CHECK-LABEL: define void @f2() {
+; CHECK-NEXT:    ret void
+;
   call void @f1()
   call void @f1()
   ret void
 }
 
 define internal void @f1() #1 {
+; CHECK-LABEL: define internal void @f1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ret void
 }
diff --git a/llvm/test/Transforms/NewGVN/equivalent-phi.ll b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
index 925795d49af17..ba4fc14fa2feb 100644
--- a/llvm/test/Transforms/NewGVN/equivalent-phi.ll
+++ b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
@@ -11,22 +11,22 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) #0 {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ %arg, %bb ], [ [[TMP:%.*]]15, %bb17 ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ %arg2, %bb ], [ [[TMP18:%.*]], %bb17 ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi i32 [ 0, %bb ], [ [[TMP14:%.*]], %bb17 ]
+; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ [[ARG:%.*]], [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB17:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[ARG2:%.*]], [[BB]] ], [ [[TMP18:%.*]], [[BB17]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP14:%.*]], [[BB17]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP]] to i64
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @global, i64 0, i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP6]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP14]] = add nsw i32 [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add nsw i32 [[TMP]], %arg1
-; CHECK-NEXT:    br label %bb17
+; CHECK-NEXT:    [[TMP15]] = add nsw i32 [[TMP]], [[ARG1:%.*]]
+; CHECK-NEXT:    br label [[BB17]]
 ; CHECK:       bb17:
 ; CHECK-NEXT:    [[TMP18]] = add i32 [[TMP4]], -1
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP19]], label %bb3, label %bb20
+; CHECK-NEXT:    br i1 [[TMP19]], label [[BB3]], label [[BB20:%.*]]
 ; CHECK:       bb20:
 ; CHECK-NEXT:    ret i32 [[TMP14]]
 ;
diff --git a/llvm/test/Transforms/NewGVN/fold-const-expr.ll b/llvm/test/Transforms/NewGVN/fold-const-expr.ll
index 2821791469494..54020b88d65a8 100644
--- a/llvm/test/Transforms/NewGVN/fold-const-expr.ll
+++ b/llvm/test/Transforms/NewGVN/fold-const-expr.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; GVN failed to do constant expression folding and expanded
 ; them unfolded in many places, producing exponentially large const
 ; expressions. As a result, the compilation never fisished.
@@ -6,6 +7,16 @@
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 %2 = type { i32, i32, i32, i32, i32 }
 define i32 @_Z16vector3util_mainv(i32 %x, i32 %y)  {
+; CHECK-LABEL: define i32 @_Z16vector3util_mainv(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [[TMP0]], ptr [[TMP1]], i64 0, i32 1
+; CHECK-NEXT:    store <4 x i32> <i32 234567891, i32 345678912, i32 456789123, i32 0>, ptr [[TMP114]], align 4
+; CHECK-NEXT:    store i32 310393545, ptr [[TMP114]], align 4
+; CHECK-NEXT:    store i32 -383584258, ptr [[TMP114]], align 4
+; CHECK-NEXT:    store i32 -57163022, ptr [[TMP114]], align 4
+; CHECK-NEXT:    ret i32 0
+;
   %tmp1 = alloca %2, align 4
   %tmp114 = getelementptr inbounds %2, ptr %tmp1, i64 0, i32 1
   store <4 x i32> <i32 234567891, i32 345678912, i32 456789123, i32 0>, ptr %tmp114, align 4
@@ -36,7 +47,6 @@ define i32 @_Z16vector3util_mainv(i32 %x, i32 %y)  {
   %tmp1739 = shl i32 %tmp1738, 22
   %tmp1740 = xor i32 %tmp1739, %tmp1738
   store i32 %tmp1740, ptr %tmp1683, align 4
-; CHECK: store i32 310393545, ptr %tmp114, align 4
   %tmp1756 = getelementptr inbounds %2, ptr %tmp1, i64 0, i32 1
   %tmp1761 = load i32, ptr %tmp1756, align 4
   %tmp1766 = shl i32 %tmp1761, 5
@@ -64,7 +74,6 @@ define i32 @_Z16vector3util_mainv(i32 %x, i32 %y)  {
   %tmp1812 = shl i32 %tmp1811, 22
   %tmp1813 = xor i32 %tmp1812, %tmp1811
   store i32 %tmp1813, ptr %tmp1756, align 4
-; CHECK: store i32 -383584258, ptr %tmp114, align 4
   %tmp2645 = getelementptr inbounds %2, ptr %tmp1, i64 0, i32 1
   %tmp2650 = load i32, ptr %tmp2645, align 4
   %tmp2655 = shl i32 %tmp2650, 5
@@ -92,6 +101,5 @@ define i32 @_Z16vector3util_mainv(i32 %x, i32 %y)  {
   %tmp2701 = shl i32 %tmp2700, 22
   %tmp2702 = xor i32 %tmp2701, %tmp2700
   store i32 %tmp2702, ptr %tmp2645, align 4
-; CHECK: store i32 -57163022, ptr %tmp114, align 4
   ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/fpmath.ll b/llvm/test/Transforms/NewGVN/fpmath.ll
index e8cec8af3e021..d936c01bbe788 100644
--- a/llvm/test/Transforms/NewGVN/fpmath.ll
+++ b/llvm/test/Transforms/NewGVN/fpmath.ll
@@ -1,10 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 define double @test1(double %x, double %y) {
-; CHECK: @test1(double %x, double %y)
-; CHECK: %add1 = fadd double %x, %y
-; CHECK-NOT: fpmath
-; CHECK: %foo = fadd double %add1, %add1
+; CHECK-LABEL: define double @test1(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[X]], [[Y]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[ADD1]], [[ADD1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %add1 = fadd double %x, %y, !fpmath !0
   %add2 = fadd double %x, %y
   %foo = fadd double %add1, %add2
@@ -12,9 +15,12 @@ define double @test1(double %x, double %y) {
 }
 
 define double @test2(double %x, double %y) {
-; CHECK: @test2(double %x, double %y)
-; CHECK: %add1 = fadd double %x, %y, !fpmath ![[MD0:[0-9]+]]
-; CHECK: %foo = fadd double %add1, %add1
+; CHECK-LABEL: define double @test2(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[X]], [[Y]], !fpmath [[META0:![0-9]+]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[ADD1]], [[ADD1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %add1 = fadd double %x, %y, !fpmath !0
   %add2 = fadd double %x, %y, !fpmath !0
   %foo = fadd double %add1, %add2
@@ -22,9 +28,12 @@ define double @test2(double %x, double %y) {
 }
 
 define double @test3(double %x, double %y) {
-; CHECK: @test3(double %x, double %y)
-; CHECK: %add1 = fadd double %x, %y, !fpmath ![[MD1:[0-9]+]]
-; CHECK: %foo = fadd double %add1, %add1
+; CHECK-LABEL: define double @test3(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[X]], [[Y]], !fpmath [[META1:![0-9]+]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[ADD1]], [[ADD1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %add1 = fadd double %x, %y, !fpmath !1
   %add2 = fadd double %x, %y, !fpmath !0
   %foo = fadd double %add1, %add2
@@ -32,9 +41,12 @@ define double @test3(double %x, double %y) {
 }
 
 define double @test4(double %x, double %y) {
-; CHECK: @test4(double %x, double %y)
-; CHECK: %add1 = fadd double %x, %y, !fpmath ![[MD1]]
-; CHECK: %foo = fadd double %add1, %add1
+; CHECK-LABEL: define double @test4(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[X]], [[Y]], !fpmath [[META1]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[ADD1]], [[ADD1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %add1 = fadd double %x, %y, !fpmath !0
   %add2 = fadd double %x, %y, !fpmath !1
   %foo = fadd double %add1, %add2
@@ -42,17 +54,22 @@ define double @test4(double %x, double %y) {
 }
 
 define double @test5(double %x) {
-; CHECK: @test5(double %x)
-; CHECK: %neg1 = fneg double %x, !fpmath ![[MD1]]
-; CHECK: %foo = fadd double %neg1, %neg1
+; CHECK-LABEL: define double @test5(
+; CHECK-SAME: double [[X:%.*]]) {
+; CHECK-NEXT:    [[NEG1:%.*]] = fneg double [[X]], !fpmath [[META1]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[NEG1]], [[NEG1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %neg1 = fneg double %x, !fpmath !0
   %neg2 = fneg double %x, !fpmath !1
   %foo = fadd double %neg1, %neg2
   ret double %foo
 }
 
-; CHECK: ![[MD0]] = !{float 5.000000e+00}
-; CHECK: ![[MD1]] = !{float 2.500000e+00}
 
 !0 = !{ float 5.0 }
 !1 = !{ float 2.5 }
+;.
+; CHECK: [[META0]] = !{float 5.000000e+00}
+; CHECK: [[META1]] = !{float 2.500000e+00}
+;.
diff --git a/llvm/test/Transforms/NewGVN/funclet.ll b/llvm/test/Transforms/NewGVN/funclet.ll
index 3df3f940ec2d7..8c1cbd6b08fd7 100644
--- a/llvm/test/Transforms/NewGVN/funclet.ll
+++ b/llvm/test/Transforms/NewGVN/funclet.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
 target triple = "i686-pc-windows-msvc"
@@ -8,13 +9,35 @@ target triple = "i686-pc-windows-msvc"
 @"_TI1?AUA@@" = external constant %eh.ThrowInfo
 
 define i8 @f() personality ptr @__CxxFrameHandler3 {
+; CHECK-LABEL: define i8 @f() personality ptr @__CxxFrameHandler3 {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[C:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[B]], align 1
+; CHECK-NEXT:    store i8 13, ptr [[C]], align 1
+; CHECK-NEXT:    invoke void @_CxxThrowException(ptr [[B]], ptr nonnull @"_TI1?AUA@@")
+; CHECK-NEXT:            to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK:       catch.dispatch:
+; CHECK-NEXT:    [[CS1:%.*]] = catchswitch within none [label %catch] unwind to caller
+; CHECK:       catch:
+; CHECK-NEXT:    [[CATCHPAD:%.*]] = catchpad within [[CS1]] [ptr null, i32 64, ptr null]
+; CHECK-NEXT:    store i8 5, ptr [[B]], align 1
+; CHECK-NEXT:    catchret from [[CATCHPAD]] to label [[TRY_CONT:%.*]]
+; CHECK:       try.cont:
+; CHECK-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[B]], align 1
+; CHECK-NEXT:    [[LOAD_C:%.*]] = load i8, ptr [[C]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD_B]], [[LOAD_C]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+; CHECK:       unreachable:
+; CHECK-NEXT:    unreachable
+;
 entry:
   %b = alloca i8
   %c = alloca i8
   store i8 42, ptr %b
   store i8 13, ptr %c
   invoke void @_CxxThrowException(ptr %b, ptr nonnull @"_TI1?AUA@@")
-          to label %unreachable unwind label %catch.dispatch
+  to label %unreachable unwind label %catch.dispatch
 
 catch.dispatch:                                   ; preds = %entry
   %cs1 = catchswitch within none [label %catch] unwind to caller
@@ -33,11 +56,6 @@ try.cont:                                         ; preds = %catch
 unreachable:                                      ; preds = %entry
   unreachable
 }
-; CHECK-LABEL: define i8 @f(
-; CHECK:       %[[load_b:.*]] = load i8, ptr %b
-; CHECK-NEXT:  %[[load_c:.*]] = load i8, ptr %c
-; CHECK-NEXT:  %[[add:.*]] = add i8 %[[load_b]], %[[load_c]]
-; CHECK-NEXT:  ret i8 %[[add]]
 
 declare i32 @__CxxFrameHandler3(...)
 
diff --git a/llvm/test/Transforms/NewGVN/int_sideeffect.ll b/llvm/test/Transforms/NewGVN/int_sideeffect.ll
index f715d022ba010..a2c54bd38e93c 100644
--- a/llvm/test/Transforms/NewGVN/int_sideeffect.ll
+++ b/llvm/test/Transforms/NewGVN/int_sideeffect.ll
@@ -1,27 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S < %s -passes=newgvn | FileCheck %s
 
 declare void @llvm.sideeffect()
 
 ; Store-to-load forwarding across a @llvm.sideeffect.
 
-; CHECK-LABEL: s2l
-; CHECK-NOT: load
 define float @s2l(ptr %p) {
-    store float 0.0, ptr %p
-    call void @llvm.sideeffect()
-    %t = load float, ptr %p
-    ret float %t
+; CHECK-LABEL: define float @s2l(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[P]], align 4
+; CHECK-NEXT:    call void @llvm.sideeffect()
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  store float 0.0, ptr %p
+  call void @llvm.sideeffect()
+  %t = load float, ptr %p
+  ret float %t
 }
 
 ; Redundant load elimination across a @llvm.sideeffect.
 
-; CHECK-LABEL: rle
-; CHECK: load
-; CHECK-NOT: load
 define float @rle(ptr %p) {
-    %r = load float, ptr %p
-    call void @llvm.sideeffect()
-    %s = load float, ptr %p
-    %t = fadd float %r, %s
-    ret float %t
+; CHECK-LABEL: define float @rle(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P]], align 4
+; CHECK-NEXT:    call void @llvm.sideeffect()
+; CHECK-NEXT:    [[T:%.*]] = fadd float [[R]], [[R]]
+; CHECK-NEXT:    ret float [[T]]
+;
+  %r = load float, ptr %p
+  call void @llvm.sideeffect()
+  %s = load float, ptr %p
+  %t = fadd float %r, %s
+  ret float %t
 }
diff --git a/llvm/test/Transforms/NewGVN/invariant.group.ll b/llvm/test/Transforms/NewGVN/invariant.group.ll
index 81e733f84ddb1..7c14059c88c67 100644
--- a/llvm/test/Transforms/NewGVN/invariant.group.ll
+++ b/llvm/test/Transforms/NewGVN/invariant.group.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 %struct.A = type { ptr }
@@ -6,86 +7,131 @@
 
 @unknownPtr = external global i8
 
-; CHECK-LABEL: define i8 @simple() {
 define i8 @simple() {
+; CHECK-LABEL: define i8 @simple() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0:![0-9]+]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
-
-    %a = load i8, ptr %ptr, !invariant.group !0
-    %b = load i8, ptr %ptr, !invariant.group !0
-    %c = load i8, ptr %ptr, !invariant.group !0
-; CHECK: ret i8 42
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
+
+  %a = load i8, ptr %ptr, !invariant.group !0
+  %b = load i8, ptr %ptr, !invariant.group !0
+  %c = load i8, ptr %ptr, !invariant.group !0
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @optimizable1() {
 define i8 @optimizable1() {
+; CHECK-LABEL: define i8 @optimizable1() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    [[PTR2:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[PTR]])
+; CHECK-NEXT:    call void @foo(ptr [[PTR2]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    %ptr2 = call ptr @llvm.launder.invariant.group.p0(ptr %ptr)
-    %a = load i8, ptr %ptr, !invariant.group !0
-    
-    call void @foo(ptr %ptr2); call to use %ptr2
-; CHECK: ret i8 42
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  %ptr2 = call ptr @llvm.launder.invariant.group.p0(ptr %ptr)
+  %a = load i8, ptr %ptr, !invariant.group !0
+
+  call void @foo(ptr %ptr2); call to use %ptr2
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @optimizable2() {
 define i8 @optimizable2() {
+; CHECK-LABEL: define i8 @optimizable2() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    store i8 13, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @bar(i8 13)
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
-    
-    store i8 13, ptr %ptr ; can't use this store with invariant.group
-    %a = load i8, ptr %ptr 
-    call void @bar(i8 %a) ; call to use %a
-    
-    call void @foo(ptr %ptr)
-    %b = load i8, ptr %ptr, !invariant.group !0
-    
-; CHECK: ret i8 42
-    ret i8 %b
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
+
+  store i8 13, ptr %ptr ; can't use this store with invariant.group
+  %a = load i8, ptr %ptr
+  call void @bar(i8 %a) ; call to use %a
+
+  call void @foo(ptr %ptr)
+  %b = load i8, ptr %ptr, !invariant.group !0
+
+  ret i8 %b
 }
 
-; CHECK-LABEL: define i1 @proveEqualityForStrip(
 define i1 @proveEqualityForStrip(ptr %a) {
+; CHECK-LABEL: define i1 @proveEqualityForStrip(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:    ret i1 true
+;
   %b1 = call ptr @llvm.strip.invariant.group.p0(ptr %a)
-; CHECK-NOT: llvm.strip.invariant.group
   %b2 = call ptr @llvm.strip.invariant.group.p0(ptr %a)
   %r = icmp eq ptr %b1, %b2
-; CHECK: ret i1 true
   ret i1 %r
 }
 
-; CHECK-LABEL: define i8 @unoptimizable1() {
 define i8 @unoptimizable1() {
+; CHECK-LABEL: define i8 @unoptimizable1() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    ret i8 [[A]]
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr
-    call void @foo(ptr %ptr)
-    %a = load i8, ptr %ptr, !invariant.group !0
-; CHECK: ret i8 %a
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr
+  call void @foo(ptr %ptr)
+  %a = load i8, ptr %ptr, !invariant.group !0
+  ret i8 %a
 }
 
 ; NewGVN doesn't support assumes.
-; CHECK-LABEL: define void @indirectLoads() {
 define void @indirectLoads() {
+; CHECK-LABEL: define void @indirectLoads() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr @getPointer(ptr null)
+; CHECK-NEXT:    call void @_ZN1AC1Ev(ptr [[CALL]])
+; CHECK-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[CALL]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    [[CMP_VTABLES:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2)
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_VTABLES]])
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; CHECK-NEXT:    call void [[TMP0]](ptr [[CALL]])
+; CHECK-NEXT:    [[VTABLE2:%.*]] = load ptr, ptr [[CALL]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE2]], align 8
+; CHECK-NEXT:    call void [[TMP1]](ptr [[CALL]])
+; CHECK-NEXT:    [[VTABLE4:%.*]] = load ptr, ptr [[CALL]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[VTABLE4]], align 8
+; CHECK-NEXT:    call void [[TMP2]](ptr [[CALL]])
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; CHECK-NEXT:    call void [[TMP3]](ptr [[CALL]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %a = alloca ptr, align 8
-  
-  %call = call ptr @getPointer(ptr null) 
+
+  %call = call ptr @getPointer(ptr null)
   call void @_ZN1AC1Ev(ptr %call)
-  
-; CHECK: %vtable = load {{.*}} !invariant.group
+
   %vtable = load ptr, ptr %call, align 8, !invariant.group !0
   %cmp.vtables = icmp eq ptr %vtable, getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2)
   call void @llvm.assume(i1 %cmp.vtables)
-  
+
   store ptr %call, ptr %a, align 8
   %0 = load ptr, ptr %a, align 8
 
@@ -98,36 +144,45 @@ entry:
 ; FIXME: call void @_ZN1A3fooEv(
   %vtable2 = load ptr, ptr %2, align 8, !invariant.group !0
   %3 = load ptr, ptr %vtable2, align 8
-  
+
   call void %3(ptr %2)
   %4 = load ptr, ptr %a, align 8
-  
+
   %vtable4 = load ptr, ptr %4, align 8, !invariant.group !0
   %5 = load ptr, ptr %vtable4, align 8
 ; FIXME: call void @_ZN1A3fooEv(
   call void %5(ptr %4)
- 
+
   %vtable5 = load ptr, ptr %call, align 8, !invariant.group !0
   %6 = load ptr, ptr %vtable5, align 8
 ; FIXME: call void @_ZN1A3fooEv(
   call void %6(ptr %4)
-  
+
   ret void
 }
 
 ; NewGVN won't CSE loads with different pointee types.
-; CHECK-LABEL: define void @combiningBitCastWithLoad() {
 define void @combiningBitCastWithLoad() {
+; CHECK-LABEL: define void @combiningBitCastWithLoad() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr @getPointer(ptr null)
+; CHECK-NEXT:    call void @_ZN1AC1Ev(ptr [[CALL]])
+; CHECK-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[CALL]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; CHECK-NEXT:    call void [[TMP0]](ptr [[CALL]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %a = alloca ptr, align 8
-  
-  %call = call ptr @getPointer(ptr null) 
+
+  %call = call ptr @getPointer(ptr null)
   call void @_ZN1AC1Ev(ptr %call)
-  
-; CHECK: %vtable = load {{.*}} !invariant.group
+
   %vtable = load ptr, ptr %call, align 8, !invariant.group !0
   %cmp.vtables = icmp eq ptr %vtable, getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2)
-  
+
   store ptr %call, ptr %a, align 8
 ; FIXME-NOT: !invariant.group
   %0 = load ptr, ptr %a, align 8
@@ -139,173 +194,245 @@ entry:
   ret void
 }
 
-; CHECK-LABEL:define void @loadCombine() {
 define void @loadCombine() {
+; CHECK-LABEL: define void @loadCombine() {
+; CHECK-NEXT:  enter:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[A]])
+; CHECK-NEXT:    call void @bar(i8 [[A]])
+; CHECK-NEXT:    ret void
+;
 enter:
   %ptr = alloca i8
   store i8 42, ptr %ptr
   call void @foo(ptr %ptr)
-; CHECK: %[[A:.*]] = load i8, ptr %ptr, align 1, !invariant.group
   %a = load i8, ptr %ptr, !invariant.group !0
-; CHECK-NOT: load
   %b = load i8, ptr %ptr, !invariant.group !0
-; CHECK: call void @bar(i8 %[[A]])
   call void @bar(i8 %a)
-; CHECK: call void @bar(i8 %[[A]])
   call void @bar(i8 %b)
   ret void
 }
 
-; CHECK-LABEL: define void @loadCombine1() {
 define void @loadCombine1() {
+; CHECK-LABEL: define void @loadCombine1() {
+; CHECK-NEXT:  enter:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[C:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[C]])
+; CHECK-NEXT:    call void @bar(i8 [[C]])
+; CHECK-NEXT:    ret void
+;
 enter:
   %ptr = alloca i8
   store i8 42, ptr %ptr
   call void @foo(ptr %ptr)
-; CHECK: %[[D:.*]] = load i8, ptr %ptr, align 1, !invariant.group
   %c = load i8, ptr %ptr
-; CHECK-NOT: load
   %d = load i8, ptr %ptr, !invariant.group !0
-; CHECK: call void @bar(i8 %[[D]])
   call void @bar(i8 %c)
-; CHECK: call void @bar(i8 %[[D]])
   call void @bar(i8 %d)
   ret void
 }
 
-; CHECK-LABEL: define void @loadCombine2() {    
 define void @loadCombine2() {
+; CHECK-LABEL: define void @loadCombine2() {
+; CHECK-NEXT:  enter:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[E]])
+; CHECK-NEXT:    call void @bar(i8 [[E]])
+; CHECK-NEXT:    ret void
+;
 enter:
   %ptr = alloca i8
   store i8 42, ptr %ptr
   call void @foo(ptr %ptr)
-; CHECK: %[[E:.*]] = load i8, ptr %ptr, align 1, !invariant.group
   %e = load i8, ptr %ptr, !invariant.group !0
-; CHECK-NOT: load
   %f = load i8, ptr %ptr
-; CHECK: call void @bar(i8 %[[E]])
   call void @bar(i8 %e)
-; CHECK: call void @bar(i8 %[[E]])
   call void @bar(i8 %f)
   ret void
 }
 
-; CHECK-LABEL: define void @loadCombine3() {
 define void @loadCombine3() {
+; CHECK-LABEL: define void @loadCombine3() {
+; CHECK-NEXT:  enter:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[E]])
+; CHECK-NEXT:    call void @bar(i8 [[E]])
+; CHECK-NEXT:    ret void
+;
 enter:
   %ptr = alloca i8
   store i8 42, ptr %ptr
   call void @foo(ptr %ptr)
-; CHECK: %[[E:.*]] = load i8, ptr %ptr, align 1, !invariant.group
   %e = load i8, ptr %ptr, !invariant.group !0
-; CHECK-NOT: load
   %f = load i8, ptr %ptr, !invariant.group !0
-; CHECK: call void @bar(i8 %[[E]])
   call void @bar(i8 %e)
-; CHECK: call void @bar(i8 %[[E]])
   call void @bar(i8 %f)
   ret void
 }
 
-; CHECK-LABEL: define i8 @unoptimizable2() {
 define i8 @unoptimizable2() {
+; CHECK-LABEL: define i8 @unoptimizable2() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    ret i8 [[A]]
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr
-    call void @foo(ptr %ptr)
-    %a = load i8, ptr %ptr
-    call void @foo(ptr %ptr)
-    %b = load i8, ptr %ptr, !invariant.group !0
-    
-; CHECK: ret i8 %a
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr
+  call void @foo(ptr %ptr)
+  %a = load i8, ptr %ptr
+  call void @foo(ptr %ptr)
+  %b = load i8, ptr %ptr, !invariant.group !0
+
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @unoptimizable3() {
 define i8 @unoptimizable3() {
+; CHECK-LABEL: define i8 @unoptimizable3() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    [[PTR2:%.*]] = call ptr @getPointer(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR2]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    ret i8 [[A]]
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    %ptr2 = call ptr @getPointer(ptr %ptr)
-    %a = load i8, ptr %ptr2, !invariant.group !0
-    
-; CHECK: ret i8 %a
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  %ptr2 = call ptr @getPointer(ptr %ptr)
+  %a = load i8, ptr %ptr2, !invariant.group !0
+
+  ret i8 %a
 }
 
 ; NewGVN cares about the launder for some reason.
-; CHECK-LABEL: define i8 @optimizable4() {
 define i8 @optimizable4() {
+; CHECK-LABEL: define i8 @optimizable4() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    [[PTR2:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR2]], align 1
+; CHECK-NEXT:    ret i8 [[A]]
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr
-    %ptr2 = call ptr @llvm.launder.invariant.group.p0(ptr %ptr)
+  %ptr = alloca i8
+  store i8 42, ptr %ptr
+  %ptr2 = call ptr @llvm.launder.invariant.group.p0(ptr %ptr)
 ; FIXME-NOT: load
-    %a = load i8, ptr %ptr2
-    
+  %a = load i8, ptr %ptr2
+
 ; FIXME: ret i8 42
-    ret i8 %a
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @volatile1() {
 define i8 @volatile1() {
+; CHECK-LABEL: define i8 @volatile1() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[B:%.*]] = load volatile i8, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @bar(i8 [[B]])
+; CHECK-NEXT:    [[C:%.*]] = load volatile i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[C]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
-    %a = load i8, ptr %ptr, !invariant.group !0
-    %b = load volatile i8, ptr %ptr
-; CHECK: call void @bar(i8 %b)
-    call void @bar(i8 %b)
-
-    %c = load volatile i8, ptr %ptr, !invariant.group !0
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
+  %a = load i8, ptr %ptr, !invariant.group !0
+  %b = load volatile i8, ptr %ptr
+  call void @bar(i8 %b)
+
+  %c = load volatile i8, ptr %ptr, !invariant.group !0
 ; We might be able to optimize this, but nobody cares
-; CHECK: call void @bar(i8 %c)
-    call void @bar(i8 %c)
-; CHECK: ret i8 42
-    ret i8 %a
+  call void @bar(i8 %c)
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @volatile2() {
 define i8 @volatile2() {
+; CHECK-LABEL: define i8 @volatile2() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[B:%.*]] = load volatile i8, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @bar(i8 [[B]])
+; CHECK-NEXT:    [[C:%.*]] = load volatile i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[C]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
-    %a = load i8, ptr %ptr, !invariant.group !0
-    %b = load volatile i8, ptr %ptr
-; CHECK: call void @bar(i8 %b)
-    call void @bar(i8 %b)
-
-    %c = load volatile i8, ptr %ptr, !invariant.group !0
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
+  %a = load i8, ptr %ptr, !invariant.group !0
+  %b = load volatile i8, ptr %ptr
+  call void @bar(i8 %b)
+
+  %c = load volatile i8, ptr %ptr, !invariant.group !0
 ; We might be able to optimize this, but nobody cares
-; CHECK: call void @bar(i8 %c)
-    call void @bar(i8 %c)
-; CHECK: ret i8 42
-    ret i8 %a
+  call void @bar(i8 %c)
+  ret i8 %a
 }
 
-; CHECK-LABEL: define void @fun() {
 define void @fun() {
+; CHECK-LABEL: define void @fun() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    call void @bar(i8 42)
+; CHECK-NEXT:    ret void
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
 
-    %a = load i8, ptr %ptr, !invariant.group !0 ; Can assume that value under %ptr didn't change
-; CHECK: call void @bar(i8 42)
-    call void @bar(i8 %a)
+  %a = load i8, ptr %ptr, !invariant.group !0 ; Can assume that value under %ptr didn't change
+  call void @bar(i8 %a)
 
-    ret void
+  ret void
 }
 
 ; FIXME: NewGVN doesn't run instsimplify on a load from a vtable definition?
 ; This test checks if invariant.group understands gep with zeros
-; CHECK-LABEL: define void @testGEP0() {
 define void @testGEP0() {
+; CHECK-LABEL: define void @testGEP0() {
+; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), ptr [[A]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @_ZN1A3fooEv(ptr nonnull dereferenceable(8) [[A]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr @unknownPtr, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[_Z1GR1A_EXIT:%.*]], label [[TMP3:%.*]]
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), align 8
+; CHECK-NEXT:    call void [[TMP4]](ptr nonnull [[A]])
+; CHECK-NEXT:    br label [[_Z1GR1A_EXIT]]
+; CHECK:       _Z1gR1A.exit:
+; CHECK-NEXT:    ret void
+;
   %a = alloca %struct.A, align 8
   store ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), ptr %a, align 8, !invariant.group !0
-; CHECK: call void @_ZN1A3fooEv(ptr nonnull dereferenceable(8) %a)
   call void @_ZN1A3fooEv(ptr nonnull dereferenceable(8) %a) ; This call may change vptr
   %1 = load i8, ptr @unknownPtr, align 4
   %2 = icmp eq i8 %1, 0
@@ -325,54 +452,93 @@ _Z1gR1A.exit:                                     ; preds = %0, %3
 ; Check if no optimizations are performed with global pointers.
 ; FIXME: we could do the optimizations if we would check if dependency comes
 ; from the same function.
-; CHECK-LABEL: define void @testGlobal() {
 define void @testGlobal() {
-; CHECK:  %a = load i8, ptr @unknownPtr, align 1, !invariant.group !0
-   %a = load i8, ptr @unknownPtr, !invariant.group !0
-   call void @foo2(ptr @unknownPtr, i8 %a)
-; CHECK:  %1 = load i8, ptr @unknownPtr, align 1, !invariant.group !0
-   %1 = load i8, ptr @unknownPtr, !invariant.group !0
-   call void @bar(i8 %1)
-
-   call void @fooBit(ptr @unknownPtr, i1 1)
+; CHECK-LABEL: define void @testGlobal() {
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr @unknownPtr, align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo2(ptr @unknownPtr, i8 [[A]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr @unknownPtr, align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[TMP1]])
+; CHECK-NEXT:    call void @fooBit(ptr @unknownPtr, i1 true)
+; CHECK-NEXT:    [[TMP2:%.*]] = load i1, ptr @unknownPtr, align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @fooBit(ptr @unknownPtr, i1 [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = load i1, ptr @unknownPtr, align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @fooBit(ptr @unknownPtr, i1 [[TMP3]])
+; CHECK-NEXT:    ret void
+;
+  %a = load i8, ptr @unknownPtr, !invariant.group !0
+  call void @foo2(ptr @unknownPtr, i8 %a)
+  %1 = load i8, ptr @unknownPtr, !invariant.group !0
+  call void @bar(i8 %1)
+
+  call void @fooBit(ptr @unknownPtr, i1 1)
 ; Adding regex because of canonicalization of bitcasts
-; CHECK: %2 = load i1, ptr {{.*}}, !invariant.group !0
-   %2 = load i1, ptr @unknownPtr, !invariant.group !0
-   call void @fooBit(ptr @unknownPtr, i1 %2)
-; CHECK:  %3 = load i1, ptr {{.*}}, !invariant.group !0
-   %3 = load i1, ptr @unknownPtr, !invariant.group !0
-   call void @fooBit(ptr @unknownPtr, i1 %3)
-   ret void
+  %2 = load i1, ptr @unknownPtr, !invariant.group !0
+  call void @fooBit(ptr @unknownPtr, i1 %2)
+  %3 = load i1, ptr @unknownPtr, !invariant.group !0
+  call void @fooBit(ptr @unknownPtr, i1 %3)
+  ret void
 }
 
 ; Might be similar to above where NewGVN doesn't handle loads of different types from the same location.
 ; Not super important anyway.
-; CHECK-LABEL: define void @testTrunc() {
 define void @testTrunc() {
-   %a = alloca i8
-   call void @foo(ptr %a)
-; CHECK:  %b = load i8, ptr %a, align 1, !invariant.group !0
-   %b = load i8, ptr %a, !invariant.group !0
-   call void @foo2(ptr %a, i8 %b)
-
-   %1 = load i8, ptr %a, !invariant.group !0
-; CHECK: call void @bar(i8 %b)
-   call void @bar(i8 %1)
-
-   call void @fooBit(ptr %a, i1 1)
+; CHECK-LABEL: define void @testTrunc() {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @foo(ptr [[A]])
+; CHECK-NEXT:    [[B:%.*]] = load i8, ptr [[A]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo2(ptr [[A]], i8 [[B]])
+; CHECK-NEXT:    call void @bar(i8 [[B]])
+; CHECK-NEXT:    call void @fooBit(ptr [[A]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = load i1, ptr [[A]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @fooBit(ptr [[A]], i1 [[TMP1]])
+; CHECK-NEXT:    call void @fooBit(ptr [[A]], i1 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i8
+  call void @foo(ptr %a)
+  %b = load i8, ptr %a, !invariant.group !0
+  call void @foo2(ptr %a, i8 %b)
+
+  %1 = load i8, ptr %a, !invariant.group !0
+  call void @bar(i8 %1)
+
+  call void @fooBit(ptr %a, i1 1)
 ; FIXME: %1 = trunc i8 %b to i1
-   %2 = load i1, ptr %a, !invariant.group !0
+  %2 = load i1, ptr %a, !invariant.group !0
 ; FIXME-NEXT: call void @fooBit(ptr %a, i1 %1)
-   call void @fooBit(ptr %a, i1 %2)
-   %3 = load i1, ptr %a, !invariant.group !0
+  call void @fooBit(ptr %a, i1 %2)
+  %3 = load i1, ptr %a, !invariant.group !0
 ; FIXME-NEXT: call void @fooBit(ptr %a, i1 %1)
-   call void @fooBit(ptr %a, i1 %3)
-   ret void
+  call void @fooBit(ptr %a, i1 %3)
+  ret void
 }
 
 ; See comment in @testGEP0 on what NewGVN is lacking.
-; CHECK-LABEL: define void @handling_loops()
 define void @handling_loops() {
+; CHECK-LABEL: define void @handling_loops() {
+; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), ptr [[A]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr @unknownPtr, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[DOTLR_PH_I:%.*]], label [[_Z2G2R1A_EXIT:%.*]]
+; CHECK:       .lr.ph.i:
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i8 [[TMP1]], 1
+; CHECK-NEXT:    br i1 [[TMP3]], label [[DOT_CRIT_EDGE_PREHEADER:%.*]], label [[_Z2G2R1A_EXIT]]
+; CHECK:       ._crit_edge.preheader:
+; CHECK-NEXT:    br label [[DOT_CRIT_EDGE:%.*]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i8 [ [[TMP6:%.*]], [[DOT_CRIT_EDGE]] ], [ 1, [[DOT_CRIT_EDGE_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), align 8
+; CHECK-NEXT:    call void [[TMP5]](ptr nonnull [[A]])
+; CHECK-NEXT:    [[TMP6]] = add nuw nsw i8 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr @unknownPtr, align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[DOT_CRIT_EDGE]], label [[_Z2G2R1A_EXIT_LOOPEXIT:%.*]]
+; CHECK:       _Z2g2R1A.exit.loopexit:
+; CHECK-NEXT:    br label [[_Z2G2R1A_EXIT]]
+; CHECK:       _Z2g2R1A.exit:
+; CHECK-NEXT:    ret void
+;
   %a = alloca %struct.A, align 8
   store ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), ptr %a, align 8, !invariant.group !0
   %1 = load i8, ptr @unknownPtr, align 4
@@ -424,3 +590,6 @@ declare void @llvm.assume(i1 %cmp.vtables) #0
 
 attributes #0 = { nounwind }
 !0 = !{}
+;.
+; CHECK: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/NewGVN/invariant.start.ll b/llvm/test/Transforms/NewGVN/invariant.start.ll
index 100b79fd3bff2..9bf1c5530006b 100644
--- a/llvm/test/Transforms/NewGVN/invariant.start.ll
+++ b/llvm/test/Transforms/NewGVN/invariant.start.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; Test to make sure llvm.invariant.start calls are not treated as clobbers.
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
@@ -7,10 +8,12 @@ declare void @llvm.invariant.end.p0(ptr, i64, ptr nocapture) nounwind
 
 ; We forward store to the load across the invariant.start intrinsic
 define i8 @forward_store() {
-; CHECK-LABEL: @forward_store
-; CHECK: call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
-; CHECK-NOT: load
-; CHECK: ret i8 0
+; CHECK-LABEL: define i8 @forward_store() {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 0, ptr [[A]], align 1
+; CHECK-NEXT:    [[I:%.*]] = call ptr @llvm.invariant.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT:    ret i8 0
+;
   %a = alloca i8
   store i8 0, ptr %a
   %i = call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
@@ -23,10 +26,18 @@ declare i8 @dummy(ptr nocapture) nounwind readonly
 ; We forward store to the load in the non-local analysis case,
 ; i.e. invariant.start is in another basic block.
 define i8 @forward_store_nonlocal(i1 %cond) {
-; CHECK-LABEL: forward_store_nonlocal
-; CHECK: call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
-; CHECK: ret i8 0
-; CHECK: ret i8 %val
+; CHECK-LABEL: define i8 @forward_store_nonlocal(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 0, ptr [[A]], align 1
+; CHECK-NEXT:    [[I:%.*]] = call ptr @llvm.invariant.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT:    br i1 [[COND]], label [[LOADBLOCK:%.*]], label [[EXIT:%.*]]
+; CHECK:       loadblock:
+; CHECK-NEXT:    ret i8 0
+; CHECK:       exit:
+; CHECK-NEXT:    [[VAL:%.*]] = call i8 @dummy(ptr [[A]])
+; CHECK-NEXT:    ret i8 [[VAL]]
+;
   %a = alloca i8
   store i8 0, ptr %a
   %i = call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
@@ -43,12 +54,14 @@ exit:
 
 ; We should not value forward %foo to the invariant.end corresponding to %bar.
 define i8 @forward_store1() {
-; CHECK-LABEL: forward_store1
-; CHECK: %foo = call ptr @llvm.invariant.start.p0
-; CHECK-NOT: load
-; CHECK: %bar = call ptr @llvm.invariant.start.p0
-; CHECK: call void @llvm.invariant.end.p0(ptr %bar, i64 1, ptr %a)
-; CHECK: ret i8 0
+; CHECK-LABEL: define i8 @forward_store1() {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 0, ptr [[A]], align 1
+; CHECK-NEXT:    [[FOO:%.*]] = call ptr @llvm.invariant.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT:    [[BAR:%.*]] = call ptr @llvm.invariant.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT:    call void @llvm.invariant.end.p0(ptr [[BAR]], i64 1, ptr [[A]])
+; CHECK-NEXT:    ret i8 0
+;
   %a = alloca i8
   store i8 0, ptr %a
   %foo = call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
diff --git a/llvm/test/Transforms/NewGVN/lifetime-simple.ll b/llvm/test/Transforms/NewGVN/lifetime-simple.ll
index 5d31101218ee6..55e46111fc9c9 100644
--- a/llvm/test/Transforms/NewGVN/lifetime-simple.ll
+++ b/llvm/test/Transforms/NewGVN/lifetime-simple.ll
@@ -1,12 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
 
 define i8 @test(ptr %P) nounwind {
-; CHECK: lifetime.start
-; CHECK-NOT: load
-; CHECK: lifetime.end
+; CHECK-LABEL: define i8 @test(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[P]])
+; CHECK-NEXT:    store i8 1, ptr [[P]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[P]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[P]], align 1
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
 entry:
   call void @llvm.lifetime.start.p0(i64 32, ptr %P)
   %0 = load i8, ptr %P
diff --git a/llvm/test/Transforms/NewGVN/load-constant-mem.ll b/llvm/test/Transforms/NewGVN/load-constant-mem.ll
index 06439c59f9d2d..ae91147237fad 100644
--- a/llvm/test/Transforms/NewGVN/load-constant-mem.ll
+++ b/llvm/test/Transforms/NewGVN/load-constant-mem.ll
@@ -7,14 +7,14 @@ define i32 @test(ptr %p, i32 %i) nounwind {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P:%.*]] = getelementptr [4 x i32], ptr @G, i32 0, i32 [[I:%.*]]
-; CHECK-NEXT:    store i8 4, ptr [[P:%.*]]
+; CHECK-NEXT:    store i8 4, ptr [[P1:%.*]], align 1
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
-  %P = getelementptr [4 x i32], ptr @G, i32 0, i32 %i
-  %A = load i32, ptr %P
+  %p.i = getelementptr [4 x i32], ptr @G, i32 0, i32 %i
+  %A = load i32, ptr %p.i
   store i8 4, ptr %p
-  %B = load i32, ptr %P
+  %B = load i32, ptr %p.i
   %C = sub i32 %A, %B
   ret i32 %C
 }
diff --git a/llvm/test/Transforms/NewGVN/load-from-unreachable-predecessor.ll b/llvm/test/Transforms/NewGVN/load-from-unreachable-predecessor.ll
index 74cb70066eb1d..3ca9b9efbe4c2 100644
--- a/llvm/test/Transforms/NewGVN/load-from-unreachable-predecessor.ll
+++ b/llvm/test/Transforms/NewGVN/load-from-unreachable-predecessor.ll
@@ -1,12 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 ; Check that an unreachable predecessor to a PHI node doesn't cause a crash.
 ; PR21625.
 
 define i32 @f(ptr %f) {
-; CHECK: bb0:
+; CHECK-LABEL: define i32 @f(
+; CHECK-SAME: ptr [[F:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br i1 false, label [[BB1:%.*]], label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = load i32, ptr null, align 4
+; CHECK-NEXT:    ret i32 [[STOREMERGE]]
+;
 ; Load should be removed, since it's ignored.
-; CHECK-NEXT: br label
 bb0:
   %bar = load ptr, ptr %f
   br label %bb2
diff --git a/llvm/test/Transforms/NewGVN/loadforward.ll b/llvm/test/Transforms/NewGVN/loadforward.ll
index d8a9022dc66f6..85ceafd433f45 100644
--- a/llvm/test/Transforms/NewGVN/loadforward.ll
+++ b/llvm/test/Transforms/NewGVN/loadforward.ll
@@ -9,8 +9,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ;; Test that we forward the first store to the second load
 define i16 @bazinga() {
 ; CHECK-LABEL: @bazinga(
-; CHECK-NEXT:    [[_TMP10:%.*]] = load i16, ptr getelementptr inbounds (%rec11, ptr @str, i64 0, i32 1)
-; CHECK-NEXT:    store i16 [[_TMP10]], ptr @str
+; CHECK-NEXT:    [[_TMP10:%.*]] = load i16, ptr getelementptr inbounds ([[REC11:%.*]], ptr @str, i64 0, i32 1), align 2
+; CHECK-NEXT:    store i16 [[_TMP10]], ptr @str, align 2
 ; CHECK-NEXT:    [[_TMP15:%.*]] = icmp eq i16 [[_TMP10]], 3
 ; CHECK-NEXT:    [[_TMP16:%.*]] = select i1 [[_TMP15]], i16 1, i16 0
 ; CHECK-NEXT:    br label [[BB1:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/malloc-load-removal.ll b/llvm/test/Transforms/NewGVN/malloc-load-removal.ll
index b487acfe93e47..3e11d9221c9dd 100644
--- a/llvm/test/Transforms/NewGVN/malloc-load-removal.ll
+++ b/llvm/test/Transforms/NewGVN/malloc-load-removal.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn < %s | FileCheck %s
 ; PR13694
 
@@ -7,6 +8,17 @@ target triple = "x86_64-apple-macosx10.8.0"
 declare ptr @malloc(i64) nounwind allockind("alloc,uninitialized") allocsize(0) "alloc-family"="malloc"
 
 define noalias ptr @test1() nounwind uwtable ssp {
+; CHECK-LABEL: define noalias ptr @test1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @malloc(i64 100) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 0, ptr [[CALL]], align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
 entry:
   %call = tail call ptr @malloc(i64 100) nounwind
   %0 = load i8, ptr %call, align 1
@@ -20,14 +32,22 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %if.then, %entry
   ret ptr %call
 
-; CHECK-LABEL: @test1(
-; CHECK-NOT: load
-; CHECK-NOT: icmp
 }
 
 declare ptr @_Znwm(i64) nounwind
 
 define noalias ptr @test2() nounwind uwtable ssp {
+; CHECK-LABEL: define noalias ptr @test2(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @_Znwm(i64 100) #[[ATTR2]]
+; CHECK-NEXT:    br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 0, ptr [[CALL]], align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
 entry:
   %call = tail call ptr @_Znwm(i64 100) nounwind
   %0 = load i8, ptr %call, align 1
@@ -41,14 +61,22 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %if.then, %entry
   ret ptr %call
 
-; CHECK-LABEL: @test2(
-; CHECK-NOT: load
-; CHECK-NOT: icmp
 }
 
 declare ptr @aligned_alloc(i64 allocalign, i64) nounwind allockind("alloc,uninitialized,aligned") allocsize(1) "alloc-family"="malloc"
 
 define noalias ptr @test3() nounwind uwtable ssp {
+; CHECK-LABEL: define noalias ptr @test3(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @aligned_alloc(i64 256, i64 32) #[[ATTR2]]
+; CHECK-NEXT:    br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 0, ptr [[CALL]], align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
 entry:
   %call = tail call ptr @aligned_alloc(i64 256, i64 32) nounwind
   %0 = load i8, ptr %call, align 32
@@ -62,7 +90,4 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %if.then, %entry
   ret ptr %call
 
-; CHECK-LABEL: @test3(
-; CHECK-NOT: load
-; CHECK-NOT: icmp
 }
diff --git a/llvm/test/Transforms/NewGVN/memory-handling.ll b/llvm/test/Transforms/NewGVN/memory-handling.ll
index b1e1c4f2de3e1..c72ff749ba410 100644
--- a/llvm/test/Transforms/NewGVN/memory-handling.ll
+++ b/llvm/test/Transforms/NewGVN/memory-handling.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ;; This test is really dependent on propagating a lot of memory info around, but in the end, not
 ;; screwing up a single add.
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
@@ -20,6 +21,121 @@ declare ptr @__ctype_b_loc() local_unnamed_addr #1
 
 ; Function Attrs: nounwind uwtable
 define void @BuildMask(ptr nocapture readonly) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @BuildMask(
+; CHECK-SAME: ptr nocapture readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr align 16 @alPhrase, i8 0, i64 416, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainMask, i8 0, i64 16, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainSign, i8 0, i64 16, i1 false)
+; CHECK-NEXT:    br label [[DOTSINK_SPLIT:%.*]]
+; CHECK:       .sink.split:
+; CHECK-NEXT:    [[DOT0:%.*]] = phi ptr [ [[TMP0]], [[TMP1:%.*]] ], [ [[TMP3:%.*]], [[TMP14:%.*]] ]
+; CHECK-NEXT:    [[DOTSINK:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP22:%.*]], [[TMP14]] ]
+; CHECK-NEXT:    store i32 [[DOTSINK]], ptr @cchPhraseLength, align 4, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT:    br label [[TMP2:%.*]]
+; CHECK:       2:
+; CHECK-NEXT:    [[DOT1:%.*]] = phi ptr [ [[DOT0]], [[DOTSINK_SPLIT]] ], [ [[TMP3]], [[TMP6:%.*]] ]
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[DOT1]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[DOT1]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[DOTPREHEADER_PREHEADER:%.*]], label [[TMP6]]
+; CHECK:       .preheader.preheader:
+; CHECK-NEXT:    br label [[DOTPREHEADER:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call ptr @__ctype_b_loc() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i8 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP10]], align 2, !tbaa [[TBAA7:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = and i16 [[TMP11]], 1024
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i16 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[TMP13]], label [[TMP2]], label [[TMP14]]
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i8 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call i32 @tolower(i32 [[TMP15]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], -97
+; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 16, !tbaa [[TBAA9:![0-9]+]]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP20]], 1
+; CHECK-NEXT:    store i32 [[TMP21]], ptr [[TMP19]], align 16, !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP22]] = add nsw i32 [[DOTSINK]], 1
+; CHECK-NEXT:    br label [[DOTSINK_SPLIT]]
+; CHECK:       .preheader:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[DOTPREHEADER_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[TMP57:%.*]] ]
+; CHECK-NEXT:    [[DOT04961:%.*]] = phi i32 [ [[DOT2:%.*]], [[TMP57]] ], [ 0, [[DOTPREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[DOT05160:%.*]] = phi i32 [ [[DOT253:%.*]], [[TMP57]] ], [ 0, [[DOTPREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 16, !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i32 [[TMP24]], 0
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [26 x i32], ptr @auGlobalFrequency, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[TMP27:%.*]], label [[TMP28:%.*]]
+; CHECK:       27:
+; CHECK-NEXT:    store i32 -1, ptr [[TMP26]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    br label [[TMP57]]
+; CHECK:       28:
+; CHECK-NEXT:    store i32 0, ptr [[TMP26]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP24]] to i64
+; CHECK-NEXT:    br i1 false, label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]]
+; CHECK:       .lr.ph.preheader:
+; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
+; CHECK:       .lr.ph:
+; CHECK-NEXT:    [[DOT04658:%.*]] = phi i64 [ [[TMP31:%.*]], [[DOTLR_PH]] ], [ 1, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    [[DOT04857:%.*]] = phi i32 [ [[TMP30:%.*]], [[DOTLR_PH]] ], [ 1, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP30]] = add nuw nsw i32 [[DOT04857]], 1
+; CHECK-NEXT:    [[TMP31]] = shl i64 [[DOT04658]], 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ult i64 [[TMP29]], [[TMP31]]
+; CHECK-NEXT:    br i1 [[TMP32]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[DOTLR_PH]]
+; CHECK:       ._crit_edge.loopexit:
+; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    [[DOT048_LCSSA:%.*]] = phi i32 [ poison, [[TMP28]] ], [ [[TMP30]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-NEXT:    [[DOT046_LCSSA:%.*]] = phi i64 [ poison, [[TMP28]] ], [ [[TMP31]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP33:%.*]] = add nsw i32 [[DOT048_LCSSA]], [[DOT04961]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ugt i32 [[TMP33]], 64
+; CHECK-NEXT:    br i1 [[TMP34]], label [[TMP35:%.*]], label [[TMP39:%.*]]
+; CHECK:       35:
+; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[DOT05160]], 1
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ugt i32 [[TMP36]], 1
+; CHECK-NEXT:    br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39]]
+; CHECK:       38:
+; CHECK-NEXT:    tail call void @Fatal(ptr @.str.7, i32 0)
+; CHECK-NEXT:    br label [[TMP39]]
+; CHECK:       39:
+; CHECK-NEXT:    [[DOT152:%.*]] = phi i32 [ [[DOT05160]], [[DOT_CRIT_EDGE]] ], [ [[TMP36]], [[TMP38]] ], [ [[TMP36]], [[TMP35]] ]
+; CHECK-NEXT:    [[DOT150:%.*]] = phi i32 [ [[DOT04961]], [[DOT_CRIT_EDGE]] ], [ 0, [[TMP38]] ], [ 0, [[TMP35]] ]
+; CHECK-NEXT:    [[TMP40:%.*]] = add i64 [[DOT046_LCSSA]], 4294967295
+; CHECK-NEXT:    [[TMP41:%.*]] = trunc i64 [[TMP40]] to i32
+; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 2
+; CHECK-NEXT:    store i32 [[TMP41]], ptr [[TMP42]], align 8, !tbaa [[TBAA11:![0-9]+]]
+; CHECK-NEXT:    [[TMP43:%.*]] = zext i32 [[DOT150]] to i64
+; CHECK-NEXT:    [[DOT046_:%.*]] = shl i64 [[DOT046_LCSSA]], [[TMP43]]
+; CHECK-NEXT:    [[TMP44:%.*]] = zext i32 [[DOT152]] to i64
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [2 x i64], ptr @aqMainSign, i64 0, i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP45]], align 8, !tbaa [[TBAA12:![0-9]+]]
+; CHECK-NEXT:    [[TMP47:%.*]] = or i64 [[TMP46]], [[DOT046_]]
+; CHECK-NEXT:    store i64 [[TMP47]], ptr [[TMP45]], align 8, !tbaa [[TBAA12]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i32, ptr [[TMP23]], align 16, !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP48]] to i64
+; CHECK-NEXT:    [[TMP50:%.*]] = shl i64 [[TMP49]], [[TMP43]]
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [2 x i64], ptr @aqMainMask, i64 0, i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP51]], align 8, !tbaa [[TBAA12]]
+; CHECK-NEXT:    [[TMP53:%.*]] = or i64 [[TMP50]], [[TMP52]]
+; CHECK-NEXT:    store i64 [[TMP53]], ptr [[TMP51]], align 8, !tbaa [[TBAA12]]
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 1
+; CHECK-NEXT:    store i32 [[DOT150]], ptr [[TMP54]], align 4, !tbaa [[TBAA14:![0-9]+]]
+; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 3
+; CHECK-NEXT:    store i32 [[DOT152]], ptr [[TMP55]], align 4, !tbaa [[TBAA15:![0-9]+]]
+; CHECK-NEXT:    [[TMP56:%.*]] = add nsw i32 [[DOT150]], [[DOT048_LCSSA]]
+; CHECK-NEXT:    br label [[TMP57]]
+; CHECK:       57:
+; CHECK-NEXT:    [[DOT253]] = phi i32 [ [[DOT05160]], [[TMP27]] ], [ [[DOT152]], [[TMP39]] ]
+; CHECK-NEXT:    [[DOT2]] = phi i32 [ [[DOT04961]], [[TMP27]] ], [ [[TMP56]], [[TMP39]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 26
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOTPREHEADER]], label [[TMP58:%.*]]
+; CHECK:       58:
+; CHECK-NEXT:    ret void
+;
   tail call void @llvm.memset.p0.i64(ptr align 16 @alPhrase, i8 0, i64 416, i1 false)
   tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainMask, i8 0, i64 16, i1 false)
   tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainSign, i8 0, i64 16, i1 false)
@@ -113,7 +229,6 @@ define void @BuildMask(ptr nocapture readonly) local_unnamed_addr #0 {
 ; If we screw up the revisitation of the users of store of %sink above
 ; we will end up propagating and simplifying this to 1 in the final output
 ; because we keep an optimistic assumption we should not.
-; CHECK:  add i32 %.05160, 1
   %37 = add i32 %.05160, 1
   %38 = icmp ugt i32 %37, 1
   br i1 %38, label %39, label %40
@@ -193,3 +308,20 @@ attributes #5 = { nounwind readonly }
 !14 = !{!"long", !3, i64 0}
 !15 = !{!11, !2, i64 4}
 !16 = !{!11, !2, i64 12}
+;.
+; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK: [[META6]] = !{!"any pointer", [[META3]], i64 0}
+; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+; CHECK: [[META8]] = !{!"short", [[META3]], i64 0}
+; CHECK: [[TBAA9]] = !{[[META10:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META10]] = !{!"", [[META2]], i64 0, [[META2]], i64 4, [[META2]], i64 8, [[META2]], i64 12}
+; CHECK: [[TBAA11]] = !{[[META10]], [[META2]], i64 8}
+; CHECK: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+; CHECK: [[META13]] = !{!"long", [[META3]], i64 0}
+; CHECK: [[TBAA14]] = !{[[META10]], [[META2]], i64 4}
+; CHECK: [[TBAA15]] = !{[[META10]], [[META2]], i64 12}
+;.
diff --git a/llvm/test/Transforms/NewGVN/metadata-nonnull.ll b/llvm/test/Transforms/NewGVN/metadata-nonnull.ll
index cd0922f901af4..5de4c581c051c 100644
--- a/llvm/test/Transforms/NewGVN/metadata-nonnull.ll
+++ b/llvm/test/Transforms/NewGVN/metadata-nonnull.ll
@@ -150,7 +150,7 @@ define ptr @test7(ptr %v0) {
 ; CHECK-LABEL: define ptr @test7
 ; CHECK-SAME: (ptr [[V0:%.*]]) {
 ; CHECK-NEXT:  top:
-; CHECK-NEXT:    [[V1:%.*]] = load ptr, ptr [[V0]], align 8, !nonnull !0
+; CHECK-NEXT:    [[V1:%.*]] = load ptr, ptr [[V0]], align 8, !nonnull [[META0:![0-9]+]]
 ; CHECK-NEXT:    call void @use2(ptr [[V1]])
 ; CHECK-NEXT:    br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
diff --git a/llvm/test/Transforms/NewGVN/metadata-simplify.ll b/llvm/test/Transforms/NewGVN/metadata-simplify.ll
index a84c581165c3a..e981e3702d560 100644
--- a/llvm/test/Transforms/NewGVN/metadata-simplify.ll
+++ b/llvm/test/Transforms/NewGVN/metadata-simplify.ll
@@ -9,11 +9,11 @@ define i1 @test1(ptr %arg, i1 %arg2) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load ptr, ptr [[ARG:%.*]], !nonnull !0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load ptr, ptr [[ARG:%.*]], align 8, !nonnull [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq ptr [[LOAD1]], null
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load ptr, ptr [[ARG]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load ptr, ptr [[ARG]], align 8
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq ptr [[LOAD2]], null
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -34,11 +34,11 @@ define i1 @test2(ptr %arg, i1 %arg2) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load ptr, ptr [[ARG:%.*]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq ptr [[LOAD1]], null
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load ptr, ptr [[ARG]], !nonnull !0
+; CHECK-NEXT:    [[LOAD2:%.*]] = load ptr, ptr [[ARG]], align 8, !nonnull [[META0]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq ptr [[LOAD2]], null
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -60,11 +60,11 @@ define i1 @test3(ptr %ptr, i1 %arg2) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], !range !1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !range [[RNG1:![0-9]+]]
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[LOAD1]], 999
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[LOAD2]], 999
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -85,11 +85,11 @@ define i1 @test4(ptr %ptr, i1 %arg2) {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[LOAD1]], 999
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], !range !1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4, !range [[RNG1]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[LOAD2]], 999
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -110,11 +110,11 @@ define i1 @test5(ptr %ptr, i1 %arg2) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], !range !1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !range [[RNG1]]
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[LOAD1]], 999
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[LOAD2]], 999
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -135,11 +135,11 @@ define i1 @test6(ptr %ptr, i1 %arg2) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[LOAD1]], 999
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], !range !1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4, !range [[RNG1]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[LOAD2]], 999
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
diff --git a/llvm/test/Transforms/NewGVN/noalias.ll b/llvm/test/Transforms/NewGVN/noalias.ll
index 5f0c5dd8a3aea..2cb5c196a7abf 100644
--- a/llvm/test/Transforms/NewGVN/noalias.ll
+++ b/llvm/test/Transforms/NewGVN/noalias.ll
@@ -1,10 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 define i32 @test1(ptr %p, ptr %q) {
-; CHECK-LABEL: @test1(ptr %p, ptr %q)
-; CHECK: load i32, ptr %p
-; CHECK-NOT: noalias
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test1(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = load i32, ptr %p, !noalias !3
   %b = load i32, ptr %p
   %c = add i32 %a, %b
@@ -12,9 +15,12 @@ define i32 @test1(ptr %p, ptr %q) {
 }
 
 define i32 @test2(ptr %p, ptr %q) {
-; CHECK-LABEL: @test2(ptr %p, ptr %q)
-; CHECK: load i32, ptr %p, align 4, !alias.scope ![[SCOPE1:[0-9]+]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test2(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[P]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = load i32, ptr %p, !alias.scope !3
   %b = load i32, ptr %p, !alias.scope !3
   %c = add i32 %a, %b
@@ -22,17 +28,18 @@ define i32 @test2(ptr %p, ptr %q) {
 }
 
 define i32 @test3(ptr %p, ptr %q) {
-; CHECK-LABEL: @test3(ptr %p, ptr %q)
-; CHECK: load i32, ptr %p, align 4, !alias.scope ![[SCOPE2:[0-9]+]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test3(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[P]], align 4, !alias.scope [[META3:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = load i32, ptr %p, !alias.scope !4
   %b = load i32, ptr %p, !alias.scope !5
   %c = add i32 %a, %b
   ret i32 %c
 }
 
-; CHECK:   ![[SCOPE1]] = !{!{{[0-9]+}}}
-; CHECK:   ![[SCOPE2]] = !{!{{[0-9]+}}, !{{[0-9]+}}}
 declare i32 @foo(ptr) readonly
 
 !0 = distinct !{!0, !2, !"callee0: %a"}
@@ -42,3 +49,10 @@ declare i32 @foo(ptr) readonly
 !3 = !{!0}
 !4 = !{!1}
 !5 = !{!0, !1}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"callee0: %a"}
+; CHECK: [[META2]] = distinct !{[[META2]], !"callee0"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META1]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META2]], !"callee0: %b"}
+;.
diff --git a/llvm/test/Transforms/NewGVN/nomemlocation.ll b/llvm/test/Transforms/NewGVN/nomemlocation.ll
index 0f716b30801b2..332e6c6304c31 100644
--- a/llvm/test/Transforms/NewGVN/nomemlocation.ll
+++ b/llvm/test/Transforms/NewGVN/nomemlocation.ll
@@ -1,8 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -S -p='newgvn' | FileCheck %s
 ; MemorySSA should be able to handle a clobber query with an empty MemoryLocation.
 
-; CHECK: @userread
 define ptr @userread(ptr %p) {
+; CHECK-LABEL: define ptr @userread(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[POS:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[DIFF:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[POS]]
+; CHECK-NEXT:    [[LD:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[READVAL:%.*]] = call i64 @fread(ptr noundef nonnull [[GEP]], i64 noundef 1, i64 noundef [[POS]], ptr noundef [[LD]])
+; CHECK-NEXT:    [[READVALISPOS:%.*]] = icmp eq i64 [[READVAL]], [[POS]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[READVALISPOS]])
+; CHECK-NEXT:    [[DIFF]] = sub i64 0, [[POS]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/NewGVN/non-integral-pointers.ll b/llvm/test/Transforms/NewGVN/non-integral-pointers.ll
index 6119577d3b242..1e3da6e295d16 100644
--- a/llvm/test/Transforms/NewGVN/non-integral-pointers.ll
+++ b/llvm/test/Transforms/NewGVN/non-integral-pointers.ll
@@ -1,37 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:4"
 target triple = "x86_64-unknown-linux-gnu"
 
 define void @f0(i1 %alwaysFalse, i64 %val, ptr %loc) {
-; CHECK-LABEL: @f0(
-; CHECK-NOT: inttoptr
-; CHECK-NOT: ptrtoint
- entry:
+; CHECK-LABEL: define void @f0(
+; CHECK-SAME: i1 [[ALWAYSFALSE:%.*]], i64 [[VAL:%.*]], ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i64 [[VAL]], ptr [[LOC]], align 8
+; CHECK-NEXT:    br i1 [[ALWAYSFALSE]], label [[NEVERTAKEN:%.*]], label [[ALWAYSTAKEN:%.*]]
+; CHECK:       neverTaken:
+; CHECK-NEXT:    [[PTR:%.*]] = load ptr addrspace(4), ptr [[LOC]], align 8
+; CHECK-NEXT:    store i8 5, ptr addrspace(4) [[PTR]], align 1
+; CHECK-NEXT:    ret void
+; CHECK:       alwaysTaken:
+; CHECK-NEXT:    ret void
+;
+  entry:
   store i64 %val, ptr %loc
   br i1 %alwaysFalse, label %neverTaken, label %alwaysTaken
 
- neverTaken:
+  neverTaken:
   %ptr = load ptr addrspace(4), ptr %loc
   store i8 5, ptr addrspace(4) %ptr
   ret void
 
- alwaysTaken:
+  alwaysTaken:
   ret void
 }
 
 define i64 @f1(i1 %alwaysFalse, ptr addrspace(4) %val, ptr %loc) {
-; CHECK-LABEL: @f1(
-; CHECK-NOT: inttoptr
-; CHECK-NOT: ptrtoint
- entry:
+; CHECK-LABEL: define i64 @f1(
+; CHECK-SAME: i1 [[ALWAYSFALSE:%.*]], ptr addrspace(4) [[VAL:%.*]], ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store ptr addrspace(4) [[VAL]], ptr [[LOC]], align 8
+; CHECK-NEXT:    br i1 [[ALWAYSFALSE]], label [[NEVERTAKEN:%.*]], label [[ALWAYSTAKEN:%.*]]
+; CHECK:       neverTaken:
+; CHECK-NEXT:    [[INT:%.*]] = load i64, ptr [[LOC]], align 8
+; CHECK-NEXT:    ret i64 [[INT]]
+; CHECK:       alwaysTaken:
+; CHECK-NEXT:    ret i64 42
+;
+  entry:
   store ptr addrspace(4) %val, ptr %loc
   br i1 %alwaysFalse, label %neverTaken, label %alwaysTaken
 
- neverTaken:
+  neverTaken:
   %int = load i64, ptr %loc
   ret i64 %int
 
- alwaysTaken:
+  alwaysTaken:
   ret i64 42
 }
diff --git a/llvm/test/Transforms/NewGVN/null-aliases-nothing.ll b/llvm/test/Transforms/NewGVN/null-aliases-nothing.ll
index 666621119d4cb..25295fac7400d 100644
--- a/llvm/test/Transforms/NewGVN/null-aliases-nothing.ll
+++ b/llvm/test/Transforms/NewGVN/null-aliases-nothing.ll
@@ -1,19 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 %t = type { i32 }
 declare void @test1f(ptr)
 
 define void @test1(ptr noalias %stuff ) {
-    %before = load i32, ptr %stuff
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: ptr noalias [[STUFF:%.*]]) {
+; CHECK-NEXT:    [[BEFORE:%.*]] = load i32, ptr [[STUFF]], align 4
+; CHECK-NEXT:    call void @test1f(ptr null)
+; CHECK-NEXT:    [[SUM:%.*]] = add i32 [[BEFORE]], [[BEFORE]]
+; CHECK-NEXT:    store i32 [[SUM]], ptr [[STUFF]], align 4
+; CHECK-NEXT:    ret void
+;
+  %before = load i32, ptr %stuff
 
-    call void @test1f(ptr null)
+  call void @test1f(ptr null)
 
-    %after = load i32, ptr %stuff ; <--- This should be a dead load
-    %sum = add i32 %before, %after
+  %after = load i32, ptr %stuff ; <--- This should be a dead load
+  %sum = add i32 %before, %after
 
-    store i32 %sum, ptr %stuff
-    ret void
-; CHECK: load
-; CHECK-NOT: load
-; CHECK: ret void
+  store i32 %sum, ptr %stuff
+  ret void
 }
diff --git a/llvm/test/Transforms/NewGVN/phi-edge-handling.ll b/llvm/test/Transforms/NewGVN/phi-edge-handling.ll
index 8dfac7942210c..0e871b6b261a4 100644
--- a/llvm/test/Transforms/NewGVN/phi-edge-handling.ll
+++ b/llvm/test/Transforms/NewGVN/phi-edge-handling.ll
@@ -9,8 +9,8 @@ define i16 @hoge() {
 ; CHECK-LABEL: @hoge(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    switch i8 undef, label [[BB7:%.*]] [
-; CHECK-NEXT:    i8 0, label [[BB1:%.*]]
-; CHECK-NEXT:    i8 12, label [[BB2:%.*]]
+; CHECK-NEXT:      i8 0, label [[BB1:%.*]]
+; CHECK-NEXT:      i8 12, label [[BB2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB6:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll b/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll
index ac07148c75521..573a4c0bd0bcb 100644
--- a/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll
+++ b/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll
@@ -88,8 +88,8 @@ define void @pr42422(i1 %c.1, i1 %c.2) {
 ; CHECK:       bb16:
 ; CHECK-NEXT:    [[TMP17:%.*]] = phi i32 [ poison, [[BB15]] ], [ 1, [[BB14]] ], [ 9, [[BB7]] ]
 ; CHECK-NEXT:    switch i32 [[TMP17]], label [[BB19]] [
-; CHECK-NEXT:    i32 0, label [[BB6]]
-; CHECK-NEXT:    i32 9, label [[BB18:%.*]]
+; CHECK-NEXT:      i32 0, label [[BB6]]
+; CHECK-NEXT:      i32 9, label [[BB18:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb18:
 ; CHECK-NEXT:    br label [[BB19]]
diff --git a/llvm/test/Transforms/NewGVN/phi-translate-partial-alias.ll b/llvm/test/Transforms/NewGVN/phi-translate-partial-alias.ll
index cb24b050a599e..7dd41904970c0 100644
--- a/llvm/test/Transforms/NewGVN/phi-translate-partial-alias.ll
+++ b/llvm/test/Transforms/NewGVN/phi-translate-partial-alias.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
@@ -6,12 +7,19 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; not actually redundant around the loop backedge, despite appearances
 ; if phi-translation is ignored.
 
-; CHECK: define void @test0(ptr %begin)
-; CHECK: loop:
-; CHECK:   %l0 = load i8, ptr %phi
-; CHECK:   call void @bar(i8 %l0)
-; CHECK:   %l1 = load i8, ptr %phi
 define void @test0(ptr %begin) {
+; CHECK-LABEL: define void @test0(
+; CHECK-SAME: ptr [[BEGIN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[BEGIN]], [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[L0:%.*]] = load i8, ptr [[PHI]], align 1
+; CHECK-NEXT:    call void @bar(i8 [[L0]])
+; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[PHI]], align 1
+; CHECK-NEXT:    [[NEXT]] = getelementptr inbounds i8, ptr [[PHI]], i8 [[L1]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/NewGVN/pr17732.ll b/llvm/test/Transforms/NewGVN/pr17732.ll
index 427543d3aae7a..e66547cb9e193 100644
--- a/llvm/test/Transforms/NewGVN/pr17732.ll
+++ b/llvm/test/Transforms/NewGVN/pr17732.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S -o - < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -12,6 +13,12 @@ target triple = "x86_64-unknown-linux-gnu"
 @vector_with_zeroinit = common global %struct.with_vector zeroinitializer, align 4
 
 define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 @array_with_zeroinit, ptr align 4 @main.obj_with_array, i64 12, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 @vector_with_zeroinit, ptr align 4 @main.obj_with_vector, i64 12, i1 false)
+; CHECK-NEXT:    ret i32 1
+;
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 @array_with_zeroinit, ptr align 4 @main.obj_with_array, i64 12, i1 false)
   %0 = load i8, ptr getelementptr inbounds (%struct.with_array, ptr @array_with_zeroinit, i64 0, i32 2), align 4
@@ -22,8 +29,6 @@ entry:
   %conv1 = sext i8 %1 to i32
   %and = and i32 %conv0, %conv1
   ret i32 %and
-; CHECK-LABEL: define i32 @main(
-; CHECK: ret i32 1
 }
 
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
diff --git a/llvm/test/Transforms/NewGVN/pr17852.ll b/llvm/test/Transforms/NewGVN/pr17852.ll
index 5858982ce1cd8..bffde12416444 100644
--- a/llvm/test/Transforms/NewGVN/pr17852.ll
+++ b/llvm/test/Transforms/NewGVN/pr17852.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 %struct.S0 = type { [2 x i8], [2 x i8], [4 x i8], [2 x i8], i32, i32, i32, i32 }
diff --git a/llvm/test/Transforms/NewGVN/pr24397.ll b/llvm/test/Transforms/NewGVN/pr24397.ll
index f3f112ae1fe31..d9981443522e7 100644
--- a/llvm/test/Transforms/NewGVN/pr24397.ll
+++ b/llvm/test/Transforms/NewGVN/pr24397.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/NewGVN/pr24426.ll b/llvm/test/Transforms/NewGVN/pr24426.ll
index e8a88f5b267ab..8f0974aa0bd35 100644
--- a/llvm/test/Transforms/NewGVN/pr24426.ll
+++ b/llvm/test/Transforms/NewGVN/pr24426.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=memcpyopt,mldst-motion,newgvn -S | FileCheck %s
 
 declare void @check(i8)
@@ -5,11 +6,17 @@ declare void @check(i8)
 declare void @write(ptr %res)
 
 define void @test1() {
+; CHECK-LABEL: define void @test1() {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [10 x i8], align 1
+; CHECK-NEXT:    call void @write(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; CHECK-NEXT:    call void @check(i8 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
   %1 = alloca [10 x i8]
   call void @write(ptr %1)
   %2 = load i8, ptr %1
 
-; CHECK-NOT: undef
   call void @check(i8 %2)
 
   ret void
diff --git a/llvm/test/Transforms/NewGVN/pr25440.ll b/llvm/test/Transforms/NewGVN/pr25440.ll
index b3ebf447afc2e..9d9c4cda34258 100644
--- a/llvm/test/Transforms/NewGVN/pr25440.ll
+++ b/llvm/test/Transforms/NewGVN/pr25440.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ;RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
@@ -10,19 +11,51 @@ target triple = "thumbv7--linux-gnueabi"
 
 ; Function Attrs: nounwind
 define fastcc void @foo(ptr nocapture readonly %x) {
-;CHECK-LABEL: foo
+; CHECK-LABEL: define fastcc void @foo(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB0:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[X_TR:%.*]] = phi ptr [ [[X]], [[ENTRY:%.*]] ], [ null, [[LAND_LHS_TRUE:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[X_TR]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    switch i32 [[CONV]], label [[IF_END_50:%.*]] [
+; CHECK-NEXT:      i32 43, label [[CLEANUP:%.*]]
+; CHECK-NEXT:      i32 52, label [[IF_THEN_5:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       if.then.5:
+; CHECK-NEXT:    br i1 undef, label [[LAND_LHS_TRUE]], label [[IF_THEN_26:%.*]]
+; CHECK:       land.lhs.true:
+; CHECK-NEXT:    br i1 undef, label [[CLEANUP]], label [[BB0]]
+; CHECK:       if.then.26:
+; CHECK-NEXT:    br i1 undef, label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    [[MODE:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[X_TR]], i32 0, i32 1
+; CHECK-NEXT:    [[BF_LOAD:%.*]] = load i16, ptr [[MODE]], align 2
+; CHECK-NEXT:    br label [[COND_END]]
+; CHECK:       cond.end:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN_44:%.*]], label [[CLEANUP]]
+; CHECK:       if.then.44:
+; CHECK-NEXT:    unreachable
+; CHECK:       if.end.50:
+; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds [0 x i32], ptr @length, i32 0, i32 [[CONV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX52]], align 4
+; CHECK-NEXT:    br i1 undef, label [[FOR_BODY_57:%.*]], label [[CLEANUP]]
+; CHECK:       for.body.57:
+; CHECK-NEXT:    unreachable
+; CHECK:       cleanup:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %bb0
 
 bb0:                                      ; preds = %land.lhs.true, %entry
-;CHECK: bb0:
   %x.tr = phi ptr [ %x, %entry ], [ null, %land.lhs.true ]
   %0 = load i16, ptr %x.tr, align 4
-; CHECK: load i16, ptr
   %conv = zext i16 %0 to i32
   switch i32 %conv, label %if.end.50 [
-    i32 43, label %cleanup
-    i32 52, label %if.then.5
+  i32 43, label %cleanup
+  i32 52, label %if.then.5
   ]
 
 if.then.5:                                        ; preds = %bb0
@@ -36,8 +69,6 @@ if.then.26:                                       ; preds = %if.then.5
   br i1 undef, label %cond.end, label %cond.false
 
 cond.false:                                       ; preds = %if.then.26
-; CHECK: cond.false:
-; CHECK: load i16
   %mode = getelementptr inbounds %struct.a, ptr %x.tr.lcssa163, i32 0, i32 1
   %bf.load = load i16, ptr %mode, align 2
   %bf.shl = shl i16 %bf.load, 8
@@ -50,7 +81,6 @@ if.then.44:                                       ; preds = %cond.end
   unreachable
 
 if.end.50:                                        ; preds = %bb0
-;%CHECK: if.end.50:
   %conv.lcssa = phi i32 [ %conv, %bb0 ]
   %arrayidx52 = getelementptr inbounds [0 x i32], ptr @length, i32 0, i32 %conv.lcssa
   %1 = load i32, ptr %arrayidx52, align 4
@@ -68,7 +98,34 @@ cleanup:                                          ; preds = %if.end.50, %cond.en
 @dfg_text = external global ptr, align 4
 
 define void @dfg_lex() {
-;CHECK-LABEL: dfg_lex
+; CHECK-LABEL: define void @dfg_lex() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_BODYTHREAD_PRE_SPLIT:%.*]]
+; CHECK:       while.bodythread-pre-split:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN_14:%.*]], label [[IF_END_15:%.*]]
+; CHECK:       if.then.14:
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr @dfg_text, align 4
+; CHECK-NEXT:    br label [[IF_END_15]]
+; CHECK:       if.end.15:
+; CHECK-NEXT:    [[V2:%.*]] = load ptr, ptr @yy_c_buf_p, align 4
+; CHECK-NEXT:    br label [[WHILE_COND_16:%.*]]
+; CHECK:       while.cond.16:
+; CHECK-NEXT:    br i1 undef, label [[WHILE_COND_16]], label [[WHILE_END:%.*]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[V2]], i32 undef
+; CHECK-NEXT:    store ptr [[ADD_PTR]], ptr @dfg_text, align 4
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST25:%.*]] = ptrtoint ptr [[ADD_PTR]] to i32
+; CHECK-NEXT:    switch i32 undef, label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 65, label [[WHILE_BODYTHREAD_PRE_SPLIT]]
+; CHECK-NEXT:      i32 3, label [[RETURN:%.*]]
+; CHECK-NEXT:      i32 57, label [[WHILE_BODYTHREAD_PRE_SPLIT]]
+; CHECK-NEXT:      i32 60, label [[IF_THEN_14]]
+; CHECK-NEXT:    ]
+; CHECK:       sw.default:
+; CHECK-NEXT:    unreachable
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %while.bodythread-pre-split
 
@@ -93,10 +150,10 @@ while.end:                                        ; preds = %while.cond.16
   %sub.ptr.rhs.cast25 = ptrtoint ptr %add.ptr to i32
   %sub.ptr.sub26 = sub i32 0, %sub.ptr.rhs.cast25
   switch i32 undef, label %sw.default [
-    i32 65, label %while.bodythread-pre-split
-    i32 3, label %return
-    i32 57, label %while.bodythread-pre-split
-    i32 60, label %if.then.14
+  i32 65, label %while.bodythread-pre-split
+  i32 3, label %return
+  i32 57, label %while.bodythread-pre-split
+  i32 60, label %if.then.14
   ]
 
 sw.default:                                       ; preds = %while.end
diff --git a/llvm/test/Transforms/NewGVN/pr28562.ll b/llvm/test/Transforms/NewGVN/pr28562.ll
index a62fdd31da721..320224c24b5bc 100644
--- a/llvm/test/Transforms/NewGVN/pr28562.ll
+++ b/llvm/test/Transforms/NewGVN/pr28562.ll
@@ -1,9 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn < %s | FileCheck %s
 define ptr @test1(ptr %a) {
+; CHECK-LABEL: define ptr @test1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:    [[X2:%.*]] = getelementptr i32, ptr [[A]], i32 10
+; CHECK-NEXT:    ret ptr [[X2]]
+;
   %x1 = getelementptr inbounds i32, ptr %a, i32 10
   %x2 = getelementptr i32, ptr %a, i32 10
   ret ptr %x2
-; CHECK-LABEL: @test1(
-; CHECK: %[[x:.*]] = getelementptr i32, ptr %a, i32 10
-; CHECK: ret ptr %[[x]]
 }
diff --git a/llvm/test/Transforms/NewGVN/pr31472.ll b/llvm/test/Transforms/NewGVN/pr31472.ll
index 8bb9a14e81365..8eeb61400f814 100644
--- a/llvm/test/Transforms/NewGVN/pr31472.ll
+++ b/llvm/test/Transforms/NewGVN/pr31472.ll
@@ -9,11 +9,13 @@ target triple = "x86_64-apple-macosx10.12.0"
 define i32 @main() personality ptr @__gxx_personality_v0{
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:    [[TMP1:%.*]] = invoke i32 @foo()
-; CHECK-NEXT:    to label %good unwind label %bad
+; CHECK-NEXT:            to label [[GOOD:%.*]] unwind label [[BAD:%.*]]
 ; CHECK:       good:
 ; CHECK-NEXT:    ret i32 5
 ; CHECK:       bad:
-; CHECK-NEXT:    [[TMP2:%.*]] = landingpad { ptr, i32
+; CHECK-NEXT:    [[TMP2:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT:            cleanup
+; CHECK-NEXT:    resume { ptr, i32 } [[TMP2]]
 ;
   %1 = invoke i32 @foo()
   to label %good unwind label %bad
diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll
index fe957dec72cf1..0e7461c2612b9 100644
--- a/llvm/test/Transforms/NewGVN/pr31483.ll
+++ b/llvm/test/Transforms/NewGVN/pr31483.ll
@@ -10,20 +10,20 @@ define signext i32 @ham(ptr %arg, ptr %arg1) #0 {
 ; CHECK-LABEL: @ham(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    store ptr %arg1, ptr [[TMP]], align 8
-; CHECK-NEXT:    br label %bb2
+; CHECK-NEXT:    store ptr [[ARG1:%.*]], ptr [[TMP]], align 8
+; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ %arg, %bb ], [ %tmp7, %bb22 ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ [[ARG:%.*]], [[BB:%.*]] ], [ [[TMP7:%.*]], [[BB22:%.*]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label %bb6, label %bb23
+; CHECK-NEXT:    br i1 [[TMP5]], label [[BB6:%.*]], label [[BB23:%.*]]
 ; CHECK:       bb6:
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7]] = getelementptr inbounds i8, ptr [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP4]] to i32
-; CHECK-NEXT:    switch i32 [[TMP9]], label %bb22 [
-; CHECK-NEXT:    i32 115, label %bb10
-; CHECK-NEXT:    i32 105, label %bb16
-; CHECK-NEXT:    i32 99, label %bb16
+; CHECK-NEXT:    switch i32 [[TMP9]], label [[BB22]] [
+; CHECK-NEXT:      i32 115, label [[BB10:%.*]]
+; CHECK-NEXT:      i32 105, label [[BB16:%.*]]
+; CHECK-NEXT:      i32 99, label [[BB16]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb10:
 ; CHECK-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8
@@ -31,15 +31,15 @@ define signext i32 @ham(ptr %arg, ptr %arg1) #0 {
 ; CHECK-NEXT:    store ptr [[TMP12]], ptr [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[TMP15:%.*]] = call signext i32 (ptr, ...) @zot(ptr @global, ptr [[TMP14]])
-; CHECK-NEXT:    br label %bb22
+; CHECK-NEXT:    br label [[BB22]]
 ; CHECK:       bb16:
 ; CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 8
 ; CHECK-NEXT:    store ptr [[TMP18]], ptr [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 4
-; CHECK-NEXT:    br label %bb22
+; CHECK-NEXT:    br label [[BB22]]
 ; CHECK:       bb22:
-; CHECK-NEXT:    br label %bb2
+; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb23:
 ; CHECK-NEXT:    call void @llvm.va_end(ptr [[TMP]])
 ; CHECK-NEXT:    ret i32 undef
diff --git a/llvm/test/Transforms/NewGVN/pr31491.ll b/llvm/test/Transforms/NewGVN/pr31491.ll
index 5f6b371333bbf..f27f13ed24e08 100644
--- a/llvm/test/Transforms/NewGVN/pr31491.ll
+++ b/llvm/test/Transforms/NewGVN/pr31491.ll
@@ -7,13 +7,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define internal i32 @pr31491() {
 ; CHECK-LABEL: @pr31491(
 ; CHECK-NEXT:  bb5:
-; CHECK-NEXT:    br label %bb7
+; CHECK-NEXT:    br label [[BB7:%.*]]
 ; CHECK:       bb7:
-; CHECK-NEXT:    [[TMP:%.*]] = phi ptr [ [[TMP:%.*]]11, %bb10 ], [ undef, %bb5 ]
-; CHECK-NEXT:    br label %bb10
+; CHECK-NEXT:    [[TMP:%.*]] = phi ptr [ [[TMP11:%.*]], [[BB10:%.*]] ], [ undef, [[BB5:%.*]] ]
+; CHECK-NEXT:    br label [[BB10]]
 ; CHECK:       bb10:
-; CHECK-NEXT:    [[TMP11:%.*]] = tail call ptr @patatino(ptr [[TMP]])
-; CHECK-NEXT:    br label %bb7
+; CHECK-NEXT:    [[TMP11]] = tail call ptr @patatino(ptr [[TMP]])
+; CHECK-NEXT:    br label [[BB7]]
 ;
 bb5:
   br label %bb7
diff --git a/llvm/test/Transforms/NewGVN/pr31501.ll b/llvm/test/Transforms/NewGVN/pr31501.ll
index 55195fd6ada76..18bfcd1b9ca09 100644
--- a/llvm/test/Transforms/NewGVN/pr31501.ll
+++ b/llvm/test/Transforms/NewGVN/pr31501.ll
@@ -52,30 +52,30 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr #0 align 2 {
 ; CHECK-LABEL: @quux(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds %struct.barney, ptr %arg, i64 0, i32 3, i32 0, i32 0, i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa !2
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds %struct.barney, ptr %arg, i64 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8, !tbaa !7
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_BARNEY:%.*]], ptr [[ARG:%.*]], i64 0, i32 3, i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa [[TBAA2:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_BARNEY]], ptr [[ARG]], i64 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8, !tbaa [[TBAA7:![0-9]+]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq ptr [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %bb21, label %bb8
+; CHECK-NEXT:    br i1 [[TMP7]], label [[BB21:%.*]], label [[BB8:%.*]]
 ; CHECK:       bb8:
-; CHECK-NEXT:    br label %bb11
+; CHECK-NEXT:    br label [[BB11:%.*]]
 ; CHECK:       bb9:
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq ptr [[TMP18:%.*]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP10]], label %bb19, label %bb11
+; CHECK-NEXT:    br i1 [[TMP10]], label [[BB19:%.*]], label [[BB11]]
 ; CHECK:       bb11:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP17:%.*]], %bb9 ], [ undef, %bb8 ]
-; CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[TMP18]], %bb9 ], [ [[TMP3]], %bb8 ]
-; CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[TMP13]], align 8, !tbaa !8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq ptr [[TMP15]], %arg1
+; CHECK-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP17:%.*]], [[BB9:%.*]] ], [ undef, [[BB8]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[TMP18]], [[BB9]] ], [ [[TMP3]], [[BB8]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[TMP13]], align 8, !tbaa [[TBAA8:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq ptr [[TMP15]], [[ARG1:%.*]]
 ; CHECK-NEXT:    [[TMP17]] = select i1 [[TMP16]], ptr [[TMP13]], ptr [[TMP12]]
-; CHECK-NEXT:    [[TMP18]] = getelementptr inbounds %struct.foo, ptr [[TMP13]], i64 1
-; CHECK-NEXT:    br i1 [[TMP16]], label %bb19, label %bb9
+; CHECK-NEXT:    [[TMP18]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[TMP13]], i64 1
+; CHECK-NEXT:    br i1 [[TMP16]], label [[BB19]], label [[BB9]]
 ; CHECK:       bb19:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi ptr [ null, %bb9 ], [ [[TMP17]], %bb11 ]
-; CHECK-NEXT:    br label %bb21
+; CHECK-NEXT:    [[TMP20:%.*]] = phi ptr [ null, [[BB9]] ], [ [[TMP17]], [[BB11]] ]
+; CHECK-NEXT:    br label [[BB21]]
 ; CHECK:       bb21:
-; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ null, %bb ], [ [[TMP20]], %bb19 ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ null, [[BB:%.*]] ], [ [[TMP20]], [[BB19]] ]
 ; CHECK-NEXT:    ret ptr [[TMP22]]
 ;
 bb:
diff --git a/llvm/test/Transforms/NewGVN/pr31573.ll b/llvm/test/Transforms/NewGVN/pr31573.ll
index 2382c487c19af..7835e9d45a671 100644
--- a/llvm/test/Transforms/NewGVN/pr31573.ll
+++ b/llvm/test/Transforms/NewGVN/pr31573.ll
@@ -10,7 +10,7 @@ define void @patatino(ptr %blah) {
 ; CHECK:       while.cond:
 ; CHECK-NEXT:    [[MEH:%.*]] = phi ptr [ [[BLAH:%.*]], [[ENTRY:%.*]] ], [ null, [[WHILE_BODY:%.*]] ]
 ; CHECK-NEXT:    switch i32 undef, label [[WHILE_BODY]] [
-; CHECK-NEXT:    i32 666, label [[WHILE_END:%.*]]
+; CHECK-NEXT:      i32 666, label [[WHILE_END:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    br label [[WHILE_COND]]
diff --git a/llvm/test/Transforms/NewGVN/pr31594.ll b/llvm/test/Transforms/NewGVN/pr31594.ll
index 47294d5640e46..d1a02d62934de 100644
--- a/llvm/test/Transforms/NewGVN/pr31594.ll
+++ b/llvm/test/Transforms/NewGVN/pr31594.ll
@@ -10,8 +10,8 @@ define i1 @patatino(ptr %blah, i32 %choice) {
 ; CHECK:       while.cond:
 ; CHECK-NEXT:    [[FOO:%.*]] = phi ptr [ [[BLAH:%.*]], [[ENTRY:%.*]] ], [ null, [[WHILE_BODY:%.*]] ]
 ; CHECK-NEXT:    switch i32 [[CHOICE:%.*]], label [[WHILE_BODY]] [
-; CHECK-NEXT:    i32 -1, label [[WHILE_END:%.*]]
-; CHECK-NEXT:    i32 40, label [[LAND_END:%.*]]
+; CHECK-NEXT:      i32 -1, label [[WHILE_END:%.*]]
+; CHECK-NEXT:      i32 40, label [[LAND_END:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       land.end:
 ; CHECK-NEXT:    br label [[WHILE_END]]
@@ -66,7 +66,7 @@ define void @foo(ptr %arg) {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    store i8 0, ptr [[TMP]], align 1, !g !0
+; CHECK-NEXT:    store i8 0, ptr [[TMP]], align 1, !g [[META0:![0-9]+]]
 ; CHECK-NEXT:    br label [[BB4:%.*]]
 ; CHECK:       bb4:
 ; CHECK-NEXT:    br label [[BB6:%.*]]
@@ -74,13 +74,13 @@ define void @foo(ptr %arg) {
 ; CHECK-NEXT:    br i1 undef, label [[BB9:%.*]], label [[BB7:%.*]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    switch i8 0, label [[BB6]] [
-; CHECK-NEXT:    i8 6, label [[BB8:%.*]]
+; CHECK-NEXT:      i8 6, label [[BB8:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb8:
 ; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[BB4]]
 ; CHECK:       bb9:
-; CHECK-NEXT:    store i8 0, ptr [[ARG]], align 1, !g !0
+; CHECK-NEXT:    store i8 0, ptr [[ARG]], align 1, !g [[META0]]
 ; CHECK-NEXT:    unreachable
 ;
 bb:
diff --git a/llvm/test/Transforms/NewGVN/pr31613.ll b/llvm/test/Transforms/NewGVN/pr31613.ll
index 943cdbc113dc4..0bcf86a571118 100644
--- a/llvm/test/Transforms/NewGVN/pr31613.ll
+++ b/llvm/test/Transforms/NewGVN/pr31613.ll
@@ -74,7 +74,7 @@ declare void @c.d.p(i64, ptr)
 define void @e(i32 %a0, i32 %a1, ptr %p2) {
 ; CHECK-LABEL: @e(
 ; CHECK-NEXT:    [[F:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 [[A0:%.*]], ptr [[F]], align 4, !g !0
+; CHECK-NEXT:    store i32 [[A0:%.*]], ptr [[F]], align 4, !g [[META0:![0-9]+]]
 ; CHECK-NEXT:    br label [[H:%.*]]
 ; CHECK:       h:
 ; CHECK-NEXT:    call void @c.d.p(i64 8, ptr undef)
@@ -88,10 +88,10 @@ define void @e(i32 %a0, i32 %a1, ptr %p2) {
 ; CHECK-NEXT:    br label [[R]]
 ; CHECK:       r:
 ; CHECK-NEXT:    switch i32 undef, label [[N:%.*]] [
-; CHECK-NEXT:    i32 0, label [[S:%.*]]
+; CHECK-NEXT:      i32 0, label [[S:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       s:
-; CHECK-NEXT:    store i32 [[A1:%.*]], ptr [[F]], align 4, !g !0
+; CHECK-NEXT:    store i32 [[A1:%.*]], ptr [[F]], align 4, !g [[META0]]
 ; CHECK-NEXT:    br label [[H]]
 ; CHECK:       n:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/NewGVN/pr31682.ll b/llvm/test/Transforms/NewGVN/pr31682.ll
index 00a1bf2b84505..3d8c9e28635c6 100644
--- a/llvm/test/Transforms/NewGVN/pr31682.ll
+++ b/llvm/test/Transforms/NewGVN/pr31682.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @bar() {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = load ptr, ptr @global
+; CHECK-NEXT:    [[TMP:%.*]] = load ptr, ptr @global, align 8
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br i1 undef, label [[BB2]], label [[BB7:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/pr31758.ll b/llvm/test/Transforms/NewGVN/pr31758.ll
index 43188430e34c2..274d605802a6b 100644
--- a/llvm/test/Transforms/NewGVN/pr31758.ll
+++ b/llvm/test/Transforms/NewGVN/pr31758.ll
@@ -12,7 +12,7 @@ define void @tinkywinky() {
 ; CHECK:       bb90:
 ; CHECK-NEXT:    br label [[BB90]]
 ; CHECK:       bb138:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[BB138:%.*]]
 ;
 bb:
diff --git a/llvm/test/Transforms/NewGVN/pr32607.ll b/llvm/test/Transforms/NewGVN/pr32607.ll
index 4770724dc2baa..7460ab5ec27e0 100644
--- a/llvm/test/Transforms/NewGVN/pr32607.ll
+++ b/llvm/test/Transforms/NewGVN/pr32607.ll
@@ -7,7 +7,7 @@ define hidden void @foo() {
 ; CHECK:       if:
 ; CHECK-NEXT:    br i1 false, label [[L50:%.*]], label [[IF]]
 ; CHECK:       L50:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    ret void
 ;
 top:
diff --git a/llvm/test/Transforms/NewGVN/pr32836.ll b/llvm/test/Transforms/NewGVN/pr32836.ll
index 5488655b3683d..00f3fb07b2025 100644
--- a/llvm/test/Transforms/NewGVN/pr32836.ll
+++ b/llvm/test/Transforms/NewGVN/pr32836.ll
@@ -5,19 +5,19 @@
 @b = external global %struct.anon
 define void @tinkywinky(i1 %patatino) {
 ; CHECK-LABEL: @tinkywinky(
-; CHECK-NEXT:    store i32 8, ptr null
+; CHECK-NEXT:    store i32 8, ptr null, align 4
 ; CHECK-NEXT:    br i1 [[PATATINO:%.*]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    br label [[L:%.*]]
 ; CHECK:       L:
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr null
-; CHECK-NEXT:    [[BF_LOAD1:%.*]] = load i32, ptr @b
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr null, align 4
+; CHECK-NEXT:    [[BF_LOAD1:%.*]] = load i32, ptr @b, align 4
 ; CHECK-NEXT:    [[BF_VALUE:%.*]] = and i32 [[TMP1]], 536870911
 ; CHECK-NEXT:    [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], -536870912
 ; CHECK-NEXT:    [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_VALUE]]
-; CHECK-NEXT:    store i32 [[BF_SET]], ptr @b
+; CHECK-NEXT:    store i32 [[BF_SET]], ptr @b, align 4
 ; CHECK-NEXT:    br label [[LOR_END:%.*]]
 ; CHECK:       lor.end:
 ; CHECK-NEXT:    br label [[L]]
diff --git a/llvm/test/Transforms/NewGVN/pr32838.ll b/llvm/test/Transforms/NewGVN/pr32838.ll
index 5ba68fa15c24d..87c93d9312e67 100644
--- a/llvm/test/Transforms/NewGVN/pr32838.ll
+++ b/llvm/test/Transforms/NewGVN/pr32838.ll
@@ -10,7 +10,7 @@ define void @fn1(i64 noundef %arg) {
 ; CHECK:       if.then:
 ; CHECK-NEXT:    br i1 false, label [[FIRSTPHIBLOCK:%.*]], label [[TEMP:%.*]]
 ; CHECK:       firstphiblock:
-; CHECK-NEXT:    br i1 undef, label %for.cond17thread-pre-split, label [[SECONDPHIBLOCK:%.*]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_COND17THREAD_PRE_SPLIT:%.*]], label [[SECONDPHIBLOCK:%.*]]
 ; CHECK:       secondphiblock:
 ; CHECK-NEXT:    [[SECONDPHI:%.*]] = phi i64 [ [[THIRDPHI:%.*]], [[THIRDPHIBLOCK:%.*]] ], [ undef, [[FIRSTPHIBLOCK]] ]
 ; CHECK-NEXT:    br i1 undef, label [[FIRSTPHIBLOCK]], label [[THIRDPHIBLOCK]]
@@ -55,7 +55,7 @@ define void @fn2(i64 noundef %arg) {
 ; CHECK-NEXT:    br i1 false, label [[FIRSTPHIBLOCK:%.*]], label [[TEMP:%.*]]
 ; CHECK:       firstphiblock:
 ; CHECK-NEXT:    [[FIRSTPHI:%.*]] = phi i64 [ poison, [[IF_THEN]] ], [ [[SECONDPHI:%.*]], [[SECONDPHIBLOCK:%.*]] ]
-; CHECK-NEXT:    br i1 undef, label %for.cond17thread-pre-split, label [[SECONDPHIBLOCK]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_COND17THREAD_PRE_SPLIT:%.*]], label [[SECONDPHIBLOCK]]
 ; CHECK:       secondphiblock:
 ; CHECK-NEXT:    [[SECONDPHI]] = phi i64 [ [[THIRDPHI:%.*]], [[THIRDPHIBLOCK:%.*]] ], [ [[FIRSTPHI]], [[FIRSTPHIBLOCK]] ]
 ; CHECK-NEXT:    br i1 undef, label [[FIRSTPHIBLOCK]], label [[THIRDPHIBLOCK]]
@@ -65,7 +65,7 @@ define void @fn2(i64 noundef %arg) {
 ; CHECK:       for.cond17thread-pre-split:
 ; CHECK-NEXT:    br label [[COND_TRUE]]
 ; CHECK:       cond.true:
-; CHECK-NEXT:    [[FOURTHPHI:%.*]] = phi i64 [ [[ARG:%.*]], [[ENTRY:%.*]] ], [ [[FIRSTPHI]], %for.cond17thread-pre-split ]
+; CHECK-NEXT:    [[FOURTHPHI:%.*]] = phi i64 [ [[ARG:%.*]], [[ENTRY:%.*]] ], [ [[FIRSTPHI]], [[FOR_COND17THREAD_PRE_SPLIT]] ]
 ; CHECK-NEXT:    [[DIV]] = sdiv i64 [[FOURTHPHI]], 4
 ; CHECK-NEXT:    br label [[THIRDPHIBLOCK]]
 ; CHECK:       temp:
@@ -105,7 +105,7 @@ define void @fn3() {
 ; CHECK-NEXT:    [[F_0:%.*]] = phi ptr [ @b, [[ENTRY:%.*]] ], [ @a, [[L1_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond.loopexit:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[FOR_COND]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    br i1 undef, label [[FOR_END14:%.*]], label [[FOR_COND1_PREHEADER:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/pr32845.ll b/llvm/test/Transforms/NewGVN/pr32845.ll
index d1182a627c59c..29b81b8e1c66a 100644
--- a/llvm/test/Transforms/NewGVN/pr32845.ll
+++ b/llvm/test/Transforms/NewGVN/pr32845.ll
@@ -13,7 +13,7 @@ define void @tinkywinky() {
 ; CHECK-NEXT:    [[F_0:%.*]] = phi ptr [ @b, [[ENTRY:%.*]] ], [ @a, [[L1_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond.loopexit:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[FOR_COND]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    br i1 undef, label [[FOR_END14:%.*]], label [[FOR_COND1_PREHEADER:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/pr32852.ll b/llvm/test/Transforms/NewGVN/pr32852.ll
index 4fd5cf1a95fa7..ad5badd82fdf0 100644
--- a/llvm/test/Transforms/NewGVN/pr32852.ll
+++ b/llvm/test/Transforms/NewGVN/pr32852.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; Make sure GVN doesn't incorrectly think the branch terminating
 ; bb2 has a constant condition.
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
@@ -6,13 +7,26 @@
 @patatino = private unnamed_addr constant [3 x i8] c"0\0A\00"
 
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sge i32 [[TMP]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB2:%.*]], label [[BB7:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[BB5:%.*]], label [[BB7]]
+; CHECK:       bb5:
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 (ptr, ...) @printf(ptr @patatino)
+; CHECK-NEXT:    br label [[BB7]]
+; CHECK:       bb7:
+; CHECK-NEXT:    ret void
+;
 bb:
   %tmp = load i32, ptr @a
   %tmp1 = icmp sge i32 %tmp, 0
   br i1 %tmp1, label %bb2, label %bb7
 bb2:
   %tmp4 = icmp sgt i32 %tmp, 0
-; CHECK: br i1 %tmp4, label %bb5, label %bb7
   br i1 %tmp4, label %bb5, label %bb7
 bb5:
   %tmp6 = call i32 (ptr, ...) @printf(ptr @patatino)
diff --git a/llvm/test/Transforms/NewGVN/pr32897.ll b/llvm/test/Transforms/NewGVN/pr32897.ll
index 35a3b0040bcaa..881c3a8e3ef53 100644
--- a/llvm/test/Transforms/NewGVN/pr32897.ll
+++ b/llvm/test/Transforms/NewGVN/pr32897.ll
@@ -6,7 +6,7 @@ define void @tinkywinky(ptr %b) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[BODY:%.*]]
 ; CHECK:       body:
-; CHECK-NEXT:    store i64 undef, ptr [[B:%.*]]
+; CHECK-NEXT:    store i64 undef, ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    br label [[BODY]]
diff --git a/llvm/test/Transforms/NewGVN/pr32934.ll b/llvm/test/Transforms/NewGVN/pr32934.ll
index fa725f8888e7e..c8218c209face 100644
--- a/llvm/test/Transforms/NewGVN/pr32934.ll
+++ b/llvm/test/Transforms/NewGVN/pr32934.ll
@@ -1,39 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
 
-; CHECK: define void @tinkywinky() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   %d = alloca i32, align 4
-; CHECK-NEXT:   store i32 0, ptr null, align 4
-; CHECK-NEXT:   br label %for.cond
-; CHECK: for.cond:                                         ; preds = %if.end, %entry
-; CHECK-NEXT:   %0 = load i32, ptr null, align 4
-; CHECK-NEXT:   %cmp = icmp slt i32 %0, 1
-; CHECK-NEXT:   br i1 %cmp, label %for.body, label %while.cond
-; CHECK: for.body:                                         ; preds = %for.cond
-; CHECK-NEXT:   %1 = load i32, ptr @a, align 4
-; CHECK-NEXT:   store i32 %1, ptr %d, align 4
-; CHECK-NEXT:   br label %L
-; CHECK: L:                                                ; preds = %if.then, %for.body
-; CHECK-NEXT:   %tobool = icmp ne i32 %1, 0
-; CHECK-NEXT:   br i1 %tobool, label %if.then, label %if.end
-; CHECK: if.then:                                          ; preds = %L
-; CHECK-NEXT:   call void (ptr, ...) @printf(ptr @patatino)
-; CHECK-NEXT:   br label %L
-; CHECK: if.end:                                           ; preds = %L
-; CHECK-NEXT:   br label %for.cond
-; CHECK: while.cond:                                       ; preds = %while.body, %for.cond
-; CHECK-NEXT:   br i1 undef, label %while.body, label %while.end
-; CHECK: while.body:                                       ; preds = %while.cond
-; CHECK-NEXT:   call void (ptr, ...) @printf(ptr @patatino)
-; CHECK-NEXT:   br label %while.cond
-; CHECK: while.end:
-; CHECK-NEXT:   %2 = load i32, ptr @a, align 4
-; CHECK-NEXT:   store i32 %2, ptr undef, align 4
-; CHECK-NEXT:   ret void
 
 @a = external global i32, align 4
 @patatino = external unnamed_addr constant [2 x i8], align 1
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[D:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, ptr null, align 4
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr null, align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[D]], align 4
+; CHECK-NEXT:    br label [[L:%.*]]
+; CHECK:       L:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void (ptr, ...) @printf(ptr @patatino)
+; CHECK-NEXT:    br label [[L]]
+; CHECK:       if.end:
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    br i1 undef, label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    call void (ptr, ...) @printf(ptr @patatino)
+; CHECK-NEXT:    br label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    store i32 [[TMP2]], ptr undef, align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %d = alloca i32, align 4
   store i32 0, ptr null, align 4
diff --git a/llvm/test/Transforms/NewGVN/pr32945.ll b/llvm/test/Transforms/NewGVN/pr32945.ll
index ebf3813c65bd5..7aabe4df05952 100644
--- a/llvm/test/Transforms/NewGVN/pr32945.ll
+++ b/llvm/test/Transforms/NewGVN/pr32945.ll
@@ -1,9 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
-; CHECK-NOT: call i32 @llvm.ssa.copy
 
 @d = external global i32
 @e = external global i32
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:    br i1 true, label [[LOR_LHS_FALSE:%.*]], label [[COND_TRUE:%.*]]
+; CHECK:       lor.lhs.false:
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr @d, align 4
+; CHECK-NEXT:    [[PATATINO:%.*]] = load i32, ptr null, align 4
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[TMP]], [[PATATINO]]
+; CHECK-NEXT:    store i32 [[OR]], ptr @d, align 4
+; CHECK-NEXT:    br label [[COND_TRUE]]
+; CHECK:       cond.true:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @e, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @d, align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE6:%.*]], label [[COND_FALSE:%.*]]
+; CHECK:       cond.true6:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[COND_FALSE]], label [[COND_FALSE]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %lor.lhs.false, label %cond.true
 lor.lhs.false:
   %tmp = load i32, ptr @d, align 4
diff --git a/llvm/test/Transforms/NewGVN/pr32952.ll b/llvm/test/Transforms/NewGVN/pr32952.ll
index 5157bb27e6c59..49e4843294f57 100644
--- a/llvm/test/Transforms/NewGVN/pr32952.ll
+++ b/llvm/test/Transforms/NewGVN/pr32952.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; PR32952: Don't erroneously consider congruent two phi nodes which
 ; have the same arguments but different incoming edges.
 ; RUN: opt -passes=newgvn -S %s | FileCheck %s
@@ -6,6 +7,31 @@
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 define i32 @tinkywinky() {
+; CHECK-LABEL: define i32 @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @a, align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[NEG:%.*]] = xor i32 [[CONV]], -1
+; CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[NEG]] to i16
+; CHECK-NEXT:    [[CONV3:%.*]] = zext i16 [[CONV1]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CONV]], [[CONV3]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[TINKY:%.*]], label [[WINKY:%.*]]
+; CHECK:       tinky:
+; CHECK-NEXT:    store i16 2, ptr @a, align 2
+; CHECK-NEXT:    br label [[PATATINO:%.*]]
+; CHECK:       winky:
+; CHECK-NEXT:    br label [[PATATINO]]
+; CHECK:       patatino:
+; CHECK-NEXT:    [[MEH:%.*]] = phi i16 [ [[TMP0]], [[WINKY]] ], [ [[CONV1]], [[TINKY]] ]
+; CHECK-NEXT:    [[BANANA:%.*]] = phi i16 [ [[TMP0]], [[TINKY]] ], [ [[CONV1]], [[WINKY]] ]
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i16 [[BANANA]] to i32
+; CHECK-NEXT:    [[OTHER:%.*]] = zext i16 [[MEH]] to i32
+; CHECK-NEXT:    [[FIRST:%.*]] = tail call i32 (ptr, ...) @printf(ptr @.str, i32 [[PROMOTED]])
+; CHECK-NEXT:    [[SECOND:%.*]] = tail call i32 (ptr, ...) @printf(ptr @.str, i32 [[OTHER]])
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %0 = load i16, ptr @a, align 2
   %conv = sext i16 %0 to i32
@@ -23,15 +49,11 @@ winky:
   br label %patatino
 
 patatino:
-; CHECK: %meh = phi i16 [ %0, %winky ], [ %conv1, %tinky ]
-; CHECK: %banana = phi i16 [ %0, %tinky ], [ %conv1, %winky ]
   %meh = phi i16 [ %0, %winky ], [ %conv1, %tinky ]
   %banana = phi i16 [ %0, %tinky ], [ %conv1, %winky ]
   br label %end
 
 end:
-; CHECK: %promoted = zext i16 %banana to i32
-; CHECK: %other = zext i16 %meh to i32
   %promoted = zext i16 %banana to i32
   %other = zext i16 %meh to i32
   %first = tail call i32 (ptr, ...) @printf(ptr @.str, i32 %promoted)
diff --git a/llvm/test/Transforms/NewGVN/pr33014.ll b/llvm/test/Transforms/NewGVN/pr33014.ll
index f6e919770d6c6..04f9df2704a6c 100644
--- a/llvm/test/Transforms/NewGVN/pr33014.ll
+++ b/llvm/test/Transforms/NewGVN/pr33014.ll
@@ -1,33 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; Make sure we don't end up in an infinite recursion in singleReachablePHIPath().
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 @c = external global i64, align 8
 
-; CHECK-LABEL: define void @tinkywinky() {
-; CHECK: entry:
-; CHECK-NEXT:   br i1 undef, label %l2, label %if.then
-; CHECK: if.then:                                          ; preds = %entry
-; CHECK-NEXT:   br label %for.body
-; CHECK: ph:                                               ; preds = %back, %ontrue
-; CHECK-NEXT:   br label %for.body
-; CHECK: for.body:                                         ; preds = %ph, %if.then
-; CHECK-NEXT:   br i1 undef, label %ontrue, label %onfalse
-; CHECK: onfalse:                                          ; preds = %for.body
-; CHECK-NEXT:   %patatino = load i64, ptr @c
-; CHECK-NEXT:   ret void
-; CHECK: ontrue:                                           ; preds = %for.body
-; CHECK-NEXT:   %dipsy = load i64, ptr @c
-; CHECK-NEXT:   br label %ph
-; CHECK: back:                                             ; preds = %l2
-; CHECK-NEXT:   store i8 poison, ptr null
-; CHECK-NEXT:   br label %ph
-; CHECK: end:                                              ; preds = %l2
-; CHECK-NEXT:   ret void
-; CHECK: l2:                                               ; preds = %entry
-; CHECK-NEXT:   br i1 false, label %back, label %end
-; CHECK-NEXT: }
 
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[L2:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       ph:
+; CHECK-NEXT:    br label [[FOR_BODY]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br i1 undef, label [[ONTRUE:%.*]], label [[ONFALSE:%.*]]
+; CHECK:       onfalse:
+; CHECK-NEXT:    [[PATATINO:%.*]] = load i64, ptr @c, align 4
+; CHECK-NEXT:    ret void
+; CHECK:       ontrue:
+; CHECK-NEXT:    [[DIPSY:%.*]] = load i64, ptr @c, align 4
+; CHECK-NEXT:    br label [[PH:%.*]]
+; CHECK:       back:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[PH]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+; CHECK:       l2:
+; CHECK-NEXT:    br i1 false, label [[BACK:%.*]], label [[END:%.*]]
+;
 entry:
   br i1 undef, label %l2, label %if.then
 if.then:
diff --git a/llvm/test/Transforms/NewGVN/pr33086.ll b/llvm/test/Transforms/NewGVN/pr33086.ll
index 54802cdbd9258..ab6c00dd1777f 100644
--- a/llvm/test/Transforms/NewGVN/pr33086.ll
+++ b/llvm/test/Transforms/NewGVN/pr33086.ll
@@ -1,31 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S %s | FileCheck %s
 ; REQUIRES: asserts
 
-; CHECK-LABEL: define void @tinkywinky() {
-; CHECK: entry:
-; CHECK-NEXT:   br i1 undef, label %for.cond18, label %for.cond.preheader
-; CHECK: for.cond.preheader:
-; CHECK-NEXT:   br label %for.cond2thread-pre-split
-; CHECK: for.cond2thread-pre-split:
-; CHECK-NEXT:   %conv24 = phi i32 [ 0, %for.cond.preheader ], [ %conv, %for.inc.split ]
-; CHECK-NEXT:   br label %for.inc.split
-; CHECK: for.inc.split:
-; CHECK-NEXT:   %add = shl nsw i32 %conv24, 16
-; CHECK-NEXT:   %sext23 = add i32 %add, 65536
-; CHECK-NEXT:   %conv = ashr exact i32 %sext23, 16
-; CHECK-NEXT:   %cmp = icmp slt i32 %sext23, 3604480
-; CHECK-NEXT:   br i1 %cmp, label %for.cond2thread-pre-split, label %l1.loopexit
-; CHECK: l1.loopexit:
-; CHECK-NEXT:   br label %l1
-; CHECK: l1:
-; CHECK-NEXT:   %0 = load i16, ptr null, align 2
-; CHECK-NEXT:   %g.0.g.0..pr = load i16, ptr null, align 2
-; CHECK-NEXT:   ret void
-; CHECK: for.cond18:
-; CHECK-NEXT:   br label %l1
-; CHECK-NEXT: }
 
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[FOR_COND18:%.*]], label [[FOR_COND_PREHEADER:%.*]]
+; CHECK:       for.cond.preheader:
+; CHECK-NEXT:    br label [[FOR_COND2THREAD_PRE_SPLIT:%.*]]
+; CHECK:       for.cond2thread-pre-split:
+; CHECK-NEXT:    [[CONV24:%.*]] = phi i32 [ 0, [[FOR_COND_PREHEADER]] ], [ [[CONV:%.*]], [[FOR_INC_SPLIT:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_INC_SPLIT]]
+; CHECK:       for.inc.split:
+; CHECK-NEXT:    [[ADD:%.*]] = shl nsw i32 [[CONV24]], 16
+; CHECK-NEXT:    [[SEXT23:%.*]] = add i32 [[ADD]], 65536
+; CHECK-NEXT:    [[CONV]] = ashr exact i32 [[SEXT23]], 16
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[SEXT23]], 3604480
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND2THREAD_PRE_SPLIT]], label [[L1_LOOPEXIT:%.*]]
+; CHECK:       l1.loopexit:
+; CHECK-NEXT:    br label [[L1:%.*]]
+; CHECK:       l1:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr null, align 2
+; CHECK-NEXT:    [[G_0_G_0__PR:%.*]] = load i16, ptr null, align 2
+; CHECK-NEXT:    ret void
+; CHECK:       for.cond18:
+; CHECK-NEXT:    br label [[L1]]
+;
 entry:
   br i1 undef, label %for.cond18, label %for.cond.preheader
 
diff --git a/llvm/test/Transforms/NewGVN/pr33116.ll b/llvm/test/Transforms/NewGVN/pr33116.ll
index f5ef3ae7575ee..6609ef9e72dac 100644
--- a/llvm/test/Transforms/NewGVN/pr33116.ll
+++ b/llvm/test/Transforms/NewGVN/pr33116.ll
@@ -13,7 +13,7 @@ define void @b() {
 ; CHECK:       c:
 ; CHECK-NEXT:    br i1 undef, label [[IF_G:%.*]], label [[IF_E]]
 ; CHECK:       if.g:
-; CHECK-NEXT:    store i32 undef, ptr @a
+; CHECK-NEXT:    store i32 undef, ptr @a, align 4
 ; CHECK-NEXT:    br label [[WHILE_D]]
 ; CHECK:       if.e:
 ; CHECK-NEXT:    br label [[F]]
diff --git a/llvm/test/Transforms/NewGVN/pr33187.ll b/llvm/test/Transforms/NewGVN/pr33187.ll
index e5c3da2fb05a9..37668bba3d5b6 100644
--- a/llvm/test/Transforms/NewGVN/pr33187.ll
+++ b/llvm/test/Transforms/NewGVN/pr33187.ll
@@ -30,7 +30,7 @@ define void @fn1() local_unnamed_addr #0 {
 ; CHECK:       while.body12:
 ; CHECK-NEXT:    br i1 undef, label [[IF_END18]], label [[L]]
 ; CHECK:       L.loopexit:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[L]]
 ; CHECK:       L:
 ; CHECK-NEXT:    [[H_125]] = phi i32 [ [[H_127]], [[WHILE_BODY12]] ], [ poison, [[L_LOOPEXIT]] ]
@@ -114,13 +114,13 @@ attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="fals
 define void @a() {
 ; CHECK-LABEL: @a(
 ; CHECK-NEXT:  b:
-; CHECK-NEXT:    store ptr null, ptr null
+; CHECK-NEXT:    store ptr null, ptr null, align 8
 ; CHECK-NEXT:    br label [[D:%.*]]
 ; CHECK:       d:
 ; CHECK-NEXT:    [[I:%.*]] = phi ptr [ null, [[B:%.*]] ], [ [[E:%.*]], [[F:%.*]] ]
 ; CHECK-NEXT:    br i1 undef, label [[F]], label [[G:%.*]]
 ; CHECK:       g:
-; CHECK-NEXT:    store ptr [[I]], ptr null
+; CHECK-NEXT:    store ptr [[I]], ptr null, align 8
 ; CHECK-NEXT:    unreachable
 ; CHECK:       f:
 ; CHECK-NEXT:    [[E]] = getelementptr i8, ptr [[I]], i64 1
diff --git a/llvm/test/Transforms/NewGVN/pr33196.ll b/llvm/test/Transforms/NewGVN/pr33196.ll
index c312d5ec97930..c04b895fa47e9 100644
--- a/llvm/test/Transforms/NewGVN/pr33196.ll
+++ b/llvm/test/Transforms/NewGVN/pr33196.ll
@@ -1,33 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
 
-; CHECK: define i32 @main() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   %tmp = load i32, ptr @d, align 4
-; CHECK-NEXT:   %tmp1 = load i32, ptr @c, align 4
-; CHECK-NEXT:   %tobool = icmp eq i32 %tmp1, -1
-; CHECK-NEXT:   br i1 %tobool, label %if.end, label %if.then
-; CHECK: if.then:
-; CHECK-NEXT:   br label %L
-; CHECK: L:
-; CHECK-NEXT:   %e.0 = phi i32 [ 0, %if.then ], [ %e.1, %if.then4 ]
-; CHECK-NEXT:   br label %if.end
-; CHECK: if.end:
-; CHECK-NEXT:   %e.1 = phi i32 [ %e.0, %L ], [ %tmp, %entry ]
-; CHECK-NEXT:   store i32 %e.1, ptr @a, align 4
-; CHECK-NEXT:   %tmp2 = load i32, ptr @b, align 4
-; CHECK-NEXT:   store i32 0, ptr @b, align 4
-; CHECK-NEXT:   %sext = shl i32 %tmp2, 16
-; CHECK-NEXT:   %conv1 = ashr exact i32 %sext, 16
-; CHECK-NEXT:   %add = add nsw i32 %conv1, %tmp1
-; CHECK-NEXT:   %add2 = add nsw i32 %add, %e.1
-; CHECK-NEXT:   store i32 %add2, ptr @a, align 4
-; CHECK-NEXT:   %tobool3 = icmp eq i32 %add2, 0
-; CHECK-NEXT:   br i1 %tobool3, label %if.end5, label %if.then4
-; CHECK: if.then4:
-; CHECK-NEXT:   br label %L
-; CHECK: if.end5:
-; CHECK-NEXT:   ret i32 0
-; CHECK-NEXT: }
 
 @d = global i32 1, align 4
 @c = common global i32 0, align 4
@@ -35,6 +8,34 @@
 @b = common global i32 0, align 4
 
 define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr @d, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @c, align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP1]], -1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[L:%.*]]
+; CHECK:       L:
+; CHECK-NEXT:    [[E_0:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[E_1:%.*]], [[IF_THEN4:%.*]] ]
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[E_1]] = phi i32 [ [[E_0]], [[L]] ], [ [[TMP]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    store i32 [[E_1]], ptr @a, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @b, align 4
+; CHECK-NEXT:    store i32 0, ptr @b, align 4
+; CHECK-NEXT:    [[SEXT:%.*]] = shl i32 [[TMP2]], 16
+; CHECK-NEXT:    [[CONV1:%.*]] = ashr exact i32 [[SEXT]], 16
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV1]], [[TMP1]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[ADD]], [[E_1]]
+; CHECK-NEXT:    store i32 [[ADD2]], ptr @a, align 4
+; CHECK-NEXT:    [[TOBOOL3:%.*]] = icmp eq i32 [[ADD2]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL3]], label [[IF_END5:%.*]], label [[IF_THEN4]]
+; CHECK:       if.then4:
+; CHECK-NEXT:    br label [[L]]
+; CHECK:       if.end5:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %tmp = load i32, ptr @d, align 4
   %tmp1 = load i32, ptr @c, align 4
diff --git a/llvm/test/Transforms/NewGVN/pr33204.ll b/llvm/test/Transforms/NewGVN/pr33204.ll
index 99c48241a75c6..482e35e7fdb11 100644
--- a/llvm/test/Transforms/NewGVN/pr33204.ll
+++ b/llvm/test/Transforms/NewGVN/pr33204.ll
@@ -20,10 +20,10 @@ define void @hoge(i32 %arg) {
 ; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ 0, [[BB1:%.*]] ], [ [[ARG:%.*]], [[BB:%.*]] ]
 ; CHECK-NEXT:    br label [[BB6:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @global, align 4, !h !0
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @global, align 4, !h [[META0:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       bb6:
-; CHECK-NEXT:    store i32 [[TMP]], ptr @global.1, align 4, !h !0
+; CHECK-NEXT:    store i32 [[TMP]], ptr @global.1, align 4, !h [[META0]]
 ; CHECK-NEXT:    br i1 undef, label [[BB7:%.*]], label [[BB1]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    br i1 undef, label [[BB10:%.*]], label [[BB8:%.*]]
@@ -33,7 +33,7 @@ define void @hoge(i32 %arg) {
 ; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb10:
-; CHECK-NEXT:    store i32 0, ptr @global, align 4, !h !0
+; CHECK-NEXT:    store i32 0, ptr @global, align 4, !h [[META0]]
 ; CHECK-NEXT:    br label [[BB7]]
 ;
 bb:
diff --git a/llvm/test/Transforms/NewGVN/pr33305.ll b/llvm/test/Transforms/NewGVN/pr33305.ll
index f87cf08e2abf7..3a19f610defcd 100644
--- a/llvm/test/Transforms/NewGVN/pr33305.ll
+++ b/llvm/test/Transforms/NewGVN/pr33305.ll
@@ -19,14 +19,14 @@ target triple = "x86_64-apple-macosx10.12.0"
 define i32 @main() local_unnamed_addr #0 {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa !3
+; CHECK-NEXT:    [[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa [[TBAA3:![0-9]+]]
 ; CHECK-NEXT:    [[CMP13_I:%.*]] = icmp slt i32 [[DOTPR_I]], 1
 ; CHECK-NEXT:    br i1 [[CMP13_I]], label [[FOR_COND1_PREHEADER_LR_PH_I:%.*]], label [[ENTRY_FOR_END9_I_CRIT_EDGE:%.*]]
 ; CHECK:       entry.for.end9.i_crit_edge:
-; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, ptr @h, align 4, !tbaa !3
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, ptr @h, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[FOR_END9_I:%.*]]
 ; CHECK:       for.cond1.preheader.lr.ph.i:
-; CHECK-NEXT:    [[G_PROMOTED14_I:%.*]] = load i32, ptr @g, align 4, !tbaa !3
+; CHECK-NEXT:    [[G_PROMOTED14_I:%.*]] = load i32, ptr @g, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_I:%.*]]
 ; CHECK:       for.cond1.preheader.i:
 ; CHECK-NEXT:    [[INC816_I:%.*]] = phi i32 [ [[DOTPR_I]], [[FOR_COND1_PREHEADER_LR_PH_I]] ], [ [[INC8_I:%.*]], [[FOR_INC7_I:%.*]] ]
@@ -42,9 +42,9 @@ define i32 @main() local_unnamed_addr #0 {
 ; CHECK:       lor.rhs.i:
 ; CHECK-NEXT:    [[LNOT_I:%.*]] = xor i1 [[TOBOOL_I]], true
 ; CHECK-NEXT:    [[LNOT_EXT_I:%.*]] = zext i1 [[LNOT_I]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr @e, align 4, !tbaa !3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr @e, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[TMP3]], [[LNOT_EXT_I]]
-; CHECK-NEXT:    store i32 [[XOR_I]], ptr @e, align 4, !tbaa !3
+; CHECK-NEXT:    store i32 [[XOR_I]], ptr @e, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[LOR_END_I]]
 ; CHECK:       lor.end.i:
 ; CHECK-NEXT:    [[INC_I]] = add nuw nsw i32 [[INC12_I]], 1
@@ -55,28 +55,28 @@ define i32 @main() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[INC816_I]], 0
 ; CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_COND1_PREHEADER_I]], label [[FOR_COND_FOR_END9_CRIT_EDGE_I:%.*]]
 ; CHECK:       for.cond.for.end9_crit_edge.i:
-; CHECK-NEXT:    store i32 0, ptr @g, align 4, !tbaa !3
-; CHECK-NEXT:    store i32 2, ptr @h, align 4, !tbaa !3
-; CHECK-NEXT:    store i32 [[INC8_I]], ptr @c, align 4, !tbaa !3
+; CHECK-NEXT:    store i32 0, ptr @g, align 4, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store i32 2, ptr @h, align 4, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store i32 [[INC8_I]], ptr @c, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[FOR_END9_I]]
 ; CHECK:       for.end9.i:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[DOTPRE]], [[ENTRY_FOR_END9_I_CRIT_EDGE]] ], [ 2, [[FOR_COND_FOR_END9_CRIT_EDGE_I]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr @b, align 8, !tbaa !7
-; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4, !tbaa !3
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr @e, align 4, !tbaa !3
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr @b, align 8, !tbaa [[TBAA7:![0-9]+]]
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr @e, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[CMP10_I:%.*]] = icmp slt i32 [[TMP6]], -1
 ; CHECK-NEXT:    br i1 [[CMP10_I]], label [[IF_THEN_I:%.*]], label [[FN1_EXIT:%.*]]
 ; CHECK:       if.then.i:
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr @f, align 4, !tbaa !3
-; CHECK-NEXT:    store i32 [[TMP7]], ptr [[TMP5]], align 4, !tbaa !3
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr @f, align 4, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store i32 [[TMP7]], ptr [[TMP5]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[FN1_EXIT]]
 ; CHECK:       fn1.exit:
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr @a, align 4, !tbaa !3
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr @a, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP8]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[PUTS2:%.*]] = tail call i32 @puts(ptr @str.2)
-; CHECK-NEXT:    tail call void @abort()
+; CHECK-NEXT:    tail call void @abort() #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[PUTS:%.*]] = tail call i32 @puts(ptr @str)
diff --git a/llvm/test/Transforms/NewGVN/pr33367.ll b/llvm/test/Transforms/NewGVN/pr33367.ll
index dc5d190291e1c..597caa2b34ef2 100644
--- a/llvm/test/Transforms/NewGVN/pr33367.ll
+++ b/llvm/test/Transforms/NewGVN/pr33367.ll
@@ -11,19 +11,19 @@ define %MNR_struct @f000316011717_2(ptr %pDS, ptr %pCG) #2 {
 ; CHECK-NEXT:  Entry:
 ; CHECK-NEXT:    [[RESTART:%.*]] = alloca [[MNR_STRUCT:%.*]], align 8
 ; CHECK-NEXT:    [[PCARRY:%.*]] = getelementptr [[DS_STRUCT:%.*]], ptr [[PDS:%.*]], i32 0, i32 1
-; CHECK-NEXT:    [[BASE:%.*]] = load ptr, ptr [[PDS]], align 8, !tbaa !14
+; CHECK-NEXT:    [[BASE:%.*]] = load ptr, ptr [[PDS]], align 8, !tbaa [[TBAA14:![0-9]+]]
 ; CHECK-NEXT:    [[ABSADDR:%.*]] = getelementptr i64, ptr [[BASE]], i64 9
-; CHECK-NEXT:    [[EXTARGET:%.*]] = load i64, ptr [[ABSADDR]], align 8, !tbaa !4
+; CHECK-NEXT:    [[EXTARGET:%.*]] = load i64, ptr [[ABSADDR]], align 8, !tbaa [[TBAA4:![0-9]+]]
 ; CHECK-NEXT:    [[TEMPLATE:%.*]] = icmp eq i64 [[EXTARGET]], 8593987412
 ; CHECK-NEXT:    br i1 [[TEMPLATE]], label %"BB3.000316011731#1", label [[BB2_000316011731_5:%.*]]
 ; CHECK:       "BB3.000316011731#1":
 ; CHECK-NEXT:    [[PBASE8:%.*]] = getelementptr [32 x ptr], ptr [[PDS]], i64 0, i64 29
-; CHECK-NEXT:    [[BASE9:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa !14
+; CHECK-NEXT:    [[BASE9:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa [[TBAA14]]
 ; CHECK-NEXT:    [[ABSADDR1:%.*]] = getelementptr i64, ptr [[BASE9]], i64 7
-; CHECK-NEXT:    [[RMEM:%.*]] = load i64, ptr [[ABSADDR1]], align 8, !tbaa !4
+; CHECK-NEXT:    [[RMEM:%.*]] = load i64, ptr [[ABSADDR1]], align 8, !tbaa [[TBAA4]]
 ; CHECK-NEXT:    [[PWT:%.*]] = getelementptr [[DS_STRUCT]], ptr [[PDS]], i32 0, i32 2
 ; CHECK-NEXT:    [[PWTE:%.*]] = getelementptr [32 x i16], ptr [[PWT]], i64 0, i64 8593987412
-; CHECK-NEXT:    [[SHIFTS:%.*]] = load i16, ptr [[PWTE]], align 2, !tbaa !18, !invariant.load !20
+; CHECK-NEXT:    [[SHIFTS:%.*]] = load i16, ptr [[PWTE]], align 2, !tbaa [[TBAA18:![0-9]+]], !invariant.load [[META20:![0-9]+]]
 ; CHECK-NEXT:    [[SLOWJ:%.*]] = icmp eq i16 [[SHIFTS]], 0
 ; CHECK-NEXT:    br i1 [[SLOWJ]], label [[BB2_000316011731_5]], label %"BB3.000316011731#1.1"
 ; CHECK:       BB2.000316011731.5:
@@ -34,22 +34,22 @@ define %MNR_struct @f000316011717_2(ptr %pDS, ptr %pCG) #2 {
 ; CHECK-NEXT:    [[SHIFTS1:%.*]] = zext i16 [[SHIFTS]] to i64
 ; CHECK-NEXT:    [[VAL:%.*]] = call i64 @llvm.x86.bmi.bextr.64(i64 [[RMEM]], i64 [[SHIFTS1]])
 ; CHECK-NEXT:    [[PREG:%.*]] = getelementptr [64 x i64], ptr [[PCG:%.*]], i64 0, i64 12
-; CHECK-NEXT:    store i64 [[VAL]], ptr [[PREG]], align 32, !tbaa !10
+; CHECK-NEXT:    store i64 [[VAL]], ptr [[PREG]], align 32, !tbaa [[TBAA10:![0-9]+]]
 ; CHECK-NEXT:    [[PREG2:%.*]] = getelementptr [64 x i64], ptr [[PCG]], i64 0, i64 14
-; CHECK-NEXT:    [[REG:%.*]] = load i64, ptr [[PREG2]], align 16, !tbaa !12
-; CHECK-NEXT:    [[BASE2:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa !14
+; CHECK-NEXT:    [[REG:%.*]] = load i64, ptr [[PREG2]], align 16, !tbaa [[TBAA12:![0-9]+]]
+; CHECK-NEXT:    [[BASE2:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa [[TBAA14]]
 ; CHECK-NEXT:    [[ABSADDR2:%.*]] = getelementptr i64, ptr [[BASE2]], i64 [[REG]]
-; CHECK-NEXT:    [[RMEM2:%.*]] = load i64, ptr [[ABSADDR2]], align 8, !tbaa !1
+; CHECK-NEXT:    [[RMEM2:%.*]] = load i64, ptr [[ABSADDR2]], align 8, !tbaa [[TBAA1:![0-9]+]]
 ; CHECK-NEXT:    [[PREG7:%.*]] = getelementptr [64 x i64], ptr [[PCG]], i64 0, i64 9
-; CHECK-NEXT:    store i64 [[RMEM2]], ptr [[PREG7]], align 8, !tbaa !8
+; CHECK-NEXT:    store i64 [[RMEM2]], ptr [[PREG7]], align 8, !tbaa [[TBAA8:![0-9]+]]
 ; CHECK-NEXT:    [[ADD2C279:%.*]] = add i64 [[RMEM2]], [[VAL]]
 ; CHECK-NEXT:    [[CCHK:%.*]] = icmp sge i64 [[ADD2C279]], 0
 ; CHECK-NEXT:    [[CFL:%.*]] = zext i1 [[CCHK]] to i8
-; CHECK-NEXT:    store i8 [[CFL]], ptr [[PCARRY]], align 1, !tbaa !16
+; CHECK-NEXT:    store i8 [[CFL]], ptr [[PCARRY]], align 1, !tbaa [[TBAA16:![0-9]+]]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       Exit:
 ; CHECK-NEXT:    [[RESTART378:%.*]] = load [[MNR_STRUCT]], ptr [[RESTART]], align 8
-; CHECK-NEXT:    ret [[MNR_STRUCT]] %restart378
+; CHECK-NEXT:    ret [[MNR_STRUCT]] [[RESTART378]]
 ;
 Entry:
   %restart = alloca %MNR_struct
diff --git a/llvm/test/Transforms/NewGVN/pr34452.ll b/llvm/test/Transforms/NewGVN/pr34452.ll
index f5d5fda343037..9e65349a1b47b 100644
--- a/llvm/test/Transforms/NewGVN/pr34452.ll
+++ b/llvm/test/Transforms/NewGVN/pr34452.ll
@@ -9,7 +9,7 @@ source_filename = "bugpoint-output-09f7a24.bc"
 define void @sgrep() local_unnamed_addr #0 {
 ; CHECK-LABEL: @sgrep(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa [[TBAA1:![0-9]+]]
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[TOBOOL]], i32 2048, i32 2047
 ; CHECK-NEXT:    br label [[WHILE_BODY_US:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/pr42422-phi-of-ops.ll b/llvm/test/Transforms/NewGVN/pr42422-phi-of-ops.ll
index cbdf209d1ce50..1312f9f4f0296 100644
--- a/llvm/test/Transforms/NewGVN/pr42422-phi-of-ops.ll
+++ b/llvm/test/Transforms/NewGVN/pr42422-phi-of-ops.ll
@@ -40,8 +40,8 @@ define void @d() {
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    [[CLEANUP_DEST:%.*]] = phi i32 [ poison, [[IF_END12]] ], [ 1, [[IF_THEN11]] ], [ 9, [[IF_THEN]] ]
 ; CHECK-NEXT:    switch i32 [[CLEANUP_DEST]], label [[CLEANUP14]] [
-; CHECK-NEXT:    i32 0, label [[FOR_COND4]]
-; CHECK-NEXT:    i32 9, label [[FOR_END13:%.*]]
+; CHECK-NEXT:      i32 0, label [[FOR_COND4]]
+; CHECK-NEXT:      i32 9, label [[FOR_END13:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       for.end13:
 ; CHECK-NEXT:    br label [[CLEANUP14]]
diff --git a/llvm/test/Transforms/NewGVN/pr43441.ll b/llvm/test/Transforms/NewGVN/pr43441.ll
index 5c4a9c3bc7b5d..a5f711dbd69e5 100644
--- a/llvm/test/Transforms/NewGVN/pr43441.ll
+++ b/llvm/test/Transforms/NewGVN/pr43441.ll
@@ -1,15 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK-LABEL: @print_long_format()
 define dso_local void @print_long_format() #0 {
+; CHECK-LABEL: define dso_local void @print_long_format(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 undef, label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 1, label [[SW_BB:%.*]]
+; CHECK-NEXT:      i32 0, label [[SW_BB19:%.*]]
+; CHECK-NEXT:      i32 2, label [[SW_BB23:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       sw.bb:
+; CHECK-NEXT:    unreachable
+; CHECK:       sw.bb19:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN37:%.*]], label [[IF_END50:%.*]]
+; CHECK:       sw.bb23:
+; CHECK-NEXT:    unreachable
+; CHECK:       sw.default:
+; CHECK-NEXT:    unreachable
+; CHECK:       if.then37:
+; CHECK-NEXT:    unreachable
+; CHECK:       if.end50:
+; CHECK-NEXT:    [[CALL180:%.*]] = call i32 @timespec_cmp() #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
 entry:
   switch i32 undef, label %sw.default [
-    i32 1, label %sw.bb
-    i32 0, label %sw.bb19
-    i32 2, label %sw.bb23
+  i32 1, label %sw.bb
+  i32 0, label %sw.bb19
+  i32 2, label %sw.bb23
   ]
 
 sw.bb:                                            ; preds = %entry
diff --git a/llvm/test/Transforms/NewGVN/pre-compare.ll b/llvm/test/Transforms/NewGVN/pre-compare.ll
index 9fd20fcc76ed6..8e4e5f813e89a 100644
--- a/llvm/test/Transforms/NewGVN/pre-compare.ll
+++ b/llvm/test/Transforms/NewGVN/pre-compare.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 ; C source:
@@ -37,6 +38,28 @@
 @.str3 = private unnamed_addr constant [12 x i8] c"step 2: %d\0A\00", align 1
 
 define void @f(i32 %x) noreturn nounwind uwtable ssp {
+; CHECK-LABEL: define void @f(
+; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_PREHEADER:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[X]], 2
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP1]], ptr @.str, ptr @.str1
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @puts(ptr [[COND]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    br label [[FOR_COND_PREHEADER]]
+; CHECK:       for.cond.preheader:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[X]], 2
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @puts(ptr @.str2) #[[ATTR1]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_COND_BACKEDGE:%.*]], label [[IF_END5:%.*]]
+; CHECK:       if.end5:
+; CHECK-NEXT:    [[CALL6:%.*]] = tail call i32 (ptr, ...) @printf(ptr @.str3, i32 [[X]]) #[[ATTR1]]
+; CHECK-NEXT:    br label [[FOR_COND_BACKEDGE]]
+; CHECK:       for.cond.backedge:
+; CHECK-NEXT:    br label [[FOR_COND]]
+;
 entry:
   %cmp = icmp eq i32 %x, 1
   br i1 %cmp, label %for.cond.preheader, label %if.then
diff --git a/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll b/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll
index 1ca24af84e48a..a63ca131b5c0d 100644
--- a/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll
+++ b/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll
@@ -9,7 +9,7 @@ declare void @use(i32)
 define i32 @test(ptr %p1, ptr %p2, i1 %c) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[P1:%.*]], align 8, !tbaa !0
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[P1:%.*]], align 8, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    [[CMP_1:%.*]] = icmp slt i32 [[LV]], 1
 ; CHECK-NEXT:    br i1 [[CMP_1]], label [[EXIT:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if.false:
diff --git a/llvm/test/Transforms/NewGVN/readattrs.ll b/llvm/test/Transforms/NewGVN/readattrs.ll
index 049a2fc911687..544fe45be2497 100644
--- a/llvm/test/Transforms/NewGVN/readattrs.ll
+++ b/llvm/test/Transforms/NewGVN/readattrs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S -o - < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -6,12 +7,15 @@ target triple = "x86_64-unknown-linux-gnu"
 declare void @use(ptr readonly nocapture)
 
 define i8 @test() {
+; CHECK-LABEL: define i8 @test() {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 1, ptr [[A]], align 1
+; CHECK-NEXT:    call void @use(ptr [[A]])
+; CHECK-NEXT:    ret i8 1
+;
   %a = alloca i8
   store i8 1, ptr %a
   call void @use(ptr %a)
   %b = load i8, ptr %a
   ret i8 %b
-; CHECK-LABEL: define i8 @test(
-; CHECK: call void @use(ptr %a)
-; CHECK-NEXT: ret i8 1
 }
diff --git a/llvm/test/Transforms/NewGVN/rle-nonlocal.ll b/llvm/test/Transforms/NewGVN/rle-nonlocal.ll
index c2fb39100da45..efdfd1f6dc7b5 100644
--- a/llvm/test/Transforms/NewGVN/rle-nonlocal.ll
+++ b/llvm/test/Transforms/NewGVN/rle-nonlocal.ll
@@ -7,14 +7,14 @@ define i32 @main(ptr %p, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[BLOCK2:%.*]], label [[BLOCK3:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    [[A:%.*]] = load ptr, ptr [[P:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = load ptr, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    br label [[BLOCK4:%.*]]
 ; CHECK:       block3:
-; CHECK-NEXT:    [[B:%.*]] = load ptr, ptr [[P]]
+; CHECK-NEXT:    [[B:%.*]] = load ptr, ptr [[P]], align 8
 ; CHECK-NEXT:    br label [[BLOCK4]]
 ; CHECK:       block4:
 ; CHECK-NEXT:    [[EXISTINGPHI:%.*]] = phi ptr [ [[A]], [[BLOCK2]] ], [ [[B]], [[BLOCK3]] ]
-; CHECK-NEXT:    [[C:%.*]] = load i32, ptr [[EXISTINGPHI]]
+; CHECK-NEXT:    [[C:%.*]] = load i32, ptr [[EXISTINGPHI]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = add i32 [[C]], [[C]]
 ; CHECK-NEXT:    ret i32 [[E]]
 ;
diff --git a/llvm/test/Transforms/NewGVN/rle.ll b/llvm/test/Transforms/NewGVN/rle.ll
index 1cfdc83f40da0..950c492d87365 100644
--- a/llvm/test/Transforms/NewGVN/rle.ll
+++ b/llvm/test/Transforms/NewGVN/rle.ll
@@ -1,15 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -passes=newgvn,dce -S | FileCheck %s
 ; RUN: opt < %s -data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -passes=newgvn,dce -S | FileCheck %s
 ; memset -> i16 forwarding.
 define signext i16 @memset_to_i16_local(ptr %A) nounwind ssp {
+; CHECK-LABEL: define signext i16 @memset_to_i16_local(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr [[A]], i8 1, i64 200, i1 false)
+; CHECK-NEXT:    ret i16 257
+;
 entry:
   tail call void @llvm.memset.p0.i64(ptr %A, i8 1, i64 200, i1 false)
   %arrayidx = getelementptr inbounds i16, ptr %A, i64 42
   %tmp2 = load i16, ptr %arrayidx
   ret i16 %tmp2
-; CHECK-LABEL: @memset_to_i16_local(
-; CHECK-NOT: load
-; CHECK: ret i16 257
 }
 
 @GCst = constant {i32, float, i32 } { i32 42, float 14., i32 97 }
@@ -17,37 +21,48 @@ entry:
 
 ; memset -> float forwarding.
 define float @memcpy_to_float_local(ptr %A) nounwind ssp {
+; CHECK-LABEL: define float @memcpy_to_float_local(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[A]], ptr @GCst, i64 12, i1 false)
+; CHECK-NEXT:    ret float 1.400000e+01
+;
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %A, ptr @GCst, i64 12, i1 false)
   %arrayidx = getelementptr inbounds float, ptr %A, i64 1 ; <ptr> [#uses=1]
   %tmp2 = load float, ptr %arrayidx                   ; <float> [#uses=1]
   ret float %tmp2
-; CHECK-LABEL: @memcpy_to_float_local(
-; CHECK-NOT: load
-; CHECK: ret float 1.400000e+01
 }
 ; memcpy from address space 1
 define float @memcpy_to_float_local_as1(ptr %A) nounwind ssp {
+; CHECK-LABEL: define float @memcpy_to_float_local_as1(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p1.i64(ptr [[A]], ptr addrspace(1) @GCst_as1, i64 12, i1 false)
+; CHECK-NEXT:    ret float 1.400000e+01
+;
 entry:
   tail call void @llvm.memcpy.p0.p1.i64(ptr %A, ptr addrspace(1) @GCst_as1, i64 12, i1 false)
   %arrayidx = getelementptr inbounds float, ptr %A, i64 1 ; <ptr> [#uses=1]
   %tmp2 = load float, ptr %arrayidx                   ; <float> [#uses=1]
   ret float %tmp2
-; CHECK-LABEL: @memcpy_to_float_local_as1(
-; CHECK-NOT: load
-; CHECK: ret float 1.400000e+01
 }
 
 ; PR6642
 define i32 @memset_to_load() nounwind readnone {
+; CHECK-LABEL: define i32 @memset_to_load(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X:%.*]] = alloca [256 x i32], align 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[X]], i8 0, i64 1024, i1 false)
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %x = alloca [256 x i32], align 4                ; <ptr> [#uses=2]
   call void @llvm.memset.p0.i64(ptr align 4 %x, i8 0, i64 1024, i1 false)
   %arraydecay = getelementptr inbounds [256 x i32], ptr %x, i32 0, i32 0 ; <ptr>
   %tmp1 = load i32, ptr %arraydecay                   ; <i32> [#uses=1]
   ret i32 %tmp1
-; CHECK-LABEL: @memset_to_load(
-; CHECK: ret i32 0
 }
 declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) nounwind
 
diff --git a/llvm/test/Transforms/NewGVN/simp-to-self.ll b/llvm/test/Transforms/NewGVN/simp-to-self.ll
index fb8a01962959b..f9a0ec257259a 100644
--- a/llvm/test/Transforms/NewGVN/simp-to-self.ll
+++ b/llvm/test/Transforms/NewGVN/simp-to-self.ll
@@ -1,13 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S < %s -passes=newgvn | FileCheck %s
 
-; CHECK-LABEL: for.cond:
-; CHECK-NEXT:    %lv = load i32, ptr @a
-; CHECK-NEXT:    %bf.clear = and i32 %lv, -131072
-; CHECK-NEXT:    %bf.set = or i32 1, %bf.clear
-; CHECK-NEXT:    br i1 %bc, label %for.cond, label %exit
 @a = external global i64
 
 define void @fn1(i1 %bc) {
+; CHECK-LABEL: define void @fn1(
+; CHECK-SAME: i1 [[BC:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    [[BF_CLEAR:%.*]] = and i32 [[LV]], -131072
+; CHECK-NEXT:    [[BF_SET:%.*]] = or i32 1, [[BF_CLEAR]]
+; CHECK-NEXT:    br i1 [[BC]], label [[FOR_COND]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    store i32 [[BF_SET]], ptr @a, align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.cond
 
diff --git a/llvm/test/Transforms/NewGVN/stale-loop-info.ll b/llvm/test/Transforms/NewGVN/stale-loop-info.ll
index 8870824d38ad6..7abe80b005ecf 100644
--- a/llvm/test/Transforms/NewGVN/stale-loop-info.ll
+++ b/llvm/test/Transforms/NewGVN/stale-loop-info.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes='require<loops>,newgvn' -S < %s | FileCheck %s
 
 ; This used to fail with ASAN enabled and if for some reason LoopInfo remained
@@ -14,6 +15,29 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 declare void @snork.1(ptr) local_unnamed_addr #0
 
 define hidden zeroext i1 @eggs(ptr %arg, i1 %arg2) unnamed_addr align 2 {
+; CHECK-LABEL: define hidden zeroext i1 @eggs(
+; CHECK-SAME: ptr [[ARG:%.*]], i1 [[ARG2:%.*]]) unnamed_addr align 2 {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br i1 [[ARG2]], label [[BB14:%.*]], label [[BB3:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_WIBBLE_1028:%.*]], ptr [[ARG]], i64 0, i32 2, i32 0, i32 0, i64 0
+; CHECK-NEXT:    br label [[BB6:%.*]]
+; CHECK:       bb6:
+; CHECK-NEXT:    br label [[BB7:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    br i1 undef, label [[BB11:%.*]], label [[BB8:%.*]]
+; CHECK:       bb8:
+; CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP]], align 8
+; CHECK-NEXT:    br label [[BB12:%.*]]
+; CHECK:       bb11:
+; CHECK-NEXT:    br label [[BB12]]
+; CHECK:       bb12:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[TMP]], [[BB11]] ], [ [[TMP9]], [[BB8]] ]
+; CHECK-NEXT:    call void @snork.1(ptr [[TMP13]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    br label [[BB6]]
+; CHECK:       bb14:
+; CHECK-NEXT:    ret i1 false
+;
 bb:
   br i1 %arg2, label %bb14, label %bb3
 
@@ -29,7 +53,6 @@ bb7:                                              ; preds = %bb6
 
 bb8:                                              ; preds = %bb7
   %tmp9 = load ptr, ptr %tmp, align 8
-; CHECK: %tmp9 = load ptr, ptr %tmp, align 8
   br label %bb12
 
 bb11:                                             ; preds = %bb7
diff --git a/llvm/test/Transforms/NewGVN/tbaa.ll b/llvm/test/Transforms/NewGVN/tbaa.ll
index e6d66dca0decf..335e782acc8bc 100644
--- a/llvm/test/Transforms/NewGVN/tbaa.ll
+++ b/llvm/test/Transforms/NewGVN/tbaa.ll
@@ -1,10 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 define i32 @test1(ptr %p, ptr %q) {
-; CHECK-LABEL: @test1(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p)
-; CHECK-NOT: tbaa
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test1(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]])
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p)
   %c = add i32 %a, %b
@@ -12,9 +15,12 @@ define i32 @test1(ptr %p, ptr %q) {
 }
 
 define i32 @test2(ptr %p, ptr %q) {
-; CHECK-LABEL: @test2(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGC:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test2(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p), !tbaa !0
   %c = add i32 %a, %b
@@ -22,9 +28,12 @@ define i32 @test2(ptr %p, ptr %q) {
 }
 
 define i32 @test3(ptr %p, ptr %q) {
-; CHECK-LABEL: @test3(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGB:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test3(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !3
   %b = call i32 @foo(ptr %p), !tbaa !3
   %c = add i32 %a, %b
@@ -32,9 +41,12 @@ define i32 @test3(ptr %p, ptr %q) {
 }
 
 define i32 @test4(ptr %p, ptr %q) {
-; CHECK-LABEL: @test4(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGA:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test4(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !1
   %b = call i32 @foo(ptr %p), !tbaa !0
   %c = add i32 %a, %b
@@ -42,9 +54,12 @@ define i32 @test4(ptr %p, ptr %q) {
 }
 
 define i32 @test5(ptr %p, ptr %q) {
-; CHECK-LABEL: @test5(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGA]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test5(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p), !tbaa !1
   %c = add i32 %a, %b
@@ -52,9 +67,12 @@ define i32 @test5(ptr %p, ptr %q) {
 }
 
 define i32 @test6(ptr %p, ptr %q) {
-; CHECK-LABEL: @test6(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGA]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test6(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p), !tbaa !3
   %c = add i32 %a, %b
@@ -62,10 +80,12 @@ define i32 @test6(ptr %p, ptr %q) {
 }
 
 define i32 @test7(ptr %p, ptr %q) {
-; CHECK-LABEL: @test7(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p)
-; CHECK-NOT: tbaa
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test7(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]])
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !4
   %b = call i32 @foo(ptr %p), !tbaa !3
   %c = add i32 %a, %b
@@ -73,9 +93,11 @@ define i32 @test7(ptr %p, ptr %q) {
 }
 
 define i32 @test8(ptr %p, ptr %q) {
-; CHECK-LABEL: @test8
-; CHECK-NEXT: store i32 15, ptr %p
-; CHECK-NEXT: ret i32 0
+; CHECK-LABEL: define i32 @test8(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    store i32 15, ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 0
+;
 ; Since we know the location is invariant, we can forward the
 ; load across the potentially aliasing store.
 
@@ -87,9 +109,11 @@ define i32 @test8(ptr %p, ptr %q) {
 }
 
 define i32 @test9(ptr %p, ptr %q) {
-; CHECK-LABEL: @test9
-; CHECK-NEXT: call void @clobber()
-; CHECK-NEXT: ret i32 0
+; CHECK-LABEL: define i32 @test9(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    ret i32 0
+;
 ; Since we know the location is invariant, we can forward the
 ; load across the potentially aliasing store (within the call).
 
@@ -103,9 +127,12 @@ define i32 @test9(ptr %p, ptr %q) {
 define i32 @test10(ptr %p, ptr %q) {
 ; If one access encloses the other, then the merged access is the enclosed one
 ; and not just the common final access type.
-; CHECK-LABEL: @test10
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAG_X_i:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test10(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !15  ; TAG_X_i
   %b = call i32 @foo(ptr %p), !tbaa !19  ; TAG_Y_x_i
   %c = add i32 %a, %b
@@ -115,12 +142,6 @@ define i32 @test10(ptr %p, ptr %q) {
 declare void @clobber()
 declare i32 @foo(ptr) readonly
 
-; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
-; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
-; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}}
-; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
-; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]}
-; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
 !0 = !{!5, !5, i64 0}
 !1 = !{!6, !6, i64 0}
 !2 = !{!"tbaa root"}
@@ -132,9 +153,6 @@ declare i32 @foo(ptr) readonly
 !8 = !{!"another root"}
 !11 = !{!"scalar type", !8}
 
-; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0}
-; CHECK-DAG: [[TYPE_X:!.*]] = !{!"struct X", [[TYPE_int]], i64 0}
-; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0}
 !15 = !{!16, !17, i64 0}            ; TAG_X_i
 !16 = !{!"struct X", !17, i64 0}    ; struct X { int i; };
 !17 = !{!"int", !18, i64 0}
@@ -146,3 +164,16 @@ declare i32 @foo(ptr) readonly
 ; A TBAA structure who's only point is to have a constant location.
 !9 = !{!"yet another root"}
 !10 = !{!"node", !9, i64 1}
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"C", [[META2:![0-9]+]]}
+; CHECK: [[META2]] = !{!"A", [[META3:![0-9]+]]}
+; CHECK: [[META3]] = !{!"tbaa root"}
+; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0}
+; CHECK: [[META5]] = !{!"B", [[META2]]}
+; CHECK: [[TBAA6]] = !{[[META2]], [[META2]], i64 0}
+; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 0}
+; CHECK: [[META8]] = !{!"struct X", [[META9]], i64 0}
+; CHECK: [[META9]] = !{!"int", [[META10:![0-9]+]], i64 0}
+; CHECK: [[META10]] = !{!"char", [[META3]], i64 0}
+;.
diff --git a/llvm/test/Transforms/NewGVN/unreachable_block_infinite_loop.ll b/llvm/test/Transforms/NewGVN/unreachable_block_infinite_loop.ll
index 7fbf50680a32e..70e5e1a138da7 100644
--- a/llvm/test/Transforms/NewGVN/unreachable_block_infinite_loop.ll
+++ b/llvm/test/Transforms/NewGVN/unreachable_block_infinite_loop.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -5,11 +6,11 @@ target triple = "x86_64-apple-darwin10.0"
 
 define i32 @test2() nounwind ssp {
 entry:
-    ret i32 0
+  ret i32 0
 
 unreachable_block:
-    %a = add i32 %a, 1
-    ret i32 %a
+  %a = add i32 %a, 1
+  ret i32 %a
 }
 
 define i32 @pr23096_test0() {
diff --git a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
index b863d23cffdc3..2a1fcf35157f4 100644
--- a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
+++ b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
@@ -1,21 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; Skip dead MemoryPhis when performing memory congruency verification
 ; in NewGVN.
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
 ; REQUIRES: asserts
 
-; CHECK: define void @tinkywinky() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   br i1 false, label %body, label %end
-; CHECK:      body:
-; CHECK-NEXT:   store i8 poison, ptr null
-; CHECK-NEXT:   br label %end
-; CHECK:      end:
-; CHECK-NEXT:   ret void
-; CHECK-NEXT: }
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[BODY:%.*]], label [[END:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @llvm.lifetime.start.p0(i64 4, ptr undef)
   br i1 false, label %body, label %end
diff --git a/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll b/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll
index 2febea7e94819..d6daff99591f9 100644
--- a/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll
+++ b/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll
@@ -1,13 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 %struct.t = type { ptr }
 
 ; The loaded address and the location of the address itself are not aliased,
 ; so the second reload is not necessary. Check that it can be eliminated.
-; CHECK-LABEL: test1
-; CHECK: load
-; CHECK-NOT: load
 define void @test1(ptr nocapture readonly %p, i32 %v) #0 {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: ptr nocapture readonly [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA5]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = load ptr, ptr %p, align 4, !tbaa !1
   store volatile i32 %v, ptr %0, align 4, !tbaa !6
@@ -18,11 +24,16 @@ entry:
 
 ; The store via the loaded address may overwrite the address itself.
 ; Make sure that both loads remain.
-; CHECK-LABEL: test2
-; CHECK: load
-; CHECK: store
-; CHECK: load
 define void @test2(ptr nocapture readonly %p, i32 %v) #0 {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: ptr nocapture readonly [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = load ptr, ptr %p, align 4, !tbaa !1
   store volatile i32 %v, ptr %0, align 4, !tbaa !1
@@ -33,11 +44,16 @@ entry:
 
 ; The loads are ordered and non-monotonic. Although they are not aliased to
 ; the stores, make sure both are preserved.
-; CHECK-LABEL: test3
-; CHECK: load
-; CHECK: store
-; CHECK: load
 define void @test3(ptr nocapture readonly %p, i32 %v) #0 {
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: ptr nocapture readonly [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic ptr, ptr [[P]] acquire, align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load atomic ptr, ptr [[P]] acquire, align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP1]], align 4, !tbaa [[TBAA5]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = load atomic ptr, ptr %p acquire, align 4, !tbaa !1
   store volatile i32 %v, ptr %0, align 4, !tbaa !6
@@ -56,3 +72,12 @@ attributes #0 = { norecurse nounwind }
 !6 = !{!7, !7, i64 0}
 !7 = !{!"int", !4, i64 0}
 
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0}
+; CHECK: [[META1]] = !{!"", [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"any pointer", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK: [[META6]] = !{!"int", [[META3]], i64 0}
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index 33cfd0aa8574e..ad100c399c08e 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -164,10 +164,10 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16
 ; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META11:![0-9]+]]
 ; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope [[META9]], !noalias [[META11]]
-; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[WIDE_LOAD10]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd <4 x float> [[TMP6]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[WIDE_LOAD11]]
-; CHECK-NEXT:    [[PREDPHI12:%.*]] = fadd <4 x float> [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[TMP6]], [[WIDE_LOAD10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[WIDE_LOAD11]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[TMP6]], <4 x float> [[TMP10]]
+; CHECK-NEXT:    [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP11]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 16
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4, !alias.scope [[META9]], !noalias [[META11]]
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI12]], ptr [[TMP12]], align 4, !alias.scope [[META9]], !noalias [[META11]]
diff --git a/llvm/test/Transforms/SCCP/add-nuw-nsw-flags.ll b/llvm/test/Transforms/SCCP/add-nuw-nsw-flags.ll
index b8f5d5dba0c4b..05d9acd191962 100644
--- a/llvm/test/Transforms/SCCP/add-nuw-nsw-flags.ll
+++ b/llvm/test/Transforms/SCCP/add-nuw-nsw-flags.ll
@@ -240,3 +240,32 @@ then:
 else:
   ret i16 0
 }
+
+define i1 @test_add_nuw_sub(i32 %a) {
+; CHECK-LABEL: @test_add_nuw_sub(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i32 [[A:%.*]], 10000
+; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[ADD]], -5000
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %add = add nuw i32 %a, 10000
+  %sub = add i32 %add, -5000
+  %cond = icmp ult i32 %sub, 5000
+  ret i1 %cond
+}
+
+define i1 @test_add_nsw_sub(i32 %a) {
+; CHECK-LABEL: @test_add_nsw_sub(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[A:%.*]], 10000
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[ADD]], -5000
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[SUB]], 5000
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+entry:
+  %add = add nsw i32 %a, 10000
+  %sub = add i32 %add, -5000
+  %cond = icmp ult i32 %sub, 5000
+  ret i1 %cond
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index cef791633655a..5e3fd156666f5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -17,12 +17,13 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
 ; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
 ; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
new file mode 100644
index 0000000000000..6907724729759
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @h() {
+; CHECK-LABEL: define void @h() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i32> [[TMP0]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i1> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16>
+; CHECK-NEXT:    store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv9 = zext i16 0 to i32
+  %arrayidx2 = getelementptr i8, ptr null, i64 16
+  %conv310 = zext i16 0 to i32
+  %add4 = or i32 %conv310, %conv9
+  %sub = or i32 %conv9, %conv310
+  %conv15 = sext i16 0 to i32
+  %shr = ashr i32 0, 0
+  %arrayidx18 = getelementptr i8, ptr null, i64 24
+  %conv19 = sext i16 0 to i32
+  %sub20 = or i32 %shr, %conv19
+  %shr29 = ashr i32 0, 0
+  %add30 = or i32 %shr29, %conv15
+  %sub39 = or i32 %sub, %sub20
+  %conv40 = trunc i32 %sub39 to i16
+  store i16 %conv40, ptr %arrayidx2, align 2
+  %sub44 = or i32 %add4, %add30
+  %conv45 = trunc i32 %sub44 to i16
+  store i16 %conv45, ptr %arrayidx18, align 2
+  %arrayidx2.1 = getelementptr i8, ptr null, i64 18
+  %conv3.112 = zext i16 0 to i32
+  %add4.1 = or i32 %conv3.112, 0
+  %sub.1 = or i32 0, %conv3.112
+  %conv15.1 = sext i16 0 to i32
+  %shr.1 = ashr i32 0, 0
+  %arrayidx18.1 = getelementptr i8, ptr null, i64 26
+  %conv19.1 = sext i16 0 to i32
+  %sub20.1 = or i32 %shr.1, %conv19.1
+  %shr29.1 = ashr i32 0, 0
+  %add30.1 = or i32 %shr29.1, %conv15.1
+  %sub39.1 = or i32 %sub.1, %sub20.1
+  %conv40.1 = trunc i32 %sub39.1 to i16
+  store i16 %conv40.1, ptr %arrayidx2.1, align 2
+  %sub44.1 = or i32 %add4.1, %add30.1
+  %conv45.1 = trunc i32 %sub44.1 to i16
+  store i16 %conv45.1, ptr %arrayidx18.1, align 2
+  %conv.213 = zext i16 0 to i32
+  %arrayidx2.2 = getelementptr i8, ptr null, i64 20
+  %conv3.214 = zext i16 0 to i32
+  %add4.2 = or i32 0, %conv.213
+  %sub.2 = or i32 0, %conv3.214
+  %conv15.2 = sext i16 0 to i32
+  %shr.2 = ashr i32 0, 0
+  %arrayidx18.2 = getelementptr i8, ptr null, i64 28
+  %conv19.2 = sext i16 0 to i32
+  %sub20.2 = or i32 %shr.2, %conv19.2
+  %shr29.2 = ashr i32 0, 0
+  %add30.2 = or i32 %shr29.2, %conv15.2
+  %sub39.2 = or i32 %sub.2, %sub20.2
+  %conv40.2 = trunc i32 %sub39.2 to i16
+  store i16 %conv40.2, ptr %arrayidx2.2, align 2
+  %sub44.2 = or i32 %add4.2, %add30.2
+  %conv45.2 = trunc i32 %sub44.2 to i16
+  store i16 %conv45.2, ptr %arrayidx18.2, align 2
+  %conv.315 = zext i16 0 to i32
+  %arrayidx2.3 = getelementptr i8, ptr null, i64 22
+  %conv3.316 = zext i16 0 to i32
+  %add4.3 = or i32 0, %conv.315
+  %sub.3 = or i32 0, %conv3.316
+  %conv15.3 = sext i16 0 to i32
+  %shr.3 = ashr i32 0, 0
+  %arrayidx18.3 = getelementptr i8, ptr null, i64 30
+  %conv19.3 = sext i16 0 to i32
+  %sub20.3 = or i32 %shr.3, %conv19.3
+  %shr29.3 = ashr i32 0, 0
+  %add30.3 = or i32 %shr29.3, %conv15.3
+  %sub39.3 = or i32 %sub.3, %sub20.3
+  %conv40.3 = trunc i32 %sub39.3 to i16
+  store i16 %conv40.3, ptr %arrayidx2.3, align 2
+  %sub44.3 = or i32 %add4.3, %add30.3
+  %conv45.3 = trunc i32 %sub44.3 to i16
+  store i16 %conv45.3, ptr %arrayidx18.3, align 2
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
new file mode 100644
index 0000000000000..d51ef0bce3a4e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @h() {
+; CHECK-LABEL: define void @h() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i32> zeroinitializer to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <8 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i1> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i1> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %conv9 = zext i16 0 to i32
+  %arrayidx2 = getelementptr i8, ptr null, i64 16
+  %conv310 = zext i16 0 to i32
+  %add4 = add i32 %conv310, %conv9
+  %sub = sub i32 0, %conv310
+  %conv15 = sext i16 0 to i32
+  %shr = ashr i32 0, 0
+  %arrayidx18 = getelementptr i8, ptr null, i64 24
+  %conv19 = sext i16 0 to i32
+  %sub20 = sub i32 %shr, %conv19
+  %shr29 = ashr i32 0, 0
+  %add30 = add i32 %shr29, %conv15
+  %sub39 = or i32 %sub, %sub20
+  %conv40 = trunc i32 %sub39 to i16
+  store i16 %conv40, ptr %arrayidx2, align 2
+  %sub44 = or i32 %add4, %add30
+  %conv45 = trunc i32 %sub44 to i16
+  store i16 %conv45, ptr %arrayidx18, align 2
+  %arrayidx2.1 = getelementptr i8, ptr null, i64 18
+  %conv3.112 = zext i16 0 to i32
+  %add4.1 = add i32 %conv3.112, 0
+  %sub.1 = sub i32 0, %conv3.112
+  %conv15.1 = sext i16 0 to i32
+  %shr.1 = ashr i32 0, 0
+  %arrayidx18.1 = getelementptr i8, ptr null, i64 26
+  %conv19.1 = sext i16 0 to i32
+  %sub20.1 = sub i32 %shr.1, %conv19.1
+  %shr29.1 = ashr i32 0, 0
+  %add30.1 = add i32 %shr29.1, %conv15.1
+  %sub39.1 = or i32 %sub.1, %sub20.1
+  %conv40.1 = trunc i32 %sub39.1 to i16
+  store i16 %conv40.1, ptr %arrayidx2.1, align 2
+  %sub44.1 = or i32 %add4.1, %add30.1
+  %conv45.1 = trunc i32 %sub44.1 to i16
+  store i16 %conv45.1, ptr %arrayidx18.1, align 2
+  %conv.213 = zext i16 0 to i32
+  %arrayidx2.2 = getelementptr i8, ptr null, i64 20
+  %conv3.214 = zext i16 0 to i32
+  %add4.2 = add i32 0, %conv.213
+  %sub.2 = sub i32 0, %conv3.214
+  %conv15.2 = sext i16 0 to i32
+  %shr.2 = ashr i32 0, 0
+  %arrayidx18.2 = getelementptr i8, ptr null, i64 28
+  %conv19.2 = sext i16 0 to i32
+  %sub20.2 = sub i32 %shr.2, %conv19.2
+  %shr29.2 = ashr i32 0, 0
+  %add30.2 = add i32 %shr29.2, %conv15.2
+  %sub39.2 = or i32 %sub.2, %sub20.2
+  %conv40.2 = trunc i32 %sub39.2 to i16
+  store i16 %conv40.2, ptr %arrayidx2.2, align 2
+  %sub44.2 = or i32 %add4.2, %add30.2
+  %conv45.2 = trunc i32 %sub44.2 to i16
+  store i16 %conv45.2, ptr %arrayidx18.2, align 2
+  %conv.315 = zext i16 0 to i32
+  %arrayidx2.3 = getelementptr i8, ptr null, i64 22
+  %conv3.316 = zext i16 0 to i32
+  %add4.3 = add i32 0, %conv.315
+  %sub.3 = sub i32 0, %conv3.316
+  %conv15.3 = sext i16 0 to i32
+  %shr.3 = ashr i32 0, 0
+  %arrayidx18.3 = getelementptr i8, ptr null, i64 30
+  %conv19.3 = sext i16 0 to i32
+  %sub20.3 = sub i32 %shr.3, %conv19.3
+  %shr29.3 = ashr i32 0, 0
+  %add30.3 = add i32 %shr29.3, %conv15.3
+  %sub39.3 = or i32 %sub.3, %sub20.3
+  %conv40.3 = trunc i32 %sub39.3 to i16
+  store i16 %conv40.3, ptr %arrayidx2.3, align 2
+  %sub44.3 = or i32 %add4.3, %add30.3
+  %conv45.3 = trunc i32 %sub44.3 to i16
+  store i16 %conv45.3, ptr %arrayidx18.3, align 2
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
index 47485e514ec2f..1cce52060c479 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 1986b51ec9482..7c5f9847db1f4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -228,7 +228,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; YAML-NEXT: Function:        test_unrolled_select
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-36'
+; YAML-NEXT:   - Cost:            '-41'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '10'
 
@@ -246,15 +246,17 @@ define i32 @test_unrolled_select(ptr noalias nocapture readonly %blk1, ptr noali
 ; CHECK-NEXT:    [[P2_045:%.*]] = phi ptr [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR88:%.*]], [[IF_END_86]] ]
 ; CHECK-NEXT:    [[P1_044:%.*]] = phi ptr [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END_86]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[P1_044]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[P2_045]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP8]], [[S_047]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i16> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <8 x i16> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sext <8 x i16> [[TMP8]] to <8 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP10]], [[S_047]]
 ; CHECK-NEXT:    [[CMP83:%.*]] = icmp slt i32 [[OP_RDX]], [[LIM:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]]
 ; CHECK:       if.end.86:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
index d67fdc1cd6aa0..a7a7f642ced53 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
@@ -28,21 +28,11 @@ entry:
 define i64 @red_zext_ld_4xi64(ptr %ptr) {
 ; CHECK-LABEL: @red_zext_ld_4xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LD0:%.*]] = load i8, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
-; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[ZEXT_1:%.*]] = zext i8 [[LD1]] to i64
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[ZEXT]], [[ZEXT_1]]
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 2
-; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[GEP_1]], align 1
-; CHECK-NEXT:    [[ZEXT_2:%.*]] = zext i8 [[LD2]] to i64
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i64 [[ADD_1]], [[ZEXT_2]]
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 3
-; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[GEP_2]], align 1
-; CHECK-NEXT:    [[ZEXT_3:%.*]] = zext i8 [[LD3]] to i64
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i64 [[ADD_2]], [[ZEXT_3]]
-; CHECK-NEXT:    ret i64 [[ADD_3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %ld0 = load i8, ptr %ptr
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll
new file mode 100644
index 0000000000000..a38f4bdc4640e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -S -mtriple=aarch64 -vector-library=ArmPL -passes=slp-vectorizer | FileCheck %s
+
+@a = common global ptr null, align 8
+
+define void @frem_v2double() {
+; CHECK-LABEL: define void @frem_v2double() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = frem <2 x double> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr @a, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a0 = load double, ptr getelementptr inbounds (double, ptr @a, i64 0), align 8
+  %a1 = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+  %b0 = load double, ptr getelementptr inbounds (double, ptr @a, i64 0), align 8
+  %b1 = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+  %r0 = frem double %a0, %b0
+  %r1 = frem double %a1, %b1
+  store double %r0, ptr getelementptr inbounds (double, ptr @a, i64 0), align 8
+  store double %r1, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+  ret void
+}
+
+define void @frem_v4float() {
+; CHECK-LABEL: define void @frem_v4float() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = frem <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store <4 x float> [[TMP2]], ptr @a, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a0 = load float, ptr getelementptr inbounds (float, ptr @a, i64 0), align 8
+  %a1 = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+  %a2 = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+  %a3 = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+  %b0 = load float, ptr getelementptr inbounds (float, ptr @a, i64 0), align 8
+  %b1 = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+  %b2 = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+  %b3 = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+  %r0 = frem float %a0, %b0
+  %r1 = frem float %a1, %b1
+  %r2 = frem float %a2, %b2
+  %r3 = frem float %a3, %b3
+  store float %r0, ptr getelementptr inbounds (float, ptr @a, i64 0), align 8
+  store float %r1, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+  store float %r2, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+  store float %r3, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+  ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
new file mode 100644
index 0000000000000..6404cf4a2cd1d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @h() {
+; CHECK-LABEL: define void @h() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx2 = getelementptr i8, ptr null, i64 16
+  %conv310 = zext i16 0 to i32
+  %add4 = or i32 %conv310, 0
+  %sub = or i32 0, %conv310
+  %conv15 = sext i16 0 to i32
+  %shr = ashr i32 %conv15, 0
+  %arrayidx18 = getelementptr i8, ptr null, i64 24
+  %conv19 = sext i16 0 to i32
+  %sub20 = or i32 %shr, 0
+  %shr29 = ashr i32 %conv19, 0
+  %add30 = or i32 %shr29, %conv15
+  %sub39 = or i32 %sub, %sub20
+  %conv40 = trunc i32 %sub39 to i16
+  store i16 %conv40, ptr %arrayidx2, align 2
+  %sub44 = or i32 %add4, %add30
+  %conv45 = trunc i32 %sub44 to i16
+  store i16 %conv45, ptr %arrayidx18, align 2
+  %arrayidx2.1 = getelementptr i8, ptr null, i64 18
+  %conv3.112 = zext i16 0 to i32
+  %add4.1 = or i32 %conv3.112, 0
+  %sub.1 = or i32 0, %conv3.112
+  %conv15.1 = sext i16 0 to i32
+  %shr.1 = ashr i32 %conv15.1, 0
+  %arrayidx18.1 = getelementptr i8, ptr null, i64 26
+  %conv19.1 = sext i16 0 to i32
+  %sub20.1 = or i32 %shr.1, 0
+  %shr29.1 = ashr i32 %conv19.1, 0
+  %add30.1 = or i32 %shr29.1, 0
+  %sub39.1 = or i32 %sub.1, %sub20.1
+  %conv40.1 = trunc i32 %sub39.1 to i16
+  store i16 %conv40.1, ptr %arrayidx2.1, align 2
+  %sub44.1 = or i32 %add4.1, %add30.1
+  %conv45.1 = trunc i32 %sub44.1 to i16
+  store i16 %conv45.1, ptr %arrayidx18.1, align 2
+  %conv.213 = zext i16 0 to i32
+  %arrayidx2.2 = getelementptr i8, ptr null, i64 20
+  %conv3.214 = zext i16 0 to i32
+  %add4.2 = or i32 0, %conv.213
+  %sub.2 = or i32 0, %conv3.214
+  %conv15.2 = sext i16 0 to i32
+  %shr.2 = ashr i32 %conv15.2, 0
+  %arrayidx18.2 = getelementptr i8, ptr null, i64 28
+  %conv19.2 = sext i16 0 to i32
+  %sub20.2 = or i32 %shr.2, 0
+  %shr29.2 = ashr i32 %conv19.2, 0
+  %add30.2 = or i32 %shr29.2, 0
+  %sub39.2 = or i32 %sub.2, %sub20.2
+  %conv40.2 = trunc i32 %sub39.2 to i16
+  store i16 %conv40.2, ptr %arrayidx2.2, align 2
+  %sub44.2 = or i32 %add4.2, %add30.2
+  %conv45.2 = trunc i32 %sub44.2 to i16
+  store i16 %conv45.2, ptr %arrayidx18.2, align 2
+  %conv.315 = zext i16 0 to i32
+  %arrayidx2.3 = getelementptr i8, ptr null, i64 22
+  %conv3.316 = zext i16 0 to i32
+  %add4.3 = or i32 0, %conv.315
+  %sub.3 = or i32 0, %conv3.316
+  %conv15.3 = sext i16 0 to i32
+  %shr.3 = ashr i32 %conv15.3, 0
+  %arrayidx18.3 = getelementptr i8, ptr null, i64 30
+  %conv19.3 = sext i16 0 to i32
+  %sub20.3 = or i32 %shr.3, 0
+  %shr29.3 = ashr i32 %conv19.3, 0
+  %add30.3 = or i32 %shr29.3, 0
+  %sub39.3 = or i32 %sub.3, %sub20.3
+  %conv40.3 = trunc i32 %sub39.3 to i16
+  store i16 %conv40.3, ptr %arrayidx2.3, align 2
+  %sub44.3 = or i32 %add4.3, %add30.3
+  %conv45.3 = trunc i32 %sub44.3 to i16
+  store i16 %conv45.3, ptr %arrayidx18.3, align 2
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index ed73f7b134465..d87bdfe268991 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -6,7 +6,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[CONV1:%.*]] = zext i8 [[TMP0]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 4, i64 6>
@@ -37,10 +37,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
+; CHECK-NEXT:    [[TMP101:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = sub <2 x i32> [[TMP101]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
@@ -70,9 +70,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
-; CHECK-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP42]], [[TMP41]]
+; CHECK-NEXT:    [[CONV:%.*]] = add i32 [[TMP42]], [[TMP41]]
 ; CHECK-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]]
-; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
+; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
@@ -104,32 +104,32 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
 ; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]]
-; CHECK-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
 ; CHECK-NEXT:    [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
 ; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]]
 ; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
 ; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
-; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
-; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP76]], 15
-; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
-; CHECK-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15
-; CHECK-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
-; CHECK-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP77]], 15
-; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
-; CHECK-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
-; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
+; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[TMP79]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
 ; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
 ; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
+; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
+; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP107]], 15
+; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
+; CHECK-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15
+; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
+; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
+; CHECK-NEXT:    [[SHR_I49_5:%.*]] = lshr i32 [[CONV1]], 15
+; CHECK-NEXT:    [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
+; CHECK-NEXT:    [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
 ; CHECK-NEXT:    [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT:    [[TMP79:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
+; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1
 ; CHECK-NEXT:    [[TMP81:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP80]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
@@ -147,20 +147,20 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP92]], [[TMP94]]
 ; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP79]], i32 [[CONV33]], i32 1
+; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
 ; CHECK-NEXT:    [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]]
-; CHECK-NEXT:    [[TMP99:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <2 x i32> [[TMP79]], i32 [[CONV]], i32 0
-; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
-; CHECK-NEXT:    [[TMP102:%.*]] = add <2 x i32> [[TMP88]], [[TMP101]]
-; CHECK-NEXT:    [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP99]], <2 x i32> [[TMP102]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP104:%.*]] = add <2 x i32> [[TMP99]], [[TMP102]]
-; CHECK-NEXT:    [[TMP105:%.*]] = sub <2 x i32> [[TMP102]], [[TMP99]]
-; CHECK-NEXT:    [[TMP106:%.*]] = extractelement <2 x i32> [[TMP104]], i32 0
-; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP104]], i32 1
-; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP107]], [[TMP106]]
+; CHECK-NEXT:    [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
+; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
+; CHECK-NEXT:    [[TMP103:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
+; CHECK-NEXT:    [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP103]]
+; CHECK-NEXT:    [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP165:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
+; CHECK-NEXT:    [[TMP105:%.*]] = sub <2 x i32> [[TMP200]], [[TMP104]]
+; CHECK-NEXT:    [[TMP238:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
+; CHECK-NEXT:    [[TMP143:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
+; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP143]], [[TMP238]]
 ; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
-; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP107]], 15
+; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP143]], 15
 ; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
 ; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
 ; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15
@@ -185,7 +185,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 1, i64 3>
 ; CHECK-NEXT:    [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT:    [[TMP153:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 5, i64 7>
 ; CHECK-NEXT:    [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
@@ -195,15 +195,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]]
 ; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP128]]
+; CHECK-NEXT:    [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP153]]
 ; CHECK-NEXT:    [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]]
 ; CHECK-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
 ; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]]
 ; CHECK-NEXT:    [[TMP142:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT:    [[TMP143:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]]
+; CHECK-NEXT:    [[TMP257:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]]
 ; CHECK-NEXT:    [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP139]]
-; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
+; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP257]], i32 0
+; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP257]], i32 1
 ; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]]
 ; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
@@ -217,51 +217,51 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
 ; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
 ; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
-; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP76]]
-; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I51_2]], [[ADD103]]
+; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP79]]
+; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51_3]], [[ADD105]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
 ; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]]
 ; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP107]]
+; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP143]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
 ; CHECK-NEXT:    [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB47_2]], i32 1
 ; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_2]], i32 1
-; CHECK-NEXT:    [[TMP153:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
+; CHECK-NEXT:    [[TMP163:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
 ; CHECK-NEXT:    [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP155]]
-; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP153]], i32 1
+; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
 ; CHECK-NEXT:    [[TMP158:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
-; CHECK-NEXT:    [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP158]], [[TMP157]]
-; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP153]], i32 0
+; CHECK-NEXT:    [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP158]], [[TMP157]]
+; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
 ; CHECK-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
-; CHECK-NEXT:    [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP161]], [[TMP160]]
-; CHECK-NEXT:    [[TMP163:%.*]] = sub <2 x i32> [[TMP153]], [[TMP156]]
-; CHECK-NEXT:    [[TMP164:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
-; CHECK-NEXT:    [[TMP165:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
-; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[TMP165]], [[TMP164]]
-; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[TMP164]], [[TMP165]]
+; CHECK-NEXT:    [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP161]], [[TMP160]]
+; CHECK-NEXT:    [[TMP164:%.*]] = sub <2 x i32> [[TMP163]], [[TMP156]]
+; CHECK-NEXT:    [[TMP173:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0
+; CHECK-NEXT:    [[TMP174:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[TMP174]], [[TMP173]]
+; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[TMP173]], [[TMP174]]
 ; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
-; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP77]]
-; CHECK-NEXT:    [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]]
+; CHECK-NEXT:    [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP167:%.*]] = lshr <2 x i32> [[TMP166]], <i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP168:%.*]] = and <2 x i32> [[TMP167]], <i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP169:%.*]] = mul <2 x i32> [[TMP168]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP170:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
-; CHECK-NEXT:    [[TMP171:%.*]] = shufflevector <2 x i32> [[TMP170]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT:    [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP174:%.*]] = add <2 x i32> [[TMP171]], [[TMP173]]
-; CHECK-NEXT:    [[TMP175:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]]
-; CHECK-NEXT:    [[TMP176:%.*]] = shufflevector <2 x i32> [[TMP174]], <2 x i32> [[TMP175]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP176]]
+; CHECK-NEXT:    [[TMP171:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP208:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT:    [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP282:%.*]] = add <2 x i32> [[TMP171]], [[TMP209]]
+; CHECK-NEXT:    [[TMP211:%.*]] = sub <2 x i32> [[TMP171]], [[TMP209]]
+; CHECK-NEXT:    [[TMP283:%.*]] = shufflevector <2 x i32> [[TMP282]], <2 x i32> [[TMP211]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP283]]
 ; CHECK-NEXT:    [[TMP178:%.*]] = xor <2 x i32> [[TMP177]], [[TMP166]]
 ; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
 ; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP108]]
@@ -271,90 +271,90 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP180:%.*]] = extractelement <2 x i32> [[TMP178]], i32 1
 ; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP180]]
 ; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP165]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_2]], i32 0
-; CHECK-NEXT:    [[TMP183:%.*]] = insertelement <2 x i32> [[TMP104]], i32 [[ADD46_2]], i32 0
+; CHECK-NEXT:    [[TMP183:%.*]] = insertelement <2 x i32> [[TMP165]], i32 [[CONV]], i32 0
 ; CHECK-NEXT:    [[TMP184:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]]
-; CHECK-NEXT:    [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP187:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]]
 ; CHECK-NEXT:    [[TMP188:%.*]] = extractelement <2 x i32> [[TMP184]], i32 0
 ; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0
 ; CHECK-NEXT:    [[TMP190:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP189]], [[TMP188]]
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP189]], [[TMP188]]
 ; CHECK-NEXT:    [[TMP191:%.*]] = extractelement <2 x i32> [[TMP184]], i32 1
 ; CHECK-NEXT:    [[TMP192:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1
 ; CHECK-NEXT:    [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP192]], [[TMP191]]
+; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP192]], [[TMP191]]
 ; CHECK-NEXT:    [[TMP194:%.*]] = sub <2 x i32> [[TMP184]], [[TMP187]]
-; CHECK-NEXT:    [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT:    [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT:    [[TMP244:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT:    [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP244]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
 ; CHECK-NEXT:    [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP199:%.*]] = add <2 x i32> [[TMP196]], [[TMP198]]
-; CHECK-NEXT:    [[TMP200:%.*]] = sub <2 x i32> [[TMP196]], [[TMP198]]
-; CHECK-NEXT:    [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP202:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
+; CHECK-NEXT:    [[TMP246:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]]
+; CHECK-NEXT:    [[TMP247:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]]
+; CHECK-NEXT:    [[TMP248:%.*]] = shufflevector <2 x i32> [[TMP246]], <2 x i32> [[TMP247]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP215:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
 ; CHECK-NEXT:    [[TMP203:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1
-; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP202]], [[TMP203]]
-; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP202]]
-; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
+; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP215]], [[TMP203]]
+; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP215]]
+; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_2]]
 ; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT:    [[TMP204:%.*]] = add <2 x i32> [[TMP149]], [[TMP201]]
-; CHECK-NEXT:    [[TMP205:%.*]] = xor <2 x i32> [[TMP204]], [[TMP110]]
-; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP106]], 15
+; CHECK-NEXT:    [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP248]]
+; CHECK-NEXT:    [[TMP267:%.*]] = xor <2 x i32> [[TMP266]], [[TMP110]]
+; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP238]], 15
 ; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
 ; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
 ; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP106]]
+; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP238]]
 ; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP206:%.*]] = extractelement <2 x i32> [[TMP205]], i32 0
+; CHECK-NEXT:    [[TMP206:%.*]] = extractelement <2 x i32> [[TMP267]], i32 0
 ; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP206]]
-; CHECK-NEXT:    [[TMP207:%.*]] = extractelement <2 x i32> [[TMP205]], i32 1
+; CHECK-NEXT:    [[TMP207:%.*]] = extractelement <2 x i32> [[TMP267]], i32 1
 ; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP207]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[TMP208:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB45_2]], i32 0
-; CHECK-NEXT:    [[TMP209:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB47_2]], i32 0
-; CHECK-NEXT:    [[TMP210:%.*]] = sub <2 x i32> [[TMP208]], [[TMP209]]
-; CHECK-NEXT:    [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP221:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB45_2]], i32 0
+; CHECK-NEXT:    [[TMP222:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB47_2]], i32 0
+; CHECK-NEXT:    [[TMP210:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
+; CHECK-NEXT:    [[TMP225:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP212]]
+; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP225]], [[TMP212]]
 ; CHECK-NEXT:    [[TMP214:%.*]] = extractelement <2 x i32> [[TMP210]], i32 0
-; CHECK-NEXT:    [[TMP215:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP215]], [[TMP214]]
+; CHECK-NEXT:    [[TMP227:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
+; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP227]], [[TMP214]]
 ; CHECK-NEXT:    [[TMP217:%.*]] = extractelement <2 x i32> [[TMP210]], i32 1
-; CHECK-NEXT:    [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP210]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP218]], [[TMP217]]
-; CHECK-NEXT:    [[TMP220:%.*]] = sub <2 x i32> [[TMP210]], [[TMP213]]
-; CHECK-NEXT:    [[TMP221:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
-; CHECK-NEXT:    [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP221]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP218:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
+; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[SUB59:%.*]] = add i32 [[TMP218]], [[TMP217]]
+; CHECK-NEXT:    [[TMP220:%.*]] = sub <2 x i32> [[TMP210]], [[TMP226]]
+; CHECK-NEXT:    [[TMP274:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
+; CHECK-NEXT:    [[TMP275:%.*]] = shufflevector <2 x i32> [[TMP274]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
 ; CHECK-NEXT:    [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP225:%.*]] = add <2 x i32> [[TMP222]], [[TMP224]]
-; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP222]], [[TMP224]]
-; CHECK-NEXT:    [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP276:%.*]] = add <2 x i32> [[TMP275]], [[TMP224]]
+; CHECK-NEXT:    [[TMP277:%.*]] = sub <2 x i32> [[TMP275]], [[TMP224]]
+; CHECK-NEXT:    [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP276]], <2 x i32> [[TMP277]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP228:%.*]] = extractelement <2 x i32> [[TMP220]], i32 0
 ; CHECK-NEXT:    [[TMP229:%.*]] = extractelement <2 x i32> [[TMP220]], i32 1
 ; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[TMP228]], [[TMP229]]
 ; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[TMP229]], [[TMP228]]
-; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]]
-; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]]
-; CHECK-NEXT:    [[TMP230:%.*]] = lshr <2 x i32> [[TMP79]], <i32 15, i32 15>
+; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_3]]
+; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
+; CHECK-NEXT:    [[TMP230:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP231:%.*]] = and <2 x i32> [[TMP230]], <i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP232:%.*]] = mul <2 x i32> [[TMP231]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP233:%.*]] = add <2 x i32> [[TMP232]], [[TMP227]]
-; CHECK-NEXT:    [[TMP234:%.*]] = xor <2 x i32> [[TMP233]], [[TMP79]]
+; CHECK-NEXT:    [[TMP286:%.*]] = add <2 x i32> [[TMP232]], [[TMP278]]
+; CHECK-NEXT:    [[TMP287:%.*]] = xor <2 x i32> [[TMP286]], [[TMP102]]
 ; CHECK-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
 ; CHECK-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
 ; CHECK-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
 ; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT:    [[TMP235:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
+; CHECK-NEXT:    [[TMP235:%.*]] = extractelement <2 x i32> [[TMP287]], i32 0
 ; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP235]]
-; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
+; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP287]], i32 1
 ; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP236]]
 ; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; CHECK-NEXT:    ret i32 [[ADD113_3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
index 000e7a56df377..500f10659f04c 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -802,9 +802,10 @@ define i64 @red_zext_ld_4xi64(ptr %ptr) {
 ; CHECK-LABEL: @red_zext_ld_4xi64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
-; CHECK-NEXT:    ret i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %ld0 = load i8, ptr %ptr
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
index 4565d4928ba4a..05511f843a68f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
@@ -15,11 +15,12 @@ define { i64, i64 } @patatino(double %arg) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
 ; CHECK-NEXT:    [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT:    [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP11]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1
 ; CHECK-NEXT:    ret { i64, i64 } [[T17]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
index 12b227cbeb3d3..a153c3ceeaf5d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
@@ -25,8 +25,8 @@ define i32 @fn1() {
 ; CHECK-NEXT:    store i32 [[AND]], ptr @a, align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    switch i32 [[AND]], label [[IF_END:%.*]] [
-; CHECK-NEXT:    i32 7, label [[SAVE_STATE_AND_RETURN]]
-; CHECK-NEXT:    i32 0, label [[SAVE_STATE_AND_RETURN]]
+; CHECK-NEXT:      i32 7, label [[SAVE_STATE_AND_RETURN]]
+; CHECK-NEXT:      i32 0, label [[SAVE_STATE_AND_RETURN]]
 ; CHECK-NEXT:    ]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    br label [[SAVE_STATE_AND_RETURN]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
index a0af8e36b36c7..5ee8016076538 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-3 < %s | FileCheck %s
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-6 < %s | FileCheck %s
 
 define void @t(i64 %v) {
 ; CHECK-LABEL: define void @t(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
new file mode 100644
index 0000000000000..fc28d7ab4ee74
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mcpu=cascadelake < %s | FileCheck %s
+
+define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
+; CHECK-LABEL: define i1 @test(
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], i64 [[TMP1:%.*]], ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT:    [[BF_LOAD_I1336:%.*]] = load i24, ptr [[TMP2]], align 16
+; CHECK-NEXT:    [[AND_I_I_I1342:%.*]] = and i64 [[TMP1]], -16
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[AND_I_I_I1342]] to ptr
+; CHECK-NEXT:    store ptr [[TMP3]], ptr [[P]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16
+; CHECK-NEXT:    [[BF_LOAD_I1345:%.*]] = load i24, ptr [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i24> poison, i24 [[BF_LOAD_I1336]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i24> [[TMP6]], i24 [[BF_LOAD_I1345]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = and <2 x i24> [[TMP7]], <i24 255, i24 255>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
+; CHECK-NEXT:    [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
+; CHECK-NEXT:    [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
+; CHECK-NEXT:    [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq <2 x i32> [[TMP14]], <i32 32, i32 32>
+; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP15]], <2 x i8> <i8 31, i8 31>, <2 x i8> [[TMP25]]
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq <2 x i32> [[TMP16]], <i32 54, i32 54>
+; CHECK-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP17]], <2 x i8> <i8 53, i8 53>, <2 x i8> [[TMP18]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i8> [[TMP21]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT:    store i32 [[TMP19]], ptr [[P1]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i8> [[TMP21]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT:    [[CMP210_NOT:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    ret i1 [[CMP210_NOT]]
+;
+newFuncRoot:
+  %2 = getelementptr inbounds i8, ptr %0, i64 16
+  %bf.load.i1336 = load i24, ptr %2, align 16
+  %bf.clear.i1337 = and i24 %bf.load.i1336, 255
+  %and.i.i.i1342 = and i64 %1, -16
+  %3 = inttoptr i64 %and.i.i.i1342 to ptr
+  store ptr %3, ptr %p, align 8
+  %4 = load ptr, ptr %3, align 16
+  %5 = getelementptr inbounds i8, ptr %4, i64 16
+  %bf.load.i1345 = load i24, ptr %5, align 16
+  %bf.clear.i1346 = and i24 %bf.load.i1345, 255
+  %cmp182 = icmp eq i24 %bf.clear.i1337, 24
+  %narrow = select i1 %cmp182, i24 23, i24 %bf.clear.i1337
+  %s = zext nneg i24 %narrow to i32
+  %cmp185 = icmp eq i24 %bf.clear.i1346, 24
+  %narrow1790 = select i1 %cmp185, i24 23, i24 %bf.clear.i1346
+  %s1139 = zext nneg i24 %narrow1790 to i32
+  %6 = and i32 %s, 254
+  %or.cond1132 = icmp eq i32 %6, 4
+  %s1142 = select i1 %or.cond1132, i32 2, i32 %s
+  %7 = and i32 %s1139, 254
+  %or.cond1133 = icmp eq i32 %7, 4
+  %s1140 = select i1 %or.cond1133, i32 2, i32 %s1139
+  %cmp198 = icmp eq i32 %s1142, 32
+  %s1134 = select i1 %cmp198, i32 31, i32 %s1142
+  %cmp201 = icmp eq i32 %s1140, 32
+  %s1143 = select i1 %cmp201, i32 31, i32 %s1140
+  %cmp204 = icmp eq i32 %s1134, 54
+  %s1135 = select i1 %cmp204, i32 53, i32 %s1134
+  store i32 %s1135, ptr %p1, align 4
+  %cmp207 = icmp eq i32 %s1143, 54
+  %s1141 = select i1 %cmp207, i32 53, i32 %s1143
+  %cmp210.not = icmp eq i32 %s1135, %s1141
+  ret i1 %cmp210.not
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll
index 6e512fcbb7392..6051638562b59 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll
@@ -6,18 +6,17 @@ define void @test(i8 %0) {
 ; CHECK-SAME: i8 [[TMP0:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> <i8 0, i8 poison>, i8 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i16> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
-; CHECK-NEXT:    [[ADD:%.*]] = or i32 [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = or i32 [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[ADD]], 1
 ; CHECK-NEXT:    [[CONV9:%.*]] = trunc i32 [[SHR]] to i8
 ; CHECK-NEXT:    store i8 [[CONV9]], ptr null, align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll
index 2c834616becc0..4acd63078b82e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll
@@ -6,15 +6,20 @@ define void @test(i64 %d.promoted.i) {
 ; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[AND_1_I_1:%.*]] = and i64 0, 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I_1]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i64> [[TMP0]], i64 [[AND_1_I]], i32 9
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <16 x i1> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i1 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], 0
-; CHECK-NEXT:    store i32 [[TMP6]], ptr null, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I_1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; CHECK-NEXT:    [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and i32 [[OP_RDX]], 0
+; CHECK-NEXT:    store i32 [[TMP10]], ptr null, align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
new file mode 100644
index 0000000000000..50b19d01ad58f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr %block, ptr noalias %pixels, i1 %b) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[BLOCK:%.*]], ptr noalias [[PIXELS:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i1> <i1 true, i1 poison, i1 false, i1 false>, i1 [[B]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[BLOCK]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i16> [[TMP2]] to <4 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i8> [[TMP4]], <4 x i8> [[TMP1]]
+; CHECK-NEXT:    store <4 x i8> [[TMP5]], ptr [[PIXELS]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i16, ptr %block, align 2
+  %tobool.not.i78 = icmp ult i16 %0, 0
+  %conv.i80 = sext i1 true to i8
+  %conv1.i81 = trunc i16 %0 to i8
+  %retval.0.i82 = select i1 %tobool.not.i78, i8 %conv1.i81, i8 %conv.i80
+  store i8 %retval.0.i82, ptr %pixels, align 1
+  %arrayidx2 = getelementptr i8, ptr %block, i64 2
+  %1 = load i16, ptr %arrayidx2, align 2
+  %tobool.not.i73 = icmp ult i16 %1, 0
+  %conv.i75 = sext i1 %b to i8
+  %conv1.i76 = trunc i16 %1 to i8
+  %retval.0.i77 = select i1 %tobool.not.i73, i8 %conv1.i76, i8 %conv.i75
+  %arrayidx5 = getelementptr i8, ptr %pixels, i64 1
+  store i8 %retval.0.i77, ptr %arrayidx5, align 1
+  %arrayidx6 = getelementptr i8, ptr %block, i64 4
+  %2 = load i16, ptr %arrayidx6, align 2
+  %tobool.not.i68 = icmp ult i16 %2, 0
+  %conv.i70 = sext i1 false to i8
+  %conv1.i71 = trunc i16 %2 to i8
+  %retval.0.i72 = select i1 %tobool.not.i68, i8 %conv1.i71, i8 %conv.i70
+  %arrayidx9 = getelementptr i8, ptr %pixels, i64 2
+  store i8 %retval.0.i72, ptr %arrayidx9, align 1
+  %arrayidx10 = getelementptr i8, ptr %block, i64 6
+  %3 = load i16, ptr %arrayidx10, align 2
+  %tobool.not.i63 = icmp ult i16 %3, 0
+  %conv.i65 = sext i1 false to i8
+  %conv1.i66 = trunc i16 %3 to i8
+  %retval.0.i67 = select i1 %tobool.not.i63, i8 %conv1.i66, i8 %conv.i65
+  %arrayidx13 = getelementptr i8, ptr %pixels, i64 3
+  store i8 %retval.0.i67, ptr %arrayidx13, align 1
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index 651631de2c35a..a316415dcc6b5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -17,12 +17,15 @@ target triple = "x86_64-unknown-linux-gnu"
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; SSE-LABEL: @PR31243_zext(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; SSE-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
-; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
-; SSE-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
+; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; SSE-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
+; SSE-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
+; SSE-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
 ; SSE-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; SSE-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; SSE-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -73,12 +76,15 @@ entry:
 define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; SSE-LABEL: @PR31243_sext(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; SSE-NEXT:    [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
-; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
-; SSE-NEXT:    [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
+; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; SSE-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
+; SSE-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
+; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
+; SSE-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
 ; SSE-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; SSE-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; SSE-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -89,13 +95,12 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
 ; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
-; AVX-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
-; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
-; AVX-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
+; AVX-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
+; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
+; AVX-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
+; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
 ; AVX-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; AVX-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; AVX-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll
index 88f75c37846ef..3cc32c1fc7b28 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll
@@ -15,8 +15,8 @@ define i32 @phi3UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 undef, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -52,8 +52,8 @@ define i32 @phi2UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -89,8 +89,8 @@ define i32 @phi1UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 0, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -127,8 +127,8 @@ define i32 @phi1Undef1PoisonInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %ar
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -165,8 +165,8 @@ define i32 @phi1Undef2PoisonInputs(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %a
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 poison, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -202,8 +202,8 @@ define i32 @phi1Undef1PoisonGapInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
index 6f5d3d3785e0c..f6db831ed97b1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
@@ -108,3 +108,111 @@ entry:
   store i32 %conv27, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 3), align 4
   ret void
 }
+
+define void @test_div() {
+; CHECK-LABEL: define void @test_div(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i32> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[TMP4]], <i64 1, i64 2, i64 1, i64 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx1 = getelementptr i32, ptr null, i64 1
+  %0 = load i32, ptr %arrayidx1, align 4
+  %arrayidx2 = getelementptr i32, ptr null, i64 63
+  %1 = load i32, ptr %arrayidx2, align 4
+  %mul = mul i32 %1, %0
+  %conv = zext i32 %mul to i64
+  %shr = udiv i64 %conv, 1
+  %conv3 = trunc i64 %shr to i32
+  store i32 %conv3, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+  %arrayidx5 = getelementptr i32, ptr null, i64 33
+  %2 = load i32, ptr %arrayidx5, align 4
+  %arrayidx6 = getelementptr i32, ptr null, i64 62
+  %3 = load i32, ptr %arrayidx6, align 4
+  %mul7 = mul i32 %3, %2
+  %conv8 = zext i32 %mul7 to i64
+  %shr10 = udiv i64 %conv8, 2
+  %conv11 = trunc i64 %shr10 to i32
+  store i32 %conv11, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 1), align 4
+  %arrayidx13 = getelementptr i32, ptr null, i64 7
+  %4 = load i32, ptr %arrayidx13, align 4
+  %arrayidx14 = getelementptr i32, ptr null, i64 61
+  %5 = load i32, ptr %arrayidx14, align 4
+  %mul15 = mul i32 %5, %4
+  %conv16 = zext i32 %mul15 to i64
+  %shr18 = udiv i64 %conv16, 1
+  %conv19 = trunc i64 %shr18 to i32
+  store i32 %conv19, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 2), align 8
+  %6 = load i32, ptr null, align 4
+  %arrayidx22 = getelementptr i32, ptr null, i64 60
+  %7 = load i32, ptr %arrayidx22, align 4
+  %mul23 = mul i32 %7, %6
+  %conv24 = zext i32 %mul23 to i64
+  %shr26 = udiv i64 %conv24, 2
+  %conv27 = trunc i64 %shr26 to i32
+  store i32 %conv27, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 3), align 4
+  ret void
+}
+
+define void @test_rem() {
+; CHECK-LABEL: define void @test_rem(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i32> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = urem <4 x i64> [[TMP4]], <i64 1, i64 2, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx1 = getelementptr i32, ptr null, i64 1
+  %0 = load i32, ptr %arrayidx1, align 4
+  %arrayidx2 = getelementptr i32, ptr null, i64 63
+  %1 = load i32, ptr %arrayidx2, align 4
+  %mul = mul i32 %1, %0
+  %conv = zext i32 %mul to i64
+  %shr = urem i64 %conv, 1
+  %conv3 = trunc i64 %shr to i32
+  store i32 %conv3, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+  %arrayidx5 = getelementptr i32, ptr null, i64 33
+  %2 = load i32, ptr %arrayidx5, align 4
+  %arrayidx6 = getelementptr i32, ptr null, i64 62
+  %3 = load i32, ptr %arrayidx6, align 4
+  %mul7 = mul i32 %3, %2
+  %conv8 = zext i32 %mul7 to i64
+  %shr10 = urem i64 %conv8, 2
+  %conv11 = trunc i64 %shr10 to i32
+  store i32 %conv11, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 1), align 4
+  %arrayidx13 = getelementptr i32, ptr null, i64 7
+  %4 = load i32, ptr %arrayidx13, align 4
+  %arrayidx14 = getelementptr i32, ptr null, i64 61
+  %5 = load i32, ptr %arrayidx14, align 4
+  %mul15 = mul i32 %5, %4
+  %conv16 = zext i32 %mul15 to i64
+  %shr18 = urem i64 %conv16, 1
+  %conv19 = trunc i64 %shr18 to i32
+  store i32 %conv19, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 2), align 8
+  %6 = load i32, ptr null, align 4
+  %arrayidx22 = getelementptr i32, ptr null, i64 60
+  %7 = load i32, ptr %arrayidx22, align 4
+  %mul23 = mul i32 %7, %6
+  %conv24 = zext i32 %mul23 to i64
+  %shr26 = urem i64 %conv24, 1
+  %conv27 = trunc i64 %shr26 to i32
+  store i32 %conv27, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 3), align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
index 65229d8acd889..f0e734d8c5aef 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
@@ -10,29 +10,29 @@ define  void @foo (ptr %A, ptr %B, ptr %Result) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP18:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], ptr [[A:%.*]], i64 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], ptr [[B:%.*]], i64 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], ptr [[B]], i64 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP9]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP9]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP11]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x float> [[TMP11]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> [[TMP15]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP17]] = fadd <2 x float> [[TMP2]], [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x float> [[TMP8]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP16:%.*]] = fsub <2 x float> [[TMP11]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x float> [[TMP11]], [[TMP15]]
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP20]] = fadd <2 x float> [[TMP2]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP18]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
-; CHECK-NEXT:    store <2 x float> [[TMP17]], ptr [[RESULT:%.*]], align 4
+; CHECK-NEXT:    store <2 x float> [[TMP20]], ptr [[RESULT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
index 78c6d9516a3de..b7237cbb02bb3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -11,26 +11,26 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK:       if.then22.i:
 ; CHECK-NEXT:    [[SUB_I:%.*]] = add nsw i32 undef, -1
 ; CHECK-NEXT:    [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[SHUFFLE1]], <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 2, i32 3, i32 4>
 ; CHECK-NEXT:    [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
 ; CHECK-NEXT:    [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
 ; CHECK-NEXT:    [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr <8 x i32> [[SHUFFLE]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SHR_4_I_I]], i32 5
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_5_I_I]], i32 6
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_6_I_I]], i32 7
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
-; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i8> [[TMP13]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    store <16 x i8> [[TMP14]], ptr undef, align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = and <16 x i8> [[TMP14]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr undef, align 1
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end50.i:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
index 5d22b5a4873be..1d1fcec2a7aeb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
@@ -7,12 +7,10 @@ define i1 @test(i1 %cmp5.not.31) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i1> <i1 poison, i1 false, i1 false, i1 false>, i1 [[CMP5_NOT_31]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i1>
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], <i32 2, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], 0
-; CHECK-NEXT:    [[CMP_NOT_I_I:%.*]] = icmp eq i32 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], <i32 2, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP3]], 0
+; CHECK-NEXT:    [[CMP_NOT_I_I:%.*]] = icmp eq i32 [[TMP4]], 0
 ; CHECK-NEXT:    ret i1 [[CMP_NOT_I_I]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
index 35f2f9e052e74..f1be11d0d0fc5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
@@ -23,10 +23,11 @@ define void @test(i32 %arg) {
 ; CHECK-NEXT:    ]
 ; CHECK:       bb4:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i32> [ [[TMP0]], [[BB2]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
 ; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i32, ptr null, i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP7]] to i64
 ; CHECK-NEXT:    [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr null, i64 [[TMP6]]
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb7:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
index c1dd90d0e9a7b..2f6868d8dfd62 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
@@ -8,17 +8,18 @@
 ; YAML-NEXT:  Function:        stores
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-3'
+; YAML-NEXT:    - Cost:            '-7'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '6'
 define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) {
 ; CHECK-LABEL: @stores(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[INN:%.*]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    store <4 x i64> [[TMP5]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i64>
+; CHECK-NEXT:    store <4 x i64> [[TMP6]], ptr [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %load.1 = load i8, ptr %in, align 1
@@ -63,17 +64,18 @@ define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) {
 ; YAML-NEXT:  Function:        insertelems
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-5'
+; YAML-NEXT:    - Cost:            '-9'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '6'
 define <4 x i64> @insertelems(ptr noalias %in, ptr noalias %inn) {
 ; CHECK-LABEL: @insertelems(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[INN:%.*]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    ret <4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i64>
+; CHECK-NEXT:    ret <4 x i64> [[TMP6]]
 ;
   %load.1 = load i8, ptr %in, align 1
   %gep.1 = getelementptr inbounds i8, ptr %in, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 22cd408cd6dc7..e8395fe69ea6b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -179,11 +179,11 @@ define void @addsub0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
 ; CHECK-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -212,11 +212,11 @@ define void @addsub1(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x i32> [[TMP4]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
@@ -531,11 +531,11 @@ define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
 ; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP6]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -564,11 +564,11 @@ define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP4]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
diff --git a/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll
index 061fbdb45a13b..ff6f0bdd3db8f 100644
--- a/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll
@@ -10,8 +10,8 @@ define i32 @alt_cmp(i16 %call46) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult <4 x i16> [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt <4 x i16> [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i16
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = or i16 [[TMP6]], 0
 ; CHECK-NEXT:    [[EXT:%.*]] = zext i16 [[OP_RDX]] to i32
 ; CHECK-NEXT:    ret i32 [[EXT]]
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list-preinliner.prof b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list-preinliner.prof
new file mode 100644
index 0000000000000..7388d574be5e8
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list-preinliner.prof
@@ -0,0 +1,14 @@
+main:8001:0
+ 1: 0
+ 2: 2000
+ 3: 2000
+ 4: 0
+ 5: 2000
+ 6: 2000
+ 7: 0
+ 8: 0
+ 9: bar:1
+  1: 1
+  !CFGChecksum: 4294967295
+  !Attributes: 2
+ !CFGChecksum: 563088156202820
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-import-list-preinliner.ll b/llvm/test/Transforms/SampleProfile/csspgo-import-list-preinliner.ll
new file mode 100644
index 0000000000000..9e342f2b99da3
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/csspgo-import-list-preinliner.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list-preinliner.prof -S -profile-summary-cutoff-hot=100000 -sample-profile-use-preinliner=0 | FileCheck %s --check-prefix=DISABLE-PREINLINE
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list-preinliner.prof -S -profile-summary-cutoff-hot=100000 | FileCheck %s
+
+; The GUID of bar is -2012135647395072713
+
+; DISABLE-PREINLINE-NOT: -2012135647395072713
+; CHECK: [[#]] = !{!"function_entry_count", i64 1, i64 -2012135647395072713}
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() #0 {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  call void @llvm.pseudoprobe(i64 0, i64 0, i32 0, i64 0)
+  %call2 = call i32 @bar(), !dbg !9
+  br label %for.cond
+}
+
+declare i32 @bar()
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
+
+attributes #0 = { "use-sample-profile" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7}
+!llvm.pseudo_probe_desc = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/home/", checksumkind: CSK_MD5, checksum: "1bff37d8b3f7858b0bc29ab4efdf9422")
+!2 = !{!3}
+!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression())
+!4 = distinct !DIGlobalVariable(name: "x", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true)
+!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6)
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i64 -2624081020897602054, i64 563108639284859, !"main"}
+!9 = !DILocation(line: 11, column: 10, scope: !10)
+!10 = !DILexicalBlockFile(scope: !11, file: !1, discriminator: 186646615)
+!11 = distinct !DILexicalBlock(scope: !12, file: !1, line: 8, column: 40)
+!12 = distinct !DILexicalBlock(scope: !13, file: !1, line: 8, column: 3)
+!13 = distinct !DILexicalBlock(scope: !14, file: !1, line: 8, column: 3)
+!14 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 6, type: !15, scopeLine: 7, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16)
+!15 = distinct !DISubroutineType(types: !16)
+!16 = !{}
diff --git a/llvm/test/Verifier/intrinsic-cmp.ll b/llvm/test/Verifier/intrinsic-cmp.ll
new file mode 100644
index 0000000000000..2224a5c5eba38
--- /dev/null
+++ b/llvm/test/Verifier/intrinsic-cmp.ll
@@ -0,0 +1,22 @@
+; RUN: not opt -S -passes=verify 2>&1 < %s | FileCheck %s
+
+define void @matching_vector_lens(<4 x i32> %arg1, <4 x i32> %arg2) {
+  ; CHECK: return type and arguments must have the same number of elements
+  %res = call <8 x i32> @llvm.scmp.v8i32.v4i32(<4 x i32> %arg1, <4 x i32> %arg2)
+  ret void
+}
+
+define void @result_len_is_at_least_2bits_wide(i32 %arg1, i32 %arg2) {
+  ; CHECK: result type must be at least 2 bits wide
+  %res2 = call i1 @llvm.scmp.i1.i32(i32 %arg1, i32 %arg2)
+  ret void
+}
+
+define void @both_args_are_vecs_or_neither(<4 x i32> %arg1, i32 %arg2) {
+  ; CHECK: ucmp/scmp argument and result types must both be either vector or scalar types
+  %res3 = call i2 @llvm.scmp.i2.v4i32(<4 x i32> %arg1, <4 x i32> %arg1)
+  ; CHECK: ucmp/scmp argument and result types must both be either vector or scalar types
+  %res4 = call <4 x i32> @llvm.scmp.v4i32.i32(i32 %arg2, i32 %arg2)
+  ret void
+}
+
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.expected
index 02d8870c61365..775649cee0e76 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.expected
@@ -41,7 +41,7 @@ declare void @_Z10sideeffectv()
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -52,7 +52,7 @@ declare void @_Z10sideeffectv()
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.globals.expected
index 05e57774c260e..a8086ae215709 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.globals.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.globals.expected
@@ -44,7 +44,7 @@ declare void @_Z10sideeffectv()
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -55,7 +55,7 @@ declare void @_Z10sideeffectv()
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.expected
index 36bcbe32e0363..57de350dec15e 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.expected
@@ -12,7 +12,7 @@ define void @foo(i32) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -36,7 +36,7 @@ define void @bar(i32) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.globals.expected
index db7c692a7def9..696d5c6edee6f 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.globals.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.globals.expected
@@ -15,7 +15,7 @@ define void @foo(i32) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -39,7 +39,7 @@ define void @bar(i32) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.expected
index 10399951e7fbc..5275870b4088d 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.expected
@@ -42,7 +42,7 @@ declare void @_Z10sideeffectv()
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
@@ -53,7 +53,7 @@ declare void @_Z10sideeffectv()
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.globals.expected
index 0001790e66ab3..712ccb25ae857 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.globals.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.globals.expected
@@ -45,7 +45,7 @@ declare void @_Z10sideeffectv()
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
@@ -56,7 +56,7 @@ declare void @_Z10sideeffectv()
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.expected
index e05a57d06413b..b5b12b770522b 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.expected
@@ -13,7 +13,7 @@ define void @foo(i32) {
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
@@ -37,7 +37,7 @@ define void @bar(i32) {
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.globals.expected
index 17be222f15c12..7e2b991cd6fcc 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.globals.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.globals.expected
@@ -16,7 +16,7 @@ define void @foo(i32) {
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
@@ -40,7 +40,7 @@ define void @bar(i32) {
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/dsymutil/ARM/obfuscated.test b/llvm/test/tools/dsymutil/ARM/obfuscated.test
deleted file mode 100644
index 3443b8e634692..0000000000000
--- a/llvm/test/tools/dsymutil/ARM/obfuscated.test
+++ /dev/null
@@ -1,200 +0,0 @@
-REQUIRES: system-darwin
-
-RUN: dsymutil --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck %s
-
-RUN: dsymutil --accelerator=Pub --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=PUB %s
-
-RUN: dsymutil --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=NOHIDDEN %s
-
-RUN: dsymutil --symbol-map %p/../Inputs/obfuscated.2.map %p/../Inputs/obfuscated.2.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=NOHIDDEN %s
-
-// Run with plist and make sure dsymutil finds it.
-RUN: mkdir -p %t.dSYM/Contents/Resources/DWARF/
-RUN: mkdir -p %t.mapdir
-RUN: cp %p/../Inputs/obfuscated.arm64 %t.dSYM/Contents/Resources/DWARF/
-RUN: cp %p/../Inputs/E828A486-8433-3A5E-B6DB-A6294D28133D.plist %t.dSYM/Contents/Resources/
-RUN: cp %p/../Inputs/obfuscated.map %t.mapdir/506AA50A-6B26-3B37-86D2-DC6EBD57B720.bcsymbolmap
-RUN: dsymutil --symbol-map %t.mapdir %t.dSYM 2>&1 | FileCheck --check-prefix=OBFUSCATING %s
-
-// Run without plist and make sure dsymutil doesn't crash.
-RUN: rm %t.dSYM/Contents/Resources/E828A486-8433-3A5E-B6DB-A6294D28133D.plist
-RUN: dsymutil --symbol-map %t.mapdir %t.dSYM 2>&1 | FileCheck --check-prefix=NOTOBFUSCATING %s
-
-// ----------------------------------------
-// Repeat the same steps for --linker parallel.
-RUN: dsymutil --linker parallel --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck %s
-
-RUN: dsymutil --linker parallel --accelerator=Pub --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=PUB %s
-
-RUN: dsymutil --linker parallel --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=NOHIDDEN %s
-
-RUN: dsymutil --linker parallel --symbol-map %p/../Inputs/obfuscated.2.map %p/../Inputs/obfuscated.2.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=NOHIDDEN %s
-
-// Run with plist and make sure dsymutil finds it.
-RUN: mkdir -p %t.dSYM/Contents/Resources/DWARF/
-RUN: mkdir -p %t.mapdir
-RUN: cp %p/../Inputs/obfuscated.arm64 %t.dSYM/Contents/Resources/DWARF/
-RUN: cp %p/../Inputs/E828A486-8433-3A5E-B6DB-A6294D28133D.plist %t.dSYM/Contents/Resources/
-RUN: cp %p/../Inputs/obfuscated.map %t.mapdir/506AA50A-6B26-3B37-86D2-DC6EBD57B720.bcsymbolmap
-RUN: dsymutil --linker parallel --symbol-map %t.mapdir %t.dSYM 2>&1 | FileCheck --check-prefix=OBFUSCATING %s
-
-// Run without plist and make sure dsymutil doesn't crash.
-RUN: rm %t.dSYM/Contents/Resources/E828A486-8433-3A5E-B6DB-A6294D28133D.plist
-RUN: dsymutil --linker parallel --symbol-map %t.mapdir %t.dSYM 2>&1 | FileCheck --check-prefix=NOTOBFUSCATING %s
-
-OBFUSCATING-NOT: not unobfuscating
-
-NOTOBFUSCATING: not unobfuscating
-
-NOHIDDEN-NOT: __hidden#
-
-CHECK: .debug_info contents:
-
-CHECK: DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "main.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "main")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "one.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "one")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "two.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "two")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "three.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "three")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "four.c")
-CHECK:    DW_AT_stmt_list [DW_FORM_data4]  (0x0000011e)
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "four")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "five.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "five")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "six.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "six")
-
-CHECK: .debug_line contents:
-CHECK: file_names[  1]:
-CHECK:            name: "main.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "one.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "two.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "three.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "four.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "five.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "six.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-
-PUB: .debug_pubnames contents:
-PUB: length = 0x00000017, format = DWARF32, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000044
-PUB: 0x0000002e "main"
-PUB: length = 0x00000016, format = DWARF32, version = 0x0002, unit_offset = 0x00000044, unit_size = 0x00000044
-PUB: 0x0000002e "one"
-PUB: length = 0x00000016, format = DWARF32, version = 0x0002, unit_offset = 0x00000088, unit_size = 0x00000044
-PUB: 0x0000002e "two"
-PUB: length = 0x00000018, format = DWARF32, version = 0x0002, unit_offset = 0x000000cc, unit_size = 0x00000044
-PUB: 0x0000002e "three"
-PUB: length = 0x00000017, format = DWARF32, version = 0x0002, unit_offset = 0x00000110, unit_size = 0x00000044
-PUB: 0x0000002e "four"
-PUB: length = 0x00000017, format = DWARF32, version = 0x0002, unit_offset = 0x00000154, unit_size = 0x00000044
-PUB: 0x0000002e "five"
-PUB: length = 0x00000016, format = DWARF32, version = 0x0002, unit_offset = 0x00000198, unit_size = 0x00000044
-PUB: 0x0000002e "six"
-
-CHECK: .apple_names contents:
-
-CHECK: String: 0x00000091 "five"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x00000182
-CHECK-NEXT: ]
-CHECK: String: 0x0000009c "six"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x000001c6
-CHECK-NEXT: ]
-CHECK: String: 0x00000078 "three"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x000000fa
-CHECK-NEXT: ]
-CHECK: String: 0x0000006c "two"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x000000b6
-CHECK-NEXT: ]
-CHECK: String: 0x00000057 "main"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x0000002e
-CHECK-NEXT: ]
-CHECK: String: 0x00000085 "four"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x0000013e
-CHECK-NEXT: ]
-CHECK: String: 0x00000062 "one"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x00000072
-CHECK-NEXT: ]
diff --git a/llvm/test/tools/dsymutil/Inputs/obfuscated.2.arm64 b/llvm/test/tools/dsymutil/Inputs/obfuscated.2.arm64
deleted file mode 100644
index b40e023eb4916..0000000000000
Binary files a/llvm/test/tools/dsymutil/Inputs/obfuscated.2.arm64 and /dev/null differ
diff --git a/llvm/test/tools/dsymutil/Inputs/obfuscated.2.map b/llvm/test/tools/dsymutil/Inputs/obfuscated.2.map
deleted file mode 100644
index 6efca5960311d..0000000000000
--- a/llvm/test/tools/dsymutil/Inputs/obfuscated.2.map
+++ /dev/null
@@ -1,22 +0,0 @@
-BCSymbolMap Version: 2.0
-_two
-_three
-_four
-_five
-_six
-LLVM version 3.9.0 (ssh://git@stash.sd.apple.com/devtools/clang.git c74ae34bd917b77f9c848bd599dfde2813fb509f)
-main
-main.c
-/Volumes/Data/dev/BitcodeBuildTests/unit
-one
-one.c
-two
-two.c
-three
-three.c
-four
-four.c
-five
-five.c
-six
-six.c
diff --git a/llvm/test/tools/dsymutil/Inputs/obfuscated.arm64 b/llvm/test/tools/dsymutil/Inputs/obfuscated.arm64
deleted file mode 100644
index 8395798be17ff..0000000000000
Binary files a/llvm/test/tools/dsymutil/Inputs/obfuscated.arm64 and /dev/null differ
diff --git a/llvm/test/tools/dsymutil/Inputs/obfuscated.map b/llvm/test/tools/dsymutil/Inputs/obfuscated.map
deleted file mode 100644
index 30fed8bf9b5a5..0000000000000
--- a/llvm/test/tools/dsymutil/Inputs/obfuscated.map
+++ /dev/null
@@ -1,17 +0,0 @@
-one
-two
-three
-four
-five
-six
-.str
-Apple LLVM version 7.0.0 (clang-700.2.38.2)
-main
-main.c
-/Users/steven/dev/alpena/tests/src
-one.c
-two.c
-three.c
-four.c
-five.c
-six.c
diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test
index 36cf3f542695c..814252b6e2306 100644
--- a/llvm/test/tools/dsymutil/cmdline.test
+++ b/llvm/test/tools/dsymutil/cmdline.test
@@ -28,7 +28,6 @@ CHECK: -remarks-output-format <format>
 CHECK: -remarks-prepend-path <path>
 CHECK: -reproducer <mode>
 CHECK: -statistics
-CHECK: -symbol-map
 CHECK: -symtab
 CHECK: {{-S}}
 CHECK: -toolchain
diff --git a/llvm/test/tools/llvm-ar/coff-symtab.test b/llvm/test/tools/llvm-ar/coff-symtab.test
index 4f7270d9e2c6e..4a574723a9bea 100644
--- a/llvm/test/tools/llvm-ar/coff-symtab.test
+++ b/llvm/test/tools/llvm-ar/coff-symtab.test
@@ -16,7 +16,7 @@ RUN: llvm-nm --print-armap out3.a | FileCheck %s
 
 Create an empty archive with no symbol map, add a COFF file to it and check that the output archive is a COFF archive.
 
-RUN: llvm-ar rcS out4.a
+RUN: llvm-ar --format coff rcS out4.a
 RUN: llvm-ar rs out4.a coff-symtab.obj
 RUN: llvm-nm --print-armap out4.a | FileCheck %s
 
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/dw-at-specification.test b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/dw-at-specification.test
index 9c9118ea49f8a..b1b35e4f4fe3e 100644
--- a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/dw-at-specification.test
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/dw-at-specification.test
@@ -10,7 +10,7 @@
 
 ; The above test compiled with clang++ produces both a DW_AT_type and
 ; DW_AT_specification on the definition die for S::Arr, which previously caused
-; an assert in the LVELFReader:
+; an assert in the LVDWARFReader:
 ; $ clang++ -g -c dw-at-specification.cpp -o dw-at-specification.o
 
 ; RUN: llvm-debuginfo-analyzer --attribute=level,format,producer \
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-ignored-DW_FORM_implicit_const.test b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-ignored-DW_FORM_implicit_const.test
index 7ee9f3118d0aa..9df058c559805 100644
--- a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-ignored-DW_FORM_implicit_const.test
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-ignored-DW_FORM_implicit_const.test
@@ -14,7 +14,7 @@
 ;         DW_AT_decl_file DW_FORM_implicit_const  1
 ;         DW_AT_decl_line DW_FORM_data1
 
-; Attributes with DW_FORM_implicit_const being ignored by the ELFReader,
+; Attributes with DW_FORM_implicit_const being ignored by the DWARFReader,
 ; causing {Parameter} and {TypeAlias} to omit line numbers.
 
 ; test.cpp
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-compare-logical-elements.test b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-compare-logical-elements.test
new file mode 100644
index 0000000000000..f52c9c7cc7164
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-compare-logical-elements.test
@@ -0,0 +1,106 @@
+; REQUIRES: webassembly-registered-target
+
+; Test case 1 - General options
+
+; test.cpp
+;  1  using INTPTR = const int *;
+;  2  int foo(INTPTR ParamPtr, unsigned ParamUnsigned, bool ParamBool) {
+;  3    if (ParamBool) {
+;  4      typedef int INTEGER;
+;  5      const INTEGER CONSTANT = 7;
+;  6      return CONSTANT;
+;  7    }
+;  8    return ParamUnsigned;
+;  9  }
+
+; Compare mode - Logical view.
+; The output shows in view form the 'missing (-), added (+)' elements,
+; giving more context by swapping the reference and target object files.
+
+; RUN: llvm-mc -arch=wasm32 -filetype=obj \
+; RUN:         %p/Inputs/test-clang.s -o %t.test-clang.o
+
+; RUN: llvm-debuginfo-analyzer --attribute=level \
+; RUN:                         --compare=types \
+; RUN:                         --report=view \
+; RUN:                         --print=symbols,types \
+; RUN:                         %t.test-clang.o \
+; RUN:                         %p/../DWARF/Inputs/test-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s
+
+; ONE:      Reference: '{{.*}}test-clang.o'
+; ONE-NEXT: Target:    'test-dwarf-gcc.o'
+; ONE-EMPTY:
+; ONE-NEXT: Logical View:
+; ONE-NEXT:  [000]           {File} '{{.*}}test-clang.o'
+; ONE-EMPTY:
+; ONE-NEXT:  [001]             {CompileUnit} 'test.cpp'
+; ONE-NEXT:  [002]     1         {TypeAlias} 'INTPTR' -> '* const int'
+; ONE-NEXT:  [002]     2         {Function} extern not_inlined 'foo' -> 'int'
+; ONE-NEXT:  [003]                 {Block}
+; ONE-NEXT:  [004]     5             {Variable} 'CONSTANT' -> 'const INTEGER'
+; ONE-NEXT: +[004]     4             {TypeAlias} 'INTEGER' -> 'int'
+; ONE-NEXT:  [003]     2           {Parameter} 'ParamBool' -> 'bool'
+; ONE-NEXT:  [003]     2           {Parameter} 'ParamPtr' -> 'INTPTR'
+; ONE-NEXT:  [003]     2           {Parameter} 'ParamUnsigned' -> 'unsigned int'
+; ONE-NEXT: -[003]     4           {TypeAlias} 'INTEGER' -> 'int'
+
+; Compare mode - Logical elements.
+; The output shows in tabular form the 'missing (-), added (+)' elements,
+; giving more context by swapping the reference and target object files.
+
+; RUN: llvm-debuginfo-analyzer --attribute=level \
+; RUN:                         --compare=types \
+; RUN:                         --report=list \
+; RUN:                         --print=symbols,types,summary \
+; RUN:                         %t.test-clang.o \
+; RUN:                         %p/../DWARF/Inputs/test-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=TWO %s
+
+; TWO:      Reference: '{{.*}}test-clang.o'
+; TWO-NEXT: Target:    'test-dwarf-gcc.o'
+; TWO-EMPTY:
+; TWO-NEXT: (1) Missing Types:
+; TWO-NEXT: -[003]     4     {TypeAlias} 'INTEGER' -> 'int'
+; TWO-EMPTY:
+; TWO-NEXT: (1) Added Types:
+; TWO-NEXT: +[004]     4     {TypeAlias} 'INTEGER' -> 'int'
+; TWO-EMPTY:
+; TWO-NEXT: ----------------------------------------
+; TWO-NEXT: Element   Expected    Missing      Added
+; TWO-NEXT: ----------------------------------------
+; TWO-NEXT: Scopes           4          0          0
+; TWO-NEXT: Symbols          0          0          0
+; TWO-NEXT: Types            2          1          1
+; TWO-NEXT: Lines            0          0          0
+; TWO-NEXT: ----------------------------------------
+; TWO-NEXT: Total            6          1          1
+
+; Changing the 'Reference' and 'Target' order:
+
+; RUN: llvm-debuginfo-analyzer --attribute=level \
+; RUN:                         --compare=types \
+; RUN:                         --report=list \
+; RUN:                         --print=symbols,types,summary \
+; RUN:                         %p/../DWARF/Inputs/test-dwarf-gcc.o \
+; RUN:                         %t.test-clang.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=THR %s
+
+; THR:      Reference: 'test-dwarf-gcc.o'
+; THR-NEXT: Target:    '{{.*}}test-clang.o'
+; THR-EMPTY:
+; THR-NEXT: (1) Missing Types:
+; THR-NEXT: -[004]     4     {TypeAlias} 'INTEGER' -> 'int'
+; THR-EMPTY:
+; THR-NEXT: (1) Added Types:
+; THR-NEXT: +[003]     4     {TypeAlias} 'INTEGER' -> 'int'
+; THR-EMPTY:
+; THR-NEXT: ----------------------------------------
+; THR-NEXT: Element   Expected    Missing      Added
+; THR-NEXT: ----------------------------------------
+; THR-NEXT: Scopes           4          0          0
+; THR-NEXT: Symbols          0          0          0
+; THR-NEXT: Types            2          1          1
+; THR-NEXT: Lines            0          0          0
+; THR-NEXT: ----------------------------------------
+; THR-NEXT: Total            6          1          1
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-print-basic-details.test b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-print-basic-details.test
new file mode 100644
index 0000000000000..4927086563330
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-print-basic-details.test
@@ -0,0 +1,120 @@
+; REQUIRES: webassembly-registered-target
+
+; Test case 1 - General options.
+
+; test.cpp
+;  1  using INTPTR = const int *;
+;  2  int foo(INTPTR ParamPtr, unsigned ParamUnsigned, bool ParamBool) {
+;  3    if (ParamBool) {
+;  4      typedef int INTEGER;
+;  5      const INTEGER CONSTANT = 7;
+;  6      return CONSTANT;
+;  7    }
+;  8    return ParamUnsigned;
+;  9  }
+
+; Print basic details.
+; The following command prints basic details for all the logical elements
+; sorted by the debug information internal offset; it includes its lexical
+; level and debug info format.
+
+; RUN: llvm-mc -arch=wasm32 -filetype=obj \
+; RUN:         %p/Inputs/test-clang.s -o %t.test-clang.o
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format \
+; RUN:                         --output-sort=offset \
+; RUN:                         --print=scopes,symbols,types,lines,instructions \
+; RUN:                         %t.test-clang.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format \
+; RUN:                         --output-sort=offset \
+; RUN:                         --print=elements \
+; RUN:                         %t.test-clang.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s
+
+; ONE:      Logical View:
+; ONE-NEXT: [000]           {File} '{{.*}}test-clang.o' -> WASM
+; ONE-EMPTY:
+; ONE-NEXT: [001]             {CompileUnit} 'test.cpp'
+; ONE-NEXT: [002]     2         {Function} extern not_inlined 'foo' -> 'int'
+; ONE-NEXT: [003]     2           {Parameter} 'ParamPtr' -> 'INTPTR'
+; ONE-NEXT: [003]     2           {Parameter} 'ParamUnsigned' -> 'unsigned int'
+; ONE-NEXT: [003]     2           {Parameter} 'ParamBool' -> 'bool'
+; ONE-NEXT: [003]                 {Block}
+; ONE-NEXT: [004]     5             {Variable} 'CONSTANT' -> 'const INTEGER'
+; ONE-NEXT: [004]     5             {Line}
+; ONE-NEXT: [004]                   {Code} 'i32.const	7'
+; ONE-NEXT: [004]                   {Code} 'local.set	10'
+; ONE-NEXT: [004]                   {Code} 'local.get	5'
+; ONE-NEXT: [004]                   {Code} 'local.get	10'
+; ONE-NEXT: [004]                   {Code} 'i32.store	12'
+; ONE-NEXT: [004]     6             {Line}
+; ONE-NEXT: [004]                   {Code} 'i32.const	7'
+; ONE-NEXT: [004]                   {Code} 'local.set	11'
+; ONE-NEXT: [004]                   {Code} 'local.get	5'
+; ONE-NEXT: [004]                   {Code} 'local.get	11'
+; ONE-NEXT: [004]                   {Code} 'i32.store	28'
+; ONE-NEXT: [004]                   {Code} 'br      	1'
+; ONE-NEXT: [004]     -             {Line}
+; ONE-NEXT: [004]                   {Code} 'end'
+; ONE-NEXT: [003]     4           {TypeAlias} 'INTEGER' -> 'int'
+; ONE-NEXT: [003]     2           {Line}
+; ONE-NEXT: [003]                 {Code} 'nop'
+; ONE-NEXT: [003]                 {Code} 'end'
+; ONE-NEXT: [003]                 {Code} 'i64.div_s'
+; ONE-NEXT: [003]                 {Code} 'global.get	0'
+; ONE-NEXT: [003]                 {Code} 'local.set	3'
+; ONE-NEXT: [003]                 {Code} 'i32.const	32'
+; ONE-NEXT: [003]                 {Code} 'local.set	4'
+; ONE-NEXT: [003]                 {Code} 'local.get	3'
+; ONE-NEXT: [003]                 {Code} 'local.get	4'
+; ONE-NEXT: [003]                 {Code} 'i32.sub'
+; ONE-NEXT: [003]                 {Code} 'local.set	5'
+; ONE-NEXT: [003]                 {Code} 'local.get	5'
+; ONE-NEXT: [003]                 {Code} 'local.get	0'
+; ONE-NEXT: [003]                 {Code} 'i32.store	24'
+; ONE-NEXT: [003]                 {Code} 'local.get	5'
+; ONE-NEXT: [003]                 {Code} 'local.get	1'
+; ONE-NEXT: [003]                 {Code} 'i32.store	20'
+; ONE-NEXT: [003]                 {Code} 'local.get	2'
+; ONE-NEXT: [003]                 {Code} 'local.set	6'
+; ONE-NEXT: [003]                 {Code} 'local.get	5'
+; ONE-NEXT: [003]                 {Code} 'local.get	6'
+; ONE-NEXT: [003]                 {Code} 'i32.store8	19'
+; ONE-NEXT: [003]     3           {Line}
+; ONE-NEXT: [003]                 {Code} 'local.get	5'
+; ONE-NEXT: [003]                 {Code} 'i32.load8_u	19'
+; ONE-NEXT: [003]                 {Code} 'local.set	7'
+; ONE-NEXT: [003]     3           {Line}
+; ONE-NEXT: [003]                 {Code} 'i32.const	1'
+; ONE-NEXT: [003]                 {Code} 'local.set	8'
+; ONE-NEXT: [003]                 {Code} 'local.get	7'
+; ONE-NEXT: [003]                 {Code} 'local.get	8'
+; ONE-NEXT: [003]                 {Code} 'i32.and'
+; ONE-NEXT: [003]                 {Code} 'local.set	9'
+; ONE-NEXT: [003]                 {Code} 'block'
+; ONE-NEXT: [003]                 {Code} 'block'
+; ONE-NEXT: [003]                 {Code} 'local.get	9'
+; ONE-NEXT: [003]                 {Code} 'i32.eqz'
+; ONE-NEXT: [003]                 {Code} 'br_if   	0'
+; ONE-NEXT: [003]     8           {Line}
+; ONE-NEXT: [003]                 {Code} 'local.get	5'
+; ONE-NEXT: [003]                 {Code} 'i32.load	20'
+; ONE-NEXT: [003]                 {Code} 'local.set	12'
+; ONE-NEXT: [003]     8           {Line}
+; ONE-NEXT: [003]                 {Code} 'local.get	5'
+; ONE-NEXT: [003]                 {Code} 'local.get	12'
+; ONE-NEXT: [003]                 {Code} 'i32.store	28'
+; ONE-NEXT: [003]     -           {Line}
+; ONE-NEXT: [003]                 {Code} 'end'
+; ONE-NEXT: [003]     9           {Line}
+; ONE-NEXT: [003]                 {Code} 'local.get	5'
+; ONE-NEXT: [003]                 {Code} 'i32.load	28'
+; ONE-NEXT: [003]                 {Code} 'local.set	13'
+; ONE-NEXT: [003]                 {Code} 'local.get	13'
+; ONE-NEXT: [003]                 {Code} 'return'
+; ONE-NEXT: [003]                 {Code} 'end'
+; ONE-NEXT: [003]     9           {Line}
+; ONE-NEXT: [003]                 {Code} 'unreachable'
+; ONE-NEXT: [002]     1         {TypeAlias} 'INTPTR' -> '* const int'
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-select-logical-elements.test b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-select-logical-elements.test
new file mode 100644
index 0000000000000..f50cc2d291a5b
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-select-logical-elements.test
@@ -0,0 +1,76 @@
+; REQUIRES: webassembly-registered-target
+
+; Test case 1 - General options
+
+; test.cpp
+;  1  using INTPTR = const int *;
+;  2  int foo(INTPTR ParamPtr, unsigned ParamUnsigned, bool ParamBool) {
+;  3    if (ParamBool) {
+;  4      typedef int INTEGER;
+;  5      const INTEGER CONSTANT = 7;
+;  6      return CONSTANT;
+;  7    }
+;  8    return ParamUnsigned;
+;  9  }
+
+; Select logical elements.
+; The following prints all 'instructions', 'symbols' and 'types' that
+; contain 'BLOCK' or '.store' in their names or types, using a tab layout
+; and given the number of matches.
+
+; RUN: llvm-mc -arch=wasm32 -filetype=obj \
+; RUN:         %p/Inputs/test-clang.s -o %t.test-clang.o
+
+; RUN: llvm-debuginfo-analyzer --attribute=level \
+; RUN:                         --select-nocase --select-regex \
+; RUN:                         --select=BLOCK --select=.store \
+; RUN:                         --report=list \
+; RUN:                         --print=symbols,types,instructions,summary \
+; RUN:                         %t.test-clang.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s
+
+; ONE:      Logical View:
+; ONE-NEXT: [000]           {File} '{{.*}}test-clang.o'
+; ONE-EMPTY:
+; ONE-NEXT: [001]           {CompileUnit} 'test.cpp'
+; ONE-NEXT: [003]           {Code} 'block'
+; ONE-NEXT: [003]           {Code} 'block'
+; ONE-NEXT: [004]           {Code} 'i32.store	12'
+; ONE-NEXT: [003]           {Code} 'i32.store	20'
+; ONE-NEXT: [003]           {Code} 'i32.store	24'
+; ONE-NEXT: [004]           {Code} 'i32.store	28'
+; ONE-NEXT: [003]           {Code} 'i32.store	28'
+; ONE-NEXT: [003]           {Code} 'i32.store8	19'
+; ONE-EMPTY:
+; ONE-NEXT: -----------------------------
+; ONE-NEXT: Element      Total    Printed
+; ONE-NEXT: -----------------------------
+; ONE-NEXT: Scopes           3          0
+; ONE-NEXT: Symbols          4          0
+; ONE-NEXT: Types            2          0
+; ONE-NEXT: Lines           62          8
+; ONE-NEXT: -----------------------------
+; ONE-NEXT: Total           71          8
+
+; RUN: llvm-debuginfo-analyzer --attribute=level \
+; RUN:                         --select-regex --select-nocase \
+; RUN:                         --select=INTe \
+; RUN:                         --report=list \
+; RUN:                         --print=symbols,types \
+; RUN:                         %t.test-clang.o \
+; RUN:                         %p/../DWARF/Inputs/test-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=TWO %s
+
+; TWO:      Logical View:
+; TWO-NEXT: [000]           {File} '{{.*}}test-clang.o'
+; TWO-EMPTY:
+; TWO-NEXT: [001]           {CompileUnit} 'test.cpp'
+; TWO-NEXT: [003]     4     {TypeAlias} 'INTEGER' -> 'int'
+; TWO-NEXT: [004]     5     {Variable} 'CONSTANT' -> 'const INTEGER'
+; TWO-EMPTY:
+; TWO-NEXT: Logical View:
+; TWO-NEXT: [000]           {File} 'test-dwarf-gcc.o'
+; TWO-EMPTY:
+; TWO-NEXT: [001]           {CompileUnit} 'test.cpp'
+; TWO-NEXT: [004]     4     {TypeAlias} 'INTEGER' -> 'int'
+; TWO-NEXT: [004]     5     {Variable} 'CONSTANT' -> 'const INTEGER'
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/02-wasm-logical-lines.test b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/02-wasm-logical-lines.test
new file mode 100644
index 0000000000000..101f6abdc96b6
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/02-wasm-logical-lines.test
@@ -0,0 +1,74 @@
+; REQUIRES: webassembly-registered-target
+
+; Test case 2 - Assembler instructions.
+
+; hello-world.cpp
+;  1  extern int printf(const char * format, ... );
+;  2
+;  3  int main()
+;  4  {
+;  5    printf("Hello, World\n");
+;  6    return 0;
+;  7  }
+
+; Logical lines.
+; The logical views shows the intermixed lines and assembler instructions,
+; allowing to compare the code generated by the different toolchains.
+
+; RUN: llvm-mc -arch=wasm32 -filetype=obj \
+; RUN:         %p/Inputs/hello-world-clang.s -o %t.hello-world-clang.o
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format,producer \
+; RUN:                         --print=lines,instructions \
+; RUN:                         %t.hello-world-clang.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s
+
+; ONE:      Logical View:
+; ONE-NEXT: [000]           {File} '{{.*}}hello-world-clang.o' -> WASM
+; ONE-EMPTY:
+; ONE-NEXT: [001]             {CompileUnit} 'hello-world.cpp'
+; ONE-NEXT: [002]               {Producer} 'clang version 19{{.*}}'
+; ONE-NEXT: [002]     3         {Function} extern not_inlined 'main' -> 'int'
+; ONE-NEXT: [003]     4           {Line}
+; ONE-NEXT: [003]                 {Code} 'nop'
+; ONE-NEXT: [003]                 {Code} 'rethrow 	127'
+; ONE-NEXT: [003]                 {Code} 'global.get	0'
+; ONE-NEXT: [003]                 {Code} 'local.set	0'
+; ONE-NEXT: [003]                 {Code} 'i32.const	16'
+; ONE-NEXT: [003]                 {Code} 'local.set	1'
+; ONE-NEXT: [003]                 {Code} 'local.get	0'
+; ONE-NEXT: [003]                 {Code} 'local.get	1'
+; ONE-NEXT: [003]                 {Code} 'i32.sub'
+; ONE-NEXT: [003]                 {Code} 'local.set	2'
+; ONE-NEXT: [003]                 {Code} 'local.get	2'
+; ONE-NEXT: [003]                 {Code} 'global.set	0'
+; ONE-NEXT: [003]                 {Code} 'i32.const	0'
+; ONE-NEXT: [003]                 {Code} 'local.set	3'
+; ONE-NEXT: [003]                 {Code} 'local.get	2'
+; ONE-NEXT: [003]                 {Code} 'local.get	3'
+; ONE-NEXT: [003]                 {Code} 'i32.store	12'
+; ONE-NEXT: [003]     5           {Line}
+; ONE-NEXT: [003]                 {Code} 'i32.const	0'
+; ONE-NEXT: [003]                 {Code} 'local.set	4'
+; ONE-NEXT: [003]                 {Code} 'i32.const	0'
+; ONE-NEXT: [003]                 {Code} 'local.set	5'
+; ONE-NEXT: [003]                 {Code} 'local.get	4'
+; ONE-NEXT: [003]                 {Code} 'local.get	5'
+; ONE-NEXT: [003]                 {Code} 'call	0'
+; ONE-NEXT: [003]                 {Code} 'drop'
+; ONE-NEXT: [003]     6           {Line}
+; ONE-NEXT: [003]                 {Code} 'i32.const	0'
+; ONE-NEXT: [003]                 {Code} 'local.set	6'
+; ONE-NEXT: [003]                 {Code} 'i32.const	16'
+; ONE-NEXT: [003]                 {Code} 'local.set	7'
+; ONE-NEXT: [003]                 {Code} 'local.get	2'
+; ONE-NEXT: [003]                 {Code} 'local.get	7'
+; ONE-NEXT: [003]                 {Code} 'i32.add'
+; ONE-NEXT: [003]                 {Code} 'local.set	8'
+; ONE-NEXT: [003]                 {Code} 'local.get	8'
+; ONE-NEXT: [003]                 {Code} 'global.set	0'
+; ONE-NEXT: [003]                 {Code} 'local.get	6'
+; ONE-NEXT: [003]                 {Code} 'return'
+; ONE-NEXT: [003]                 {Code} 'end'
+; ONE-NEXT: [003]     6           {Line}
+; ONE-NEXT: [003]                 {Code} 'return'
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/03-wasm-incorrect-lexical-scope-typedef.test b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/03-wasm-incorrect-lexical-scope-typedef.test
new file mode 100644
index 0000000000000..eb05ecae8ccf6
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/03-wasm-incorrect-lexical-scope-typedef.test
@@ -0,0 +1,135 @@
+; REQUIRES: webassembly-registered-target
+
+; Test case 3 - Incorrect lexical scope for typedef.
+
+; pr-44884.cpp
+;  1  int bar(float Input) { return (int)Input; }
+;  2
+;  3  unsigned foo(char Param) {
+;  4    typedef int INT;                // ** Definition for INT **
+;  5    INT Value = Param;
+;  6    {
+;  7      typedef float FLOAT;          // ** Definition for FLOAT **
+;  8      {
+;  9        FLOAT Added = Value + Param;
+; 10        Value = bar(Added);
+; 11      }
+; 12    }
+; 13    return Value + Param;
+; 14  }
+
+; The lines 4 and 7 contains 2 typedefs, defined at different lexical
+; scopes.
+
+; The above test is used to illustrates a scope issue found in the
+; Clang compiler.
+; PR44884: https://bugs.llvm.org/show_bug.cgi?id=44884
+; PR44229: https://github.com/llvm/llvm-project/issues/44229
+
+; In the following logical views, we can see that the Clang compiler
+; emits both typedefs at the same lexical scope (3), which is wrong.
+; GCC emit correct lexical scope for both typedefs.
+
+; RUN: llvm-mc -arch=wasm32 -filetype=obj \
+; RUN:         %p/Inputs/pr-44884-clang.s -o %t.pr-44884-clang.o
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format,producer \
+; RUN:                         --output-sort=kind \
+; RUN:                         --print=symbols,types,lines \
+; RUN:                         %t.pr-44884-clang.o \
+; RUN:                         %p/../DWARF/Inputs/pr-44884-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s
+
+; ONE:      Logical View:
+; ONE-NEXT: [000]           {File} '{{.*}}pr-44884-clang.o' -> WASM
+; ONE-EMPTY:
+; ONE-NEXT: [001]             {CompileUnit} 'pr-44884.cpp'
+; ONE-NEXT: [002]               {Producer} 'clang version 19{{.*}}'
+; ONE-NEXT: [002]     1         {Function} extern not_inlined 'bar' -> 'int'
+; ONE-NEXT: [003]     1           {Parameter} 'Input' -> 'float'
+; ONE-NEXT: [003]     1           {Line}
+; ONE-NEXT: [003]     1           {Line}
+; ONE-NEXT: [003]     -           {Line}
+; ONE-NEXT: [003]     1           {Line}
+; ONE-NEXT: [003]     -           {Line}
+; ONE-NEXT: [003]     1           {Line}
+; ONE-NEXT: [003]     1           {Line}
+; ONE-NEXT: [003]     1           {Line}
+; ONE-NEXT: [002]     3         {Function} extern not_inlined 'foo' -> 'unsigned int'
+; ONE-NEXT: [003]                 {Block}
+; ONE-NEXT: [004]     9             {Variable} 'Added' -> 'FLOAT'
+; ONE-NEXT: [004]     9             {Line}
+; ONE-NEXT: [004]     9             {Line}
+; ONE-NEXT: [004]     9             {Line}
+; ONE-NEXT: [004]     9             {Line}
+; ONE-NEXT: [004]     9             {Line}
+; ONE-NEXT: [004]    10             {Line}
+; ONE-NEXT: [004]    10             {Line}
+; ONE-NEXT: [004]    10             {Line}
+; ONE-NEXT: [004]    13             {Line}
+; ONE-NEXT: [003]     3           {Parameter} 'Param' -> 'char'
+; ONE-NEXT: [003]     7           {TypeAlias} 'FLOAT' -> 'float'
+; ONE-NEXT: [003]     4           {TypeAlias} 'INT' -> 'int'
+; ONE-NEXT: [003]     5           {Variable} 'Value' -> 'INT'
+; ONE-NEXT: [003]     3           {Line}
+; ONE-NEXT: [003]     5           {Line}
+; ONE-NEXT: [003]     5           {Line}
+; ONE-NEXT: [003]    13           {Line}
+; ONE-NEXT: [003]    13           {Line}
+; ONE-NEXT: [003]    13           {Line}
+; ONE-NEXT: [003]    13           {Line}
+; ONE-EMPTY:
+; ONE-NEXT: Logical View:
+; ONE-NEXT: [000]           {File} 'pr-44884-dwarf-gcc.o' -> elf64-x86-64
+; ONE-EMPTY:
+; ONE-NEXT: [001]             {CompileUnit} 'pr-44884.cpp'
+; ONE-NEXT: [002]               {Producer} 'GNU C++14 10.3.0 {{.*}}'
+; ONE-NEXT: [002]     1         {Function} extern not_inlined 'bar' -> 'int'
+; ONE-NEXT: [003]     1           {Parameter} 'Input' -> 'float'
+; ONE-NEXT: [003]     1           {Line}
+; ONE-NEXT: [003]     1           {Line}
+; ONE-NEXT: [003]     1           {Line}
+; ONE-NEXT: [002]     3         {Function} extern not_inlined 'foo' -> 'unsigned int'
+; ONE-NEXT: [003]                 {Block}
+; ONE-NEXT: [004]                   {Block}
+; ONE-NEXT: [005]     9               {Variable} 'Added' -> 'FLOAT'
+; ONE-NEXT: [005]     9               {Line}
+; ONE-NEXT: [005]     9               {Line}
+; ONE-NEXT: [005]     9               {Line}
+; ONE-NEXT: [005]    10               {Line}
+; ONE-NEXT: [005]    13               {Line}
+; ONE-NEXT: [004]     7             {TypeAlias} 'FLOAT' -> 'float'
+; ONE-NEXT: [003]     3           {Parameter} 'Param' -> 'char'
+; ONE-NEXT: [003]     4           {TypeAlias} 'INT' -> 'int'
+; ONE-NEXT: [003]     5           {Variable} 'Value' -> 'INT'
+; ONE-NEXT: [003]     3           {Line}
+; ONE-NEXT: [003]     5           {Line}
+; ONE-NEXT: [003]    13           {Line}
+; ONE-NEXT: [003]    14           {Line}
+; ONE-NEXT: [003]    14           {Line}
+
+; Using the selection facilities, we can produce a simple tabular
+; output showing just the logical types that are 'Typedef'.
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format \
+; RUN:                         --output-sort=name \
+; RUN:                         --select-types=Typedef \
+; RUN:                         --report=list \
+; RUN:                         --print=types \
+; RUN:                         %t.pr-44884-clang.o \
+; RUN:                         %p/../DWARF/Inputs/pr-44884-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=TWO %s
+
+; TWO:      Logical View:
+; TWO-NEXT: [000]           {File} '{{.*}}pr-44884-clang.o' -> WASM
+; TWO-EMPTY:
+; TWO-NEXT: [001]           {CompileUnit} 'pr-44884.cpp'
+; TWO-NEXT: [003]     7     {TypeAlias} 'FLOAT' -> 'float'
+; TWO-NEXT: [003]     4     {TypeAlias} 'INT' -> 'int'
+; TWO-EMPTY:
+; TWO-NEXT: Logical View:
+; TWO-NEXT: [000]           {File} 'pr-44884-dwarf-gcc.o' -> elf64-x86-64
+; TWO-EMPTY:
+; TWO-NEXT: [001]           {CompileUnit} 'pr-44884.cpp'
+; TWO-NEXT: [004]     7     {TypeAlias} 'FLOAT' -> 'float'
+; TWO-NEXT: [003]     4     {TypeAlias} 'INT' -> 'int'
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/04-wasm-missing-nested-enumerators.test b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/04-wasm-missing-nested-enumerators.test
new file mode 100644
index 0000000000000..cafa51c41966c
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/04-wasm-missing-nested-enumerators.test
@@ -0,0 +1,130 @@
+; REQUIRES: webassembly-registered-target
+
+; Test case 4 - Missing nested enumerations.
+
+; pr-46466.cpp
+;   1  struct Struct {
+;   2    union Union {
+;   3      enum NestedEnum { RED, BLUE };
+;   4    };
+;   5    Union U;
+;   6  };
+;   7
+;   8  Struct S;
+;   9  int test() {
+;  10    return S.U.BLUE;
+;  11  }
+
+; The above test is used to illustrate a scope issue found in the Clang
+; compiler.
+; PR46466: https://bugs.llvm.org/show_bug.cgi?id=46466
+; PR45811: https://github.com/llvm/llvm-project/issues/45811
+
+; In the following logical views, we can see that the DWARF debug
+; information generated by the Clang compiler does not include any
+; references to the enumerators 'RED' and 'BLUE'. The DWARF generated
+; by GCC, does include such references.
+
+; RUN: llvm-mc -arch=wasm32 -filetype=obj \
+; RUN:         %p/Inputs/pr-46466-clang.s -o %t.pr-46466-clang.o
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format,producer \
+; RUN:                         --output-sort=name \
+; RUN:                         --print=symbols,types \
+; RUN:                         %t.pr-46466-clang.o \
+; RUN:                         %p/../DWARF/Inputs/pr-46466-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s
+
+; ONE:      Logical View:
+; ONE-NEXT: [000]           {File} '{{.*}}pr-46466-clang.o' -> WASM
+; ONE-EMPTY:
+; ONE-NEXT: [001]             {CompileUnit} 'pr-46466.cpp'
+; ONE-NEXT: [002]               {Producer} 'clang version 19{{.*}}'
+; ONE-NEXT: [002]     8         {Variable} extern 'S' -> 'Struct'
+; ONE-NEXT: [002]     1         {Struct} 'Struct'
+; ONE-NEXT: [003]     5           {Member} public 'U' -> 'Union'
+; ONE-EMPTY:
+; ONE-NEXT: Logical View:
+; ONE-NEXT: [000]           {File} 'pr-46466-dwarf-gcc.o' -> elf64-x86-64
+; ONE-EMPTY:
+; ONE-NEXT: [001]             {CompileUnit} 'pr-46466.cpp'
+; ONE-NEXT: [002]               {Producer} 'GNU C++14 10.3.0 {{.*}}'
+; ONE-NEXT: [002]     8         {Variable} extern 'S' -> 'Struct'
+; ONE-NEXT: [002]     1         {Struct} 'Struct'
+; ONE-NEXT: [003]     5           {Member} public 'U' -> 'Union'
+; ONE-NEXT: [003]     2           {Union} 'Union'
+; ONE-NEXT: [004]     3             {Enumeration} 'NestedEnum' -> 'unsigned int'
+; ONE-NEXT: [005]                     {Enumerator} 'BLUE' = '0x1'
+; ONE-NEXT: [005]                     {Enumerator} 'RED' = '0x0'
+
+; Using the selection facilities, we can produce a logical view
+; showing just the logical types that are 'Enumerator' and its
+; parents. The logical view is sorted by the types name.
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format \
+; RUN:                         --output-sort=name \
+; RUN:                         --select-types=Enumerator \
+; RUN:                         --report=parents \
+; RUN:                         --print=types \
+; RUN:                         %t.pr-46466-clang.o \
+; RUN:                         %p/../DWARF/Inputs/pr-46466-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=TWO %s
+
+; TWO:      Logical View:
+; TWO-NEXT: [000]           {File} '{{.*}}pr-46466-clang.o' -> WASM
+; TWO-EMPTY:
+; TWO-NEXT: [001]             {CompileUnit} 'pr-46466.cpp'
+; TWO-EMPTY:
+; TWO-NEXT: Logical View:
+; TWO-NEXT: [000]           {File} 'pr-46466-dwarf-gcc.o' -> elf64-x86-64
+; TWO-EMPTY:
+; TWO-NEXT: [001]             {CompileUnit} 'pr-46466.cpp'
+; TWO-NEXT: [002]     1         {Struct} 'Struct'
+; TWO-NEXT: [003]     2           {Union} 'Union'
+; TWO-NEXT: [004]     3             {Enumeration} 'NestedEnum' -> 'unsigned int'
+; TWO-NEXT: [005]                     {Enumerator} 'BLUE' = '0x1'
+; TWO-NEXT: [005]                     {Enumerator} 'RED' = '0x0'
+
+; Using the selection facilities, we can produce a simple tabular output
+; including a summary for the logical types that are 'Enumerator'. The
+; logical view is sorted by the types name.
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format \
+; RUN:                         --output-sort=name \
+; RUN:                         --select-types=Enumerator \
+; RUN:                         --print=types,summary \
+; RUN:                         %t.pr-46466-clang.o \
+; RUN:                         %p/../DWARF/Inputs/pr-46466-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=THR %s
+
+; THR:      Logical View:
+; THR-NEXT: [000]           {File} '{{.*}}pr-46466-clang.o' -> WASM
+; THR-EMPTY:
+; THR-NEXT: [001]           {CompileUnit} 'pr-46466.cpp'
+; THR-EMPTY:
+; THR-NEXT: -----------------------------
+; THR-NEXT: Element      Total    Printed
+; THR-NEXT: -----------------------------
+; THR-NEXT: Scopes           4          0
+; THR-NEXT: Symbols          0          0
+; THR-NEXT: Types            0          0
+; THR-NEXT: Lines            0          0
+; THR-NEXT: -----------------------------
+; THR-NEXT: Total            4          0
+; THR-EMPTY:
+; THR-NEXT: Logical View:
+; THR-NEXT: [000]           {File} 'pr-46466-dwarf-gcc.o' -> elf64-x86-64
+; THR-EMPTY:
+; THR-NEXT: [001]           {CompileUnit} 'pr-46466.cpp'
+; THR-NEXT: [005]           {Enumerator} 'BLUE' = '0x1'
+; THR-NEXT: [005]           {Enumerator} 'RED' = '0x0'
+; THR-EMPTY:
+; THR-NEXT: -----------------------------
+; THR-NEXT: Element      Total    Printed
+; THR-NEXT: -----------------------------
+; THR-NEXT: Scopes           5          0
+; THR-NEXT: Symbols          0          0
+; THR-NEXT: Types            2          2
+; THR-NEXT: Lines            0          0
+; THR-NEXT: -----------------------------
+; THR-NEXT: Total            7          2
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/05-wasm-incorrect-lexical-scope-variable.test b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/05-wasm-incorrect-lexical-scope-variable.test
new file mode 100644
index 0000000000000..4348161f8c5ba
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/05-wasm-incorrect-lexical-scope-variable.test
@@ -0,0 +1,114 @@
+; REQUIRES: webassembly-registered-target
+
+; Test case 5 - Incorrect lexical scope variable.
+
+; pr-43860.cpp
+;  1  #include "definitions.h"
+;  2  forceinline int InlineFunction(int Param) {
+;  3    int Var_1 = Param;
+;  4    {
+;  5      int Var_2 = Param + Var_1;
+;  6      Var_1 = Var_2;
+;  7    }
+;  8    return Var_1;
+;  9  }
+; 10
+; 11  int test(int Param_1, int Param_2) {
+; 12    int A = Param_1;
+; 13    A += InlineFunction(Param_2);
+; 14    return A;
+; 15  }
+
+; The above test is used to illustrate a variable issue found in the
+; Clang compiler.
+; PR43860: https://bugs.llvm.org/show_bug.cgi?id=43860
+; PR43205: https://github.com/llvm/llvm-project/issues/43205
+
+; In the following logical views, we can see that the DWARF debug
+; information generated by the Clang compiler shows the variables
+; 'Var_1' and 'Var_2' are at the same lexical scope (4) in the function
+; 'InlineFuction'.
+; The DWARF generated by GCC/Clang show those variables at the correct
+; lexical scope: '3' and '4' respectively.
+
+; RUN: llvm-mc -arch=wasm32 -filetype=obj \
+; RUN:         %p/Inputs/pr-43860-clang.s -o %t.pr-43860-clang.o
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format,producer \
+; RUN:                         --output-sort=name \
+; RUN:                         --print=symbols \
+; RUN:                         %t.pr-43860-clang.o \
+; RUN:                         %p/../DWARF/Inputs/pr-43860-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s
+
+; ONE:      Logical View:
+; ONE-NEXT: [000]           {File} '{{.*}}pr-43860-clang.o' -> WASM
+; ONE-EMPTY:
+; ONE-NEXT: [001]             {CompileUnit} 'pr-43860.cpp'
+; ONE-NEXT: [002]               {Producer} 'clang version 19{{.*}}'
+; ONE-NEXT: [002]     2         {Function} extern inlined 'InlineFunction' -> 'int'
+; ONE-NEXT: [003]                 {Block}
+; ONE-NEXT: [004]     5             {Variable} 'Var_2' -> 'int'
+; ONE-NEXT: [003]     2           {Parameter} 'Param' -> 'int'
+; ONE-NEXT: [003]     3           {Variable} 'Var_1' -> 'int'
+; ONE-NEXT: [002]    11         {Function} extern not_inlined 'test' -> 'int'
+; ONE-NEXT: [003]    12           {Variable} 'A' -> 'int'
+; ONE-NEXT: [003]    13           {InlinedFunction} inlined 'InlineFunction' -> 'int'
+; ONE-NEXT: [004]                   {Block}
+; ONE-NEXT: [005]                     {Variable} 'Var_2' -> 'int'
+; ONE-NEXT: [004]                   {Parameter} 'Param' -> 'int'
+; ONE-NEXT: [004]                   {Variable} 'Var_1' -> 'int'
+; ONE-NEXT: [003]    11           {Parameter} 'Param_1' -> 'int'
+; ONE-NEXT: [003]    11           {Parameter} 'Param_2' -> 'int'
+; ONE-EMPTY:
+; ONE-NEXT: Logical View:
+; ONE-NEXT: [000]           {File} 'pr-43860-dwarf-gcc.o' -> elf64-x86-64
+; ONE-EMPTY:
+; ONE-NEXT: [001]             {CompileUnit} 'pr-43860.cpp'
+; ONE-NEXT: [002]               {Producer} 'GNU C++14 10.3.0 {{.*}}'
+; ONE-NEXT: [002]     2         {Function} extern declared_inlined 'InlineFunction' -> 'int'
+; ONE-NEXT: [003]                 {Block}
+; ONE-NEXT: [004]     5             {Variable} 'Var_2' -> 'int'
+; ONE-NEXT: [003]     2           {Parameter} 'Param' -> 'int'
+; ONE-NEXT: [003]     3           {Variable} 'Var_1' -> 'int'
+; ONE-NEXT: [002]    11         {Function} extern not_inlined 'test' -> 'int'
+; ONE-NEXT: [003]    12           {Variable} 'A' -> 'int'
+; ONE-NEXT: [003]    13           {InlinedFunction} declared_inlined 'InlineFunction' -> 'int'
+; ONE-NEXT: [004]                   {Block}
+; ONE-NEXT: [005]                     {Variable} 'Var_2' -> 'int'
+; ONE-NEXT: [004]                   {Parameter} 'Param' -> 'int'
+; ONE-NEXT: [004]                   {Variable} 'Var_1' -> 'int'
+; ONE-NEXT: [003]    11           {Parameter} 'Param_1' -> 'int'
+; ONE-NEXT: [003]    11           {Parameter} 'Param_2' -> 'int'
+
+; Using the selection facilities, we can produce a simple tabular output
+; showing just the logical elements that have in their name the 'var'
+; pattern. The logical view is sorted by the variables name.
+
+; RUN: llvm-debuginfo-analyzer --attribute=level,format \
+; RUN:                         --output-sort=name \
+; RUN:                         --select-regex --select-nocase \
+; RUN:                         --select=Var \
+; RUN:                         --report=list \
+; RUN:                         --print=symbols \
+; RUN:                         %t.pr-43860-clang.o \
+; RUN:                         %p/../DWARF/Inputs/pr-43860-dwarf-gcc.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=TWO %s
+
+; TWO:      Logical View:
+; TWO-NEXT: [000]           {File} '{{.*}}pr-43860-clang.o' -> WASM
+; TWO-EMPTY:
+; TWO-NEXT: [001]           {CompileUnit} 'pr-43860.cpp'
+; TWO-NEXT: [004]           {Variable} 'Var_1' -> 'int'
+; TWO-NEXT: [003]     3     {Variable} 'Var_1' -> 'int'
+; TWO-NEXT: [005]           {Variable} 'Var_2' -> 'int'
+; TWO-NEXT: [004]     5     {Variable} 'Var_2' -> 'int'
+; TWO-EMPTY:
+; TWO-NEXT: Logical View:
+; TWO-NEXT: [000]           {File} 'pr-43860-dwarf-gcc.o' -> elf64-x86-64
+; TWO-EMPTY:
+; TWO-NEXT: [001]           {CompileUnit} 'pr-43860.cpp'
+; TWO-NEXT: [004]           {Variable} 'Var_1' -> 'int'
+; TWO-NEXT: [003]     3     {Variable} 'Var_1' -> 'int'
+; TWO-NEXT: [005]           {Variable} 'Var_2' -> 'int'
+; TWO-NEXT: [004]     5     {Variable} 'Var_2' -> 'int'
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/06-wasm-full-logical-view.test b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/06-wasm-full-logical-view.test
new file mode 100644
index 0000000000000..81b78bac03215
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/06-wasm-full-logical-view.test
@@ -0,0 +1,158 @@
+; REQUIRES: webassembly-registered-target
+
+; Test case 6 - Full logical view
+
+; test.cpp
+;  1  using INTPTR = const int *;
+;  2  int foo(INTPTR ParamPtr, unsigned ParamUnsigned, bool ParamBool) {
+;  3    if (ParamBool) {
+;  4      typedef int INTEGER;
+;  5      const INTEGER CONSTANT = 7;
+;  6      return CONSTANT;
+;  7    }
+;  8    return ParamUnsigned;
+;  9  }
+
+; Print low level details.
+; The following command prints low level information that includes
+; offsets within the debug information section, debug location
+; operands, linkage names, etc.
+
+; RUN: llvm-mc -arch=wasm32 -filetype=obj \
+; RUN:         %p/Inputs/test-clang.s -o %t.test-clang.o
+
+; RUN: llvm-debuginfo-analyzer --attribute=all \
+; RUN:                         --print=all \
+; RUN:                         %t.test-clang.o 2>&1 | \
+; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s
+
+; ONE:      Logical View:
+; ONE-NEXT: [0x0000000000][000]            {File} '{{.*}}test-clang.o' -> WASM
+; ONE-EMPTY:
+; ONE-NEXT: [0x000000000b][001]              {CompileUnit} 'test.cpp'
+; ONE-NEXT: [0x000000000b][002]                {Producer} 'clang version 19{{.*}}'
+; ONE-NEXT:                                    {Directory} '{{.*}}/general'
+; ONE-NEXT:                                    {File} 'test.cpp'
+; ONE-NEXT:                                    {Public} 'foo' [0x0000000002:0x000000007f]
+; ONE-NEXT: [0x000000000b][002]                {Range} Lines 2:9 [0x0000000002:0x000000007f]
+; ONE-NEXT: [0x00000000b3][002]                {BaseType} 'bool'
+; ONE-NEXT: [0x0000000090][002]                {BaseType} 'int'
+; ONE-NEXT: [0x00000000ac][002]                {BaseType} 'unsigned int'
+; ONE-EMPTY:
+; ONE-NEXT: [0x0000000097][002]   {Source} '{{.*}}general/test.cpp'
+; ONE-NEXT: [0x0000000097][002]      1         {TypeAlias} 'INTPTR' -> [0x00000000a2]'* const int'
+; ONE-NEXT: [0x0000000026][002]      2         {Function} extern not_inlined 'foo' -> [0x0000000090]'int'
+; ONE-NEXT: [0x0000000026][003]                  {Range} Lines 2:9 [0x0000000002:0x000000007f]
+; ONE-NEXT: [0x0000000026][003]                  {Linkage}  0x3 '_Z3fooPKijb'
+; ONE-NEXT: [0x000000006c][003]                  {Block}
+; ONE-NEXT: [0x000000006c][004]                    {Range} Lines 5:0 [0x000000004c:0x0000000064]
+; ONE-NEXT: [0x0000000075][004]      5             {Variable} 'CONSTANT' -> [0x00000000ba]'const INTEGER'
+; ONE-NEXT: [0x0000000075][005]                      {Coverage} 100.00%
+; ONE-NEXT: [0x0000000076][005]                      {Location}
+; ONE-NEXT: [0x0000000076][006]                        {Entry} fbreg 12
+; ONE-NEXT: [0x000000004c][004]      5             {Line} {NewStatement} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x000000004c][004]                    {Code} 'i32.const	7'
+; ONE-NEXT: [0x000000004e][004]                    {Code} 'local.set	10'
+; ONE-NEXT: [0x0000000050][004]                    {Code} 'local.get	5'
+; ONE-NEXT: [0x0000000052][004]                    {Code} 'local.get	10'
+; ONE-NEXT: [0x0000000054][004]                    {Code} 'i32.store	12'
+; ONE-NEXT: [0x0000000057][004]      6             {Line} {NewStatement} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x0000000057][004]                    {Code} 'i32.const	7'
+; ONE-NEXT: [0x0000000059][004]                    {Code} 'local.set	11'
+; ONE-NEXT: [0x000000005b][004]                    {Code} 'local.get	5'
+; ONE-NEXT: [0x000000005d][004]                    {Code} 'local.get	11'
+; ONE-NEXT: [0x000000005f][004]                    {Code} 'i32.store	28'
+; ONE-NEXT: [0x0000000062][004]                    {Code} 'br      	1'
+; ONE-NEXT: [0x0000000064][004]      0             {Line} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x0000000064][004]                    {Code} 'end'
+; ONE-NEXT: [0x000000005e][003]      2           {Parameter} 'ParamBool' -> [0x00000000b3]'bool'
+; ONE-NEXT: [0x000000005e][004]                    {Coverage} 100.00%
+; ONE-NEXT: [0x000000005f][004]                    {Location}
+; ONE-NEXT: [0x000000005f][005]                      {Entry} fbreg 19
+; ONE-NEXT: [0x0000000042][003]      2           {Parameter} 'ParamPtr' -> [0x0000000097]'INTPTR'
+; ONE-NEXT: [0x0000000042][004]                    {Coverage} 100.00%
+; ONE-NEXT: [0x0000000043][004]                    {Location}
+; ONE-NEXT: [0x0000000043][005]                      {Entry} fbreg 24
+; ONE-NEXT: [0x0000000050][003]      2           {Parameter} 'ParamUnsigned' -> [0x00000000ac]'unsigned int'
+; ONE-NEXT: [0x0000000050][004]                    {Coverage} 100.00%
+; ONE-NEXT: [0x0000000051][004]                    {Location}
+; ONE-NEXT: [0x0000000051][005]                      {Entry} fbreg 20
+; ONE-NEXT: [0x0000000084][003]      4           {TypeAlias} 'INTEGER' -> [0x0000000090]'int'
+; ONE-NEXT: [0x0000000002][003]      2           {Line} {NewStatement} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x0000000002][003]                  {Code} 'nop'
+; ONE-NEXT: [0x0000000003][003]                  {Code} 'end'
+; ONE-NEXT: [0x0000000004][003]                  {Code} 'i64.div_s'
+; ONE-NEXT: [0x0000000005][003]                  {Code} 'global.get	0'
+; ONE-NEXT: [0x000000000b][003]                  {Code} 'local.set	3'
+; ONE-NEXT: [0x000000000d][003]                  {Code} 'i32.const	32'
+; ONE-NEXT: [0x000000000f][003]                  {Code} 'local.set	4'
+; ONE-NEXT: [0x0000000011][003]                  {Code} 'local.get	3'
+; ONE-NEXT: [0x0000000013][003]                  {Code} 'local.get	4'
+; ONE-NEXT: [0x0000000015][003]                  {Code} 'i32.sub'
+; ONE-NEXT: [0x0000000016][003]                  {Code} 'local.set	5'
+; ONE-NEXT: [0x0000000018][003]                  {Code} 'local.get	5'
+; ONE-NEXT: [0x000000001a][003]                  {Code} 'local.get	0'
+; ONE-NEXT: [0x000000001c][003]                  {Code} 'i32.store	24'
+; ONE-NEXT: [0x000000001f][003]                  {Code} 'local.get	5'
+; ONE-NEXT: [0x0000000021][003]                  {Code} 'local.get	1'
+; ONE-NEXT: [0x0000000023][003]                  {Code} 'i32.store	20'
+; ONE-NEXT: [0x0000000026][003]                  {Code} 'local.get	2'
+; ONE-NEXT: [0x0000000028][003]                  {Code} 'local.set	6'
+; ONE-NEXT: [0x000000002a][003]                  {Code} 'local.get	5'
+; ONE-NEXT: [0x000000002c][003]                  {Code} 'local.get	6'
+; ONE-NEXT: [0x000000002e][003]                  {Code} 'i32.store8	19'
+; ONE-NEXT: [0x0000000031][003]      3           {Line} {NewStatement} {PrologueEnd} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x0000000031][003]                  {Code} 'local.get	5'
+; ONE-NEXT: [0x0000000033][003]                  {Code} 'i32.load8_u	19'
+; ONE-NEXT: [0x0000000036][003]                  {Code} 'local.set	7'
+; ONE-NEXT: [0x0000000038][003]      3           {Line} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x0000000038][003]                  {Code} 'i32.const	1'
+; ONE-NEXT: [0x000000003a][003]                  {Code} 'local.set	8'
+; ONE-NEXT: [0x000000003c][003]                  {Code} 'local.get	7'
+; ONE-NEXT: [0x000000003e][003]                  {Code} 'local.get	8'
+; ONE-NEXT: [0x0000000040][003]                  {Code} 'i32.and'
+; ONE-NEXT: [0x0000000041][003]                  {Code} 'local.set	9'
+; ONE-NEXT: [0x0000000043][003]                  {Code} 'block'
+; ONE-NEXT: [0x0000000045][003]                  {Code} 'block'
+; ONE-NEXT: [0x0000000047][003]                  {Code} 'local.get	9'
+; ONE-NEXT: [0x0000000049][003]                  {Code} 'i32.eqz'
+; ONE-NEXT: [0x000000004a][003]                  {Code} 'br_if   	0'
+; ONE-NEXT: [0x0000000065][003]      8           {Line} {NewStatement} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x0000000065][003]                  {Code} 'local.get	5'
+; ONE-NEXT: [0x0000000067][003]                  {Code} 'i32.load	20'
+; ONE-NEXT: [0x000000006a][003]                  {Code} 'local.set	12'
+; ONE-NEXT: [0x000000006c][003]      8           {Line} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x000000006c][003]                  {Code} 'local.get	5'
+; ONE-NEXT: [0x000000006e][003]                  {Code} 'local.get	12'
+; ONE-NEXT: [0x0000000070][003]                  {Code} 'i32.store	28'
+; ONE-NEXT: [0x0000000073][003]      0           {Line} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x0000000073][003]                  {Code} 'end'
+; ONE-NEXT: [0x0000000074][003]      9           {Line} {NewStatement} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x0000000074][003]                  {Code} 'local.get	5'
+; ONE-NEXT: [0x0000000076][003]                  {Code} 'i32.load	28'
+; ONE-NEXT: [0x0000000079][003]                  {Code} 'local.set	13'
+; ONE-NEXT: [0x000000007b][003]                  {Code} 'local.get	13'
+; ONE-NEXT: [0x000000007d][003]                  {Code} 'return'
+; ONE-NEXT: [0x000000007e][003]                  {Code} 'end'
+; ONE-NEXT: [0x000000007f][003]      9           {Line} {NewStatement} {EndSequence} '{{.*}}/general/test.cpp'
+; ONE-NEXT: [0x000000007f][003]                  {Code} 'unreachable'
+; ONE-EMPTY:
+; ONE-NEXT: -----------------------------
+; ONE-NEXT: Element      Total    Printed
+; ONE-NEXT: -----------------------------
+; ONE-NEXT: Scopes           3          3
+; ONE-NEXT: Symbols          4          4
+; ONE-NEXT: Types            5          5
+; ONE-NEXT: Lines           73         73
+; ONE-NEXT: -----------------------------
+; ONE-NEXT: Total           85         85
+; ONE-EMPTY:
+; ONE-NEXT: Scope Sizes:
+; ONE-NEXT:        180 (100.00%) : [0x000000000b][001]              {CompileUnit} 'test.cpp'
+; ONE-NEXT:        105 ( 58.33%) : [0x0000000026][002]      2         {Function} extern not_inlined 'foo' -> [0x0000000090]'int'
+; ONE-NEXT:         23 ( 12.78%) : [0x000000006c][003]                  {Block}
+; ONE-EMPTY:
+; ONE-NEXT: Totals by lexical level:
+; ONE-NEXT: [001]:        180 (100.00%)
+; ONE-NEXT: [002]:        105 ( 58.33%)
+; ONE-NEXT: [003]:         23 ( 12.78%)
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/definitions.h b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/definitions.h
new file mode 100644
index 0000000000000..dfbd3db044dd8
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/definitions.h
@@ -0,0 +1,30 @@
+//-----------------------------------------------------------------------------
+// Definitions.
+//-----------------------------------------------------------------------------
+#ifndef SUITE_DEFINITIONS_H
+#define SUITE_DEFINITIONS_H
+
+#ifdef _MSC_VER
+#define forceinline __forceinline
+#define OPTIMIZE_OFF __pragma(optimize("", off))
+#define OPTIMIZE_ON  __pragma(optimize("", on))
+#elif defined(__clang__)
+#if __has_attribute(__always_inline__)
+#define forceinline inline __attribute__((__always_inline__))
+#else
+#define forceinline inline
+#endif
+#define OPTIMIZE_OFF _Pragma("clang optimize off")
+#define OPTIMIZE_ON  _Pragma("clang optimize on")
+#elif defined(__GNUC__)
+#define forceinline inline __attribute__((__always_inline__))
+#define OPTIMIZE_OFF _Pragma("GCC optimize off")
+#define OPTIMIZE_ON  _Pragma("GCC optimize on")
+#else
+#define forceinline inline
+#define OPTIMIZE_OFF
+#define OPTIMIZE_ON
+#error
+#endif
+
+#endif // SUITE_DEFINITIONS_H
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/hello-world-clang.s b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/hello-world-clang.s
new file mode 100644
index 0000000000000..bfba259b626b0
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/hello-world-clang.s
@@ -0,0 +1,286 @@
+	.text
+	.file	"hello-world.cpp"
+	.file	1 "/data/projects/scripts/regression-suite/input/general" "hello-world.cpp"
+	.globaltype	__stack_pointer, i32
+	.functype	__original_main () -> (i32)
+	.functype	_Z6printfPKcz (i32, i32) -> (i32)
+	.functype	main (i32, i32) -> (i32)
+	.section	.text.__original_main,"",@
+	.hidden	__original_main                 # -- Begin function __original_main
+	.globl	__original_main
+	.type	__original_main,@function
+__original_main:                        # @__original_main
+.Lfunc_begin0:
+	.loc	1 4 0                           # hello-world.cpp:4:0
+	.functype	__original_main () -> (i32)
+	.local  	i32, i32, i32, i32, i32, i32, i32, i32, i32
+# %bb.0:                                # %entry
+	global.get	__stack_pointer
+	local.set	0
+	i32.const	16
+	local.set	1
+	local.get	0
+	local.get	1
+	i32.sub 
+	local.set	2
+	local.get	2
+	global.set	__stack_pointer
+	i32.const	0
+	local.set	3
+	local.get	2
+	local.get	3
+	i32.store	12
+.Ltmp0:
+	.loc	1 5 3 prologue_end              # hello-world.cpp:5:3
+	i32.const	.L.str
+	local.set	4
+	i32.const	0
+	local.set	5
+	local.get	4
+	local.get	5
+	call	_Z6printfPKcz
+	drop
+	.loc	1 6 3                           # hello-world.cpp:6:3
+	i32.const	0
+	local.set	6
+	i32.const	16
+	local.set	7
+	local.get	2
+	local.get	7
+	i32.add 
+	local.set	8
+	local.get	8
+	global.set	__stack_pointer
+	local.get	6
+	return
+	end_function
+.Ltmp1:
+.Lfunc_end0:
+                                        # -- End function
+	.section	.text.main,"",@
+	.hidden	main                            # -- Begin function main
+	.globl	main
+	.type	main,@function
+main:                                   # @main
+.Lfunc_begin1:
+	.functype	main (i32, i32) -> (i32)
+	.local  	i32
+# %bb.0:                                # %body
+	call	__original_main
+	local.set	2
+	local.get	2
+	return
+	end_function
+.Lfunc_end1:
+                                        # -- End function
+	.type	.L.str,@object                  # @.str
+	.section	.rodata..L.str,"S",@
+.L.str:
+	.asciz	"Hello, World\n"
+	.size	.L.str, 14
+
+	.globl	__main_void
+	.type	__main_void,@function
+	.hidden	__main_void
+.set __main_void, __original_main
+	.section	.debug_abbrev,"",@
+	.int8	1                               # Abbreviation Code
+	.int8	17                              # DW_TAG_compile_unit
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	37                              # DW_AT_producer
+	.int8	14                              # DW_FORM_strp
+	.int8	19                              # DW_AT_language
+	.int8	5                               # DW_FORM_data2
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	16                              # DW_AT_stmt_list
+	.int8	23                              # DW_FORM_sec_offset
+	.int8	27                              # DW_AT_comp_dir
+	.int8	14                              # DW_FORM_strp
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	2                               # Abbreviation Code
+	.int8	52                              # DW_TAG_variable
+	.int8	0                               # DW_CHILDREN_no
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	3                               # Abbreviation Code
+	.int8	1                               # DW_TAG_array_type
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	4                               # Abbreviation Code
+	.int8	33                              # DW_TAG_subrange_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	55                              # DW_AT_count
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	5                               # Abbreviation Code
+	.int8	38                              # DW_TAG_const_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	6                               # Abbreviation Code
+	.int8	36                              # DW_TAG_base_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	62                              # DW_AT_encoding
+	.int8	11                              # DW_FORM_data1
+	.int8	11                              # DW_AT_byte_size
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	7                               # Abbreviation Code
+	.int8	36                              # DW_TAG_base_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	11                              # DW_AT_byte_size
+	.int8	11                              # DW_FORM_data1
+	.int8	62                              # DW_AT_encoding
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	8                               # Abbreviation Code
+	.int8	46                              # DW_TAG_subprogram
+	.int8	0                               # DW_CHILDREN_no
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	64                              # DW_AT_frame_base
+	.int8	24                              # DW_FORM_exprloc
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	63                              # DW_AT_external
+	.int8	25                              # DW_FORM_flag_present
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	0                               # EOM(3)
+	.section	.debug_info,"",@
+.Lcu_begin0:
+	.int32	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.int16	4                               # DWARF version number
+	.int32	.debug_abbrev0                  # Offset Into Abbrev. Section
+	.int8	4                               # Address Size (in bytes)
+	.int8	1                               # Abbrev [1] 0xb:0x67 DW_TAG_compile_unit
+	.int32	.Linfo_string0                  # DW_AT_producer
+	.int16	33                              # DW_AT_language
+	.int32	.Linfo_string1                  # DW_AT_name
+	.int32	.Lline_table_start0             # DW_AT_stmt_list
+	.int32	.Linfo_string2                  # DW_AT_comp_dir
+	.int32	.Lfunc_begin0                   # DW_AT_low_pc
+	.int32	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.int8	2                               # Abbrev [2] 0x26:0xd DW_TAG_variable
+	.int32	51                              # DW_AT_type
+	.int8	1                               # DW_AT_decl_file
+	.int8	5                               # DW_AT_decl_line
+	.int8	5                               # DW_AT_location
+	.int8	3
+	.int32	.L.str
+	.int8	3                               # Abbrev [3] 0x33:0xc DW_TAG_array_type
+	.int32	63                              # DW_AT_type
+	.int8	4                               # Abbrev [4] 0x38:0x6 DW_TAG_subrange_type
+	.int32	75                              # DW_AT_type
+	.int8	14                              # DW_AT_count
+	.int8	0                               # End Of Children Mark
+	.int8	5                               # Abbrev [5] 0x3f:0x5 DW_TAG_const_type
+	.int32	68                              # DW_AT_type
+	.int8	6                               # Abbrev [6] 0x44:0x7 DW_TAG_base_type
+	.int32	.Linfo_string3                  # DW_AT_name
+	.int8	6                               # DW_AT_encoding
+	.int8	1                               # DW_AT_byte_size
+	.int8	7                               # Abbrev [7] 0x4b:0x7 DW_TAG_base_type
+	.int32	.Linfo_string4                  # DW_AT_name
+	.int8	8                               # DW_AT_byte_size
+	.int8	7                               # DW_AT_encoding
+	.int8	8                               # Abbrev [8] 0x52:0x18 DW_TAG_subprogram
+	.int32	.Lfunc_begin0                   # DW_AT_low_pc
+	.int32	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.int8	4                               # DW_AT_frame_base
+	.int8	237
+	.int8	0
+	.int8	2
+	.int8	159
+	.int32	.Linfo_string5                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	3                               # DW_AT_decl_line
+	.int32	106                             # DW_AT_type
+                                        # DW_AT_external
+	.int8	6                               # Abbrev [6] 0x6a:0x7 DW_TAG_base_type
+	.int32	.Linfo_string6                  # DW_AT_name
+	.int8	5                               # DW_AT_encoding
+	.int8	4                               # DW_AT_byte_size
+	.int8	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.section	.debug_str,"S",@
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)" # string offset=0
+.Linfo_string1:
+	.asciz	"hello-world.cpp"               # string offset=111
+.Linfo_string2:
+	.asciz	"/data/projects/scripts/regression-suite/input/general" # string offset=127
+.Linfo_string3:
+	.asciz	"char"                          # string offset=181
+.Linfo_string4:
+	.asciz	"__ARRAY_SIZE_TYPE__"           # string offset=186
+.Linfo_string5:
+	.asciz	"main"                          # string offset=206
+.Linfo_string6:
+	.asciz	"int"                           # string offset=211
+	.ident	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.no_dead_strip	__indirect_function_table
+	.section	.custom_section.producers,"",@
+	.int8	2
+	.int8	8
+	.ascii	"language"
+	.int8	1
+	.int8	14
+	.ascii	"C_plus_plus_14"
+	.int8	0
+	.int8	12
+	.ascii	"processed-by"
+	.int8	1
+	.int8	5
+	.ascii	"clang"
+	.int8	96
+	.ascii	"19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.section	.debug_str,"S",@
+	.section	.custom_section.target_features,"",@
+	.int8	2
+	.int8	43
+	.int8	15
+	.ascii	"mutable-globals"
+	.int8	43
+	.int8	8
+	.ascii	"sign-ext"
+	.section	.debug_str,"S",@
+	.section	.debug_line,"",@
+.Lline_table_start0:
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/hello-world.cpp b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/hello-world.cpp
new file mode 100644
index 0000000000000..73a8e247e26ec
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/hello-world.cpp
@@ -0,0 +1,7 @@
+extern int printf(const char * format, ... );
+
+int main()
+{
+  printf("Hello, World\n");
+  return 0;
+}
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-43860-clang.s b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-43860-clang.s
new file mode 100644
index 0000000000000..fb70b36173fd9
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-43860-clang.s
@@ -0,0 +1,457 @@
+	.text
+	.file	"pr-43860.cpp"
+	.globaltype	__stack_pointer, i32
+	.functype	_Z4testii (i32, i32) -> (i32)
+	.section	.text._Z4testii,"",@
+	.hidden	_Z4testii                       # -- Begin function _Z4testii
+	.globl	_Z4testii
+	.type	_Z4testii,@function
+_Z4testii:                              # @_Z4testii
+.Lfunc_begin0:
+	.file	1 "/data/projects/scripts/regression-suite/input/general" "pr-43860.cpp"
+	.loc	1 11 0                          # pr-43860.cpp:11:0
+	.functype	_Z4testii (i32, i32) -> (i32)
+	.local  	i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32
+# %bb.0:                                # %entry
+	global.get	__stack_pointer
+	local.set	2
+	i32.const	32
+	local.set	3
+	local.get	2
+	local.get	3
+	i32.sub 
+	local.set	4
+	local.get	4
+	local.get	0
+	i32.store	16
+	local.get	4
+	local.get	1
+	i32.store	12
+.Ltmp0:
+	.loc	1 12 11 prologue_end            # pr-43860.cpp:12:11
+	local.get	4
+	i32.load	16
+	local.set	5
+	.loc	1 12 7 is_stmt 0                # pr-43860.cpp:12:7
+	local.get	4
+	local.get	5
+	i32.store	8
+	.loc	1 13 23 is_stmt 1               # pr-43860.cpp:13:23
+	local.get	4
+	i32.load	12
+	local.set	6
+	local.get	4
+	local.get	6
+	i32.store	28
+.Ltmp1:
+	.loc	1 3 15                          # pr-43860.cpp:3:15
+	local.get	4
+	i32.load	28
+	local.set	7
+	.loc	1 3 7 is_stmt 0                 # pr-43860.cpp:3:7
+	local.get	4
+	local.get	7
+	i32.store	24
+.Ltmp2:
+	.loc	1 5 17 is_stmt 1                # pr-43860.cpp:5:17
+	local.get	4
+	i32.load	28
+	local.set	8
+	.loc	1 5 25 is_stmt 0                # pr-43860.cpp:5:25
+	local.get	4
+	i32.load	24
+	local.set	9
+	.loc	1 5 23                          # pr-43860.cpp:5:23
+	local.get	8
+	local.get	9
+	i32.add 
+	local.set	10
+	.loc	1 5 9                           # pr-43860.cpp:5:9
+	local.get	4
+	local.get	10
+	i32.store	20
+	.loc	1 6 13 is_stmt 1                # pr-43860.cpp:6:13
+	local.get	4
+	i32.load	20
+	local.set	11
+	.loc	1 6 11 is_stmt 0                # pr-43860.cpp:6:11
+	local.get	4
+	local.get	11
+	i32.store	24
+.Ltmp3:
+	.loc	1 8 10 is_stmt 1                # pr-43860.cpp:8:10
+	local.get	4
+	i32.load	24
+	local.set	12
+.Ltmp4:
+	.loc	1 13 5                          # pr-43860.cpp:13:5
+	local.get	4
+	i32.load	8
+	local.set	13
+	local.get	13
+	local.get	12
+	i32.add 
+	local.set	14
+	local.get	4
+	local.get	14
+	i32.store	8
+	.loc	1 14 10                         # pr-43860.cpp:14:10
+	local.get	4
+	i32.load	8
+	local.set	15
+	.loc	1 14 3 is_stmt 0                # pr-43860.cpp:14:3
+	local.get	15
+	return
+	end_function
+.Ltmp5:
+.Lfunc_end0:
+                                        # -- End function
+	.section	.debug_abbrev,"",@
+	.int8	1                               # Abbreviation Code
+	.int8	17                              # DW_TAG_compile_unit
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	37                              # DW_AT_producer
+	.int8	14                              # DW_FORM_strp
+	.int8	19                              # DW_AT_language
+	.int8	5                               # DW_FORM_data2
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	16                              # DW_AT_stmt_list
+	.int8	23                              # DW_FORM_sec_offset
+	.int8	27                              # DW_AT_comp_dir
+	.int8	14                              # DW_FORM_strp
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	2                               # Abbreviation Code
+	.int8	46                              # DW_TAG_subprogram
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	110                             # DW_AT_linkage_name
+	.int8	14                              # DW_FORM_strp
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	63                              # DW_AT_external
+	.int8	25                              # DW_FORM_flag_present
+	.int8	32                              # DW_AT_inline
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	3                               # Abbreviation Code
+	.int8	5                               # DW_TAG_formal_parameter
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	4                               # Abbreviation Code
+	.int8	52                              # DW_TAG_variable
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	5                               # Abbreviation Code
+	.int8	11                              # DW_TAG_lexical_block
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	6                               # Abbreviation Code
+	.int8	36                              # DW_TAG_base_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	62                              # DW_AT_encoding
+	.int8	11                              # DW_FORM_data1
+	.int8	11                              # DW_AT_byte_size
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	7                               # Abbreviation Code
+	.int8	46                              # DW_TAG_subprogram
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	64                              # DW_AT_frame_base
+	.int8	24                              # DW_FORM_exprloc
+	.int8	110                             # DW_AT_linkage_name
+	.int8	14                              # DW_FORM_strp
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	63                              # DW_AT_external
+	.int8	25                              # DW_FORM_flag_present
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	8                               # Abbreviation Code
+	.int8	5                               # DW_TAG_formal_parameter
+	.int8	0                               # DW_CHILDREN_no
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	9                               # Abbreviation Code
+	.int8	52                              # DW_TAG_variable
+	.int8	0                               # DW_CHILDREN_no
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	10                              # Abbreviation Code
+	.int8	29                              # DW_TAG_inlined_subroutine
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	49                              # DW_AT_abstract_origin
+	.int8	19                              # DW_FORM_ref4
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	88                              # DW_AT_call_file
+	.int8	11                              # DW_FORM_data1
+	.int8	89                              # DW_AT_call_line
+	.int8	11                              # DW_FORM_data1
+	.int8	87                              # DW_AT_call_column
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	11                              # Abbreviation Code
+	.int8	5                               # DW_TAG_formal_parameter
+	.int8	0                               # DW_CHILDREN_no
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	49                              # DW_AT_abstract_origin
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	12                              # Abbreviation Code
+	.int8	52                              # DW_TAG_variable
+	.int8	0                               # DW_CHILDREN_no
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	49                              # DW_AT_abstract_origin
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	13                              # Abbreviation Code
+	.int8	11                              # DW_TAG_lexical_block
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	0                               # EOM(3)
+	.section	.debug_info,"",@
+.Lcu_begin0:
+	.int32	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.int16	4                               # DWARF version number
+	.int32	.debug_abbrev0                  # Offset Into Abbrev. Section
+	.int8	4                               # Address Size (in bytes)
+	.int8	1                               # Abbrev [1] 0xb:0xd1 DW_TAG_compile_unit
+	.int32	.Linfo_string0                  # DW_AT_producer
+	.int16	33                              # DW_AT_language
+	.int32	.Linfo_string1                  # DW_AT_name
+	.int32	.Lline_table_start0             # DW_AT_stmt_list
+	.int32	.Linfo_string2                  # DW_AT_comp_dir
+	.int32	.Lfunc_begin0                   # DW_AT_low_pc
+	.int32	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.int8	2                               # Abbrev [2] 0x26:0x34 DW_TAG_subprogram
+	.int32	.Linfo_string3                  # DW_AT_linkage_name
+	.int32	.Linfo_string4                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	2                               # DW_AT_decl_line
+	.int32	90                              # DW_AT_type
+                                        # DW_AT_external
+	.int8	1                               # DW_AT_inline
+	.int8	3                               # Abbrev [3] 0x36:0xb DW_TAG_formal_parameter
+	.int32	.Linfo_string6                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	2                               # DW_AT_decl_line
+	.int32	90                              # DW_AT_type
+	.int8	4                               # Abbrev [4] 0x41:0xb DW_TAG_variable
+	.int32	.Linfo_string7                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	3                               # DW_AT_decl_line
+	.int32	90                              # DW_AT_type
+	.int8	5                               # Abbrev [5] 0x4c:0xd DW_TAG_lexical_block
+	.int8	4                               # Abbrev [4] 0x4d:0xb DW_TAG_variable
+	.int32	.Linfo_string8                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	5                               # DW_AT_decl_line
+	.int32	90                              # DW_AT_type
+	.int8	0                               # End Of Children Mark
+	.int8	0                               # End Of Children Mark
+	.int8	6                               # Abbrev [6] 0x5a:0x7 DW_TAG_base_type
+	.int32	.Linfo_string5                  # DW_AT_name
+	.int8	5                               # DW_AT_encoding
+	.int8	4                               # DW_AT_byte_size
+	.int8	7                               # Abbrev [7] 0x61:0x7a DW_TAG_subprogram
+	.int32	.Lfunc_begin0                   # DW_AT_low_pc
+	.int32	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.int8	4                               # DW_AT_frame_base
+	.int8	237
+	.int8	0
+	.int8	4
+	.int8	159
+	.int32	.Linfo_string9                  # DW_AT_linkage_name
+	.int32	.Linfo_string10                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	11                              # DW_AT_decl_line
+	.int32	90                              # DW_AT_type
+                                        # DW_AT_external
+	.int8	8                               # Abbrev [8] 0x7d:0xe DW_TAG_formal_parameter
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	16
+	.int32	.Linfo_string11                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	11                              # DW_AT_decl_line
+	.int32	90                              # DW_AT_type
+	.int8	8                               # Abbrev [8] 0x8b:0xe DW_TAG_formal_parameter
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	12
+	.int32	.Linfo_string12                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	11                              # DW_AT_decl_line
+	.int32	90                              # DW_AT_type
+	.int8	9                               # Abbrev [9] 0x99:0xe DW_TAG_variable
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	8
+	.int32	.Linfo_string13                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	12                              # DW_AT_decl_line
+	.int32	90                              # DW_AT_type
+	.int8	10                              # Abbrev [10] 0xa7:0x33 DW_TAG_inlined_subroutine
+	.int32	38                              # DW_AT_abstract_origin
+	.int32	.Ltmp1                          # DW_AT_low_pc
+	.int32	.Ltmp4-.Ltmp1                   # DW_AT_high_pc
+	.int8	1                               # DW_AT_call_file
+	.int8	13                              # DW_AT_call_line
+	.int8	8                               # DW_AT_call_column
+	.int8	11                              # Abbrev [11] 0xb7:0x8 DW_TAG_formal_parameter
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	28
+	.int32	54                              # DW_AT_abstract_origin
+	.int8	12                              # Abbrev [12] 0xbf:0x8 DW_TAG_variable
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	24
+	.int32	65                              # DW_AT_abstract_origin
+	.int8	13                              # Abbrev [13] 0xc7:0x12 DW_TAG_lexical_block
+	.int32	.Ltmp2                          # DW_AT_low_pc
+	.int32	.Ltmp3-.Ltmp2                   # DW_AT_high_pc
+	.int8	12                              # Abbrev [12] 0xd0:0x8 DW_TAG_variable
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	20
+	.int32	77                              # DW_AT_abstract_origin
+	.int8	0                               # End Of Children Mark
+	.int8	0                               # End Of Children Mark
+	.int8	0                               # End Of Children Mark
+	.int8	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.section	.debug_str,"S",@
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)" # string offset=0
+.Linfo_string1:
+	.asciz	"pr-43860.cpp"                  # string offset=111
+.Linfo_string2:
+	.asciz	"/data/projects/scripts/regression-suite/input/general" # string offset=124
+.Linfo_string3:
+	.asciz	"_Z14InlineFunctioni"           # string offset=178
+.Linfo_string4:
+	.asciz	"InlineFunction"                # string offset=198
+.Linfo_string5:
+	.asciz	"int"                           # string offset=213
+.Linfo_string6:
+	.asciz	"Param"                         # string offset=217
+.Linfo_string7:
+	.asciz	"Var_1"                         # string offset=223
+.Linfo_string8:
+	.asciz	"Var_2"                         # string offset=229
+.Linfo_string9:
+	.asciz	"_Z4testii"                     # string offset=235
+.Linfo_string10:
+	.asciz	"test"                          # string offset=245
+.Linfo_string11:
+	.asciz	"Param_1"                       # string offset=250
+.Linfo_string12:
+	.asciz	"Param_2"                       # string offset=258
+.Linfo_string13:
+	.asciz	"A"                             # string offset=266
+	.ident	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.section	.custom_section.producers,"",@
+	.int8	2
+	.int8	8
+	.ascii	"language"
+	.int8	1
+	.int8	14
+	.ascii	"C_plus_plus_14"
+	.int8	0
+	.int8	12
+	.ascii	"processed-by"
+	.int8	1
+	.int8	5
+	.ascii	"clang"
+	.int8	96
+	.ascii	"19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.section	.debug_str,"S",@
+	.section	.custom_section.target_features,"",@
+	.int8	2
+	.int8	43
+	.int8	15
+	.ascii	"mutable-globals"
+	.int8	43
+	.int8	8
+	.ascii	"sign-ext"
+	.section	.debug_str,"S",@
+	.section	.debug_line,"",@
+.Lline_table_start0:
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-43860.cpp b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-43860.cpp
new file mode 100644
index 0000000000000..a3d3b76c59e58
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-43860.cpp
@@ -0,0 +1,15 @@
+#include "definitions.h"
+forceinline int InlineFunction(int Param) {
+  int Var_1 = Param;
+  {
+    int Var_2 = Param + Var_1;
+    Var_1 = Var_2;
+  }
+  return Var_1;
+}
+
+int test(int Param_1, int Param_2) {
+  int A = Param_1;
+  A += InlineFunction(Param_2);
+  return A;
+}
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-44884-clang.s b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-44884-clang.s
new file mode 100644
index 0000000000000..af9875b1eee2a
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-44884-clang.s
@@ -0,0 +1,488 @@
+	.text
+	.file	"pr-44884.cpp"
+	.globaltype	__stack_pointer, i32
+	.functype	_Z3barf (f32) -> (i32)
+	.functype	_Z3fooc (i32) -> (i32)
+	.section	.text._Z3barf,"",@
+	.hidden	_Z3barf                         # -- Begin function _Z3barf
+	.globl	_Z3barf
+	.type	_Z3barf,@function
+_Z3barf:                                # @_Z3barf
+.Lfunc_begin0:
+	.file	1 "/data/projects/scripts/regression-suite/input/general" "pr-44884.cpp"
+	.loc	1 1 0                           # pr-44884.cpp:1:0
+	.functype	_Z3barf (f32) -> (i32)
+	.local  	i32, i32, i32, f32, f32, f32, i32, i32, i32, i32, i32, i32
+# %bb.0:                                # %entry
+	global.get	__stack_pointer
+	local.set	1
+	i32.const	16
+	local.set	2
+	local.get	1
+	local.get	2
+	i32.sub 
+	local.set	3
+	local.get	3
+	local.get	0
+	f32.store	12
+.Ltmp0:
+	.loc	1 1 36 prologue_end             # pr-44884.cpp:1:36
+	local.get	3
+	f32.load	12
+	local.set	4
+	local.get	4
+	f32.abs 
+	local.set	5
+	f32.const	0x1p31
+	local.set	6
+	local.get	5
+	local.get	6
+	f32.lt  
+	local.set	7
+	local.get	7
+	i32.eqz
+	local.set	8
+	block   	
+	block   	
+	local.get	8
+	br_if   	0                               # 0: down to label1
+# %bb.1:                                # %entry
+	local.get	4
+	i32.trunc_f32_s
+	local.set	9
+	local.get	9
+	local.set	10
+	br      	1                               # 1: down to label0
+.LBB0_2:                                # %entry
+	.loc	1 0 36 is_stmt 0                # pr-44884.cpp:0:36
+	end_block                               # label1:
+	.loc	1 1 36                          # pr-44884.cpp:1:36
+	i32.const	-2147483648
+	local.set	11
+	local.get	11
+	local.set	10
+.LBB0_3:                                # %entry
+	.loc	1 0 36                          # pr-44884.cpp:0:36
+	end_block                               # label0:
+	.loc	1 1 36                          # pr-44884.cpp:1:36
+	local.get	10
+	local.set	12
+	.loc	1 1 24                          # pr-44884.cpp:1:24
+	local.get	12
+	return
+	end_function
+.Ltmp1:
+.Lfunc_end0:
+                                        # -- End function
+	.section	.text._Z3fooc,"",@
+	.hidden	_Z3fooc                         # -- Begin function _Z3fooc
+	.globl	_Z3fooc
+	.type	_Z3fooc,@function
+_Z3fooc:                                # @_Z3fooc
+.Lfunc_begin1:
+	.loc	1 3 0 is_stmt 1                 # pr-44884.cpp:3:0
+	.functype	_Z3fooc (i32) -> (i32)
+	.local  	i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, i32, i32, i32, i32, i32, i32, i32, i32, i32
+# %bb.0:                                # %entry
+	global.get	__stack_pointer
+	local.set	1
+	i32.const	16
+	local.set	2
+	local.get	1
+	local.get	2
+	i32.sub 
+	local.set	3
+	local.get	3
+	global.set	__stack_pointer
+	local.get	3
+	local.get	0
+	i32.store8	15
+.Ltmp2:
+	.loc	1 5 15 prologue_end             # pr-44884.cpp:5:15
+	local.get	3
+	i32.load8_u	15
+	local.set	4
+	i32.const	24
+	local.set	5
+	local.get	4
+	local.get	5
+	i32.shl 
+	local.set	6
+	local.get	6
+	local.get	5
+	i32.shr_s
+	local.set	7
+	.loc	1 5 7 is_stmt 0                 # pr-44884.cpp:5:7
+	local.get	3
+	local.get	7
+	i32.store	8
+.Ltmp3:
+	.loc	1 9 21 is_stmt 1                # pr-44884.cpp:9:21
+	local.get	3
+	i32.load	8
+	local.set	8
+	.loc	1 9 29 is_stmt 0                # pr-44884.cpp:9:29
+	local.get	3
+	i32.load8_u	15
+	local.set	9
+	i32.const	24
+	local.set	10
+	local.get	9
+	local.get	10
+	i32.shl 
+	local.set	11
+	local.get	11
+	local.get	10
+	i32.shr_s
+	local.set	12
+	.loc	1 9 27                          # pr-44884.cpp:9:27
+	local.get	8
+	local.get	12
+	i32.add 
+	local.set	13
+	.loc	1 9 21                          # pr-44884.cpp:9:21
+	local.get	13
+	f32.convert_i32_s
+	local.set	14
+	.loc	1 9 13                          # pr-44884.cpp:9:13
+	local.get	3
+	local.get	14
+	f32.store	4
+	.loc	1 10 19 is_stmt 1               # pr-44884.cpp:10:19
+	local.get	3
+	f32.load	4
+	local.set	15
+	.loc	1 10 15 is_stmt 0               # pr-44884.cpp:10:15
+	local.get	15
+	call	_Z3barf
+	local.set	16
+	.loc	1 10 13                         # pr-44884.cpp:10:13
+	local.get	3
+	local.get	16
+	i32.store	8
+.Ltmp4:
+	.loc	1 13 10 is_stmt 1               # pr-44884.cpp:13:10
+	local.get	3
+	i32.load	8
+	local.set	17
+	.loc	1 13 18 is_stmt 0               # pr-44884.cpp:13:18
+	local.get	3
+	i32.load8_u	15
+	local.set	18
+	i32.const	24
+	local.set	19
+	local.get	18
+	local.get	19
+	i32.shl 
+	local.set	20
+	local.get	20
+	local.get	19
+	i32.shr_s
+	local.set	21
+	.loc	1 13 16                         # pr-44884.cpp:13:16
+	local.get	17
+	local.get	21
+	i32.add 
+	local.set	22
+	.loc	1 13 3                          # pr-44884.cpp:13:3
+	i32.const	16
+	local.set	23
+	local.get	3
+	local.get	23
+	i32.add 
+	local.set	24
+	local.get	24
+	global.set	__stack_pointer
+	local.get	22
+	return
+	end_function
+.Ltmp5:
+.Lfunc_end1:
+                                        # -- End function
+	.section	.debug_abbrev,"",@
+	.int8	1                               # Abbreviation Code
+	.int8	17                              # DW_TAG_compile_unit
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	37                              # DW_AT_producer
+	.int8	14                              # DW_FORM_strp
+	.int8	19                              # DW_AT_language
+	.int8	5                               # DW_FORM_data2
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	16                              # DW_AT_stmt_list
+	.int8	23                              # DW_FORM_sec_offset
+	.int8	27                              # DW_AT_comp_dir
+	.int8	14                              # DW_FORM_strp
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	85                              # DW_AT_ranges
+	.int8	23                              # DW_FORM_sec_offset
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	2                               # Abbreviation Code
+	.int8	36                              # DW_TAG_base_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	62                              # DW_AT_encoding
+	.int8	11                              # DW_FORM_data1
+	.int8	11                              # DW_AT_byte_size
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	3                               # Abbreviation Code
+	.int8	46                              # DW_TAG_subprogram
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	64                              # DW_AT_frame_base
+	.int8	24                              # DW_FORM_exprloc
+	.int8	110                             # DW_AT_linkage_name
+	.int8	14                              # DW_FORM_strp
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	63                              # DW_AT_external
+	.int8	25                              # DW_FORM_flag_present
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	4                               # Abbreviation Code
+	.int8	5                               # DW_TAG_formal_parameter
+	.int8	0                               # DW_CHILDREN_no
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	5                               # Abbreviation Code
+	.int8	52                              # DW_TAG_variable
+	.int8	0                               # DW_CHILDREN_no
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	6                               # Abbreviation Code
+	.int8	11                              # DW_TAG_lexical_block
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	7                               # Abbreviation Code
+	.int8	22                              # DW_TAG_typedef
+	.int8	0                               # DW_CHILDREN_no
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	0                               # EOM(3)
+	.section	.debug_info,"",@
+.Lcu_begin0:
+	.int32	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.int16	4                               # DWARF version number
+	.int32	.debug_abbrev0                  # Offset Into Abbrev. Section
+	.int8	4                               # Address Size (in bytes)
+	.int8	1                               # Abbrev [1] 0xb:0xca DW_TAG_compile_unit
+	.int32	.Linfo_string0                  # DW_AT_producer
+	.int16	33                              # DW_AT_language
+	.int32	.Linfo_string1                  # DW_AT_name
+	.int32	.Lline_table_start0             # DW_AT_stmt_list
+	.int32	.Linfo_string2                  # DW_AT_comp_dir
+	.int32	0                               # DW_AT_low_pc
+	.int32	.Ldebug_ranges0                 # DW_AT_ranges
+	.int8	2                               # Abbrev [2] 0x26:0x7 DW_TAG_base_type
+	.int32	.Linfo_string3                  # DW_AT_name
+	.int8	5                               # DW_AT_encoding
+	.int8	4                               # DW_AT_byte_size
+	.int8	3                               # Abbrev [3] 0x2d:0x2b DW_TAG_subprogram
+	.int32	.Lfunc_begin0                   # DW_AT_low_pc
+	.int32	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.int8	4                               # DW_AT_frame_base
+	.int8	237
+	.int8	0
+	.int8	3
+	.int8	159
+	.int32	.Linfo_string4                  # DW_AT_linkage_name
+	.int32	.Linfo_string5                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	1                               # DW_AT_decl_line
+	.int32	38                              # DW_AT_type
+                                        # DW_AT_external
+	.int8	4                               # Abbrev [4] 0x49:0xe DW_TAG_formal_parameter
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	12
+	.int32	.Linfo_string9                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	1                               # DW_AT_decl_line
+	.int32	198                             # DW_AT_type
+	.int8	0                               # End Of Children Mark
+	.int8	3                               # Abbrev [3] 0x58:0x67 DW_TAG_subprogram
+	.int32	.Lfunc_begin1                   # DW_AT_low_pc
+	.int32	.Lfunc_end1-.Lfunc_begin1       # DW_AT_high_pc
+	.int8	4                               # DW_AT_frame_base
+	.int8	237
+	.int8	0
+	.int8	3
+	.int8	159
+	.int32	.Linfo_string6                  # DW_AT_linkage_name
+	.int32	.Linfo_string7                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	3                               # DW_AT_decl_line
+	.int32	191                             # DW_AT_type
+                                        # DW_AT_external
+	.int8	4                               # Abbrev [4] 0x74:0xe DW_TAG_formal_parameter
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	15
+	.int32	.Linfo_string11                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	3                               # DW_AT_decl_line
+	.int32	205                             # DW_AT_type
+	.int8	5                               # Abbrev [5] 0x82:0xe DW_TAG_variable
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	8
+	.int32	.Linfo_string13                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	5                               # DW_AT_decl_line
+	.int32	168                             # DW_AT_type
+	.int8	6                               # Abbrev [6] 0x90:0x18 DW_TAG_lexical_block
+	.int32	.Ltmp3                          # DW_AT_low_pc
+	.int32	.Ltmp4-.Ltmp3                   # DW_AT_high_pc
+	.int8	5                               # Abbrev [5] 0x99:0xe DW_TAG_variable
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	4
+	.int32	.Linfo_string15                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	9                               # DW_AT_decl_line
+	.int32	179                             # DW_AT_type
+	.int8	0                               # End Of Children Mark
+	.int8	7                               # Abbrev [7] 0xa8:0xb DW_TAG_typedef
+	.int32	38                              # DW_AT_type
+	.int32	.Linfo_string14                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	4                               # DW_AT_decl_line
+	.int8	7                               # Abbrev [7] 0xb3:0xb DW_TAG_typedef
+	.int32	198                             # DW_AT_type
+	.int32	.Linfo_string16                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	7                               # DW_AT_decl_line
+	.int8	0                               # End Of Children Mark
+	.int8	2                               # Abbrev [2] 0xbf:0x7 DW_TAG_base_type
+	.int32	.Linfo_string8                  # DW_AT_name
+	.int8	7                               # DW_AT_encoding
+	.int8	4                               # DW_AT_byte_size
+	.int8	2                               # Abbrev [2] 0xc6:0x7 DW_TAG_base_type
+	.int32	.Linfo_string10                 # DW_AT_name
+	.int8	4                               # DW_AT_encoding
+	.int8	4                               # DW_AT_byte_size
+	.int8	2                               # Abbrev [2] 0xcd:0x7 DW_TAG_base_type
+	.int32	.Linfo_string12                 # DW_AT_name
+	.int8	6                               # DW_AT_encoding
+	.int8	1                               # DW_AT_byte_size
+	.int8	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.section	.debug_ranges,"",@
+.Ldebug_ranges0:
+	.int32	.Lfunc_begin0
+	.int32	.Lfunc_end0
+	.int32	.Lfunc_begin1
+	.int32	.Lfunc_end1
+	.int32	0
+	.int32	0
+	.section	.debug_str,"S",@
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)" # string offset=0
+.Linfo_string1:
+	.asciz	"pr-44884.cpp"                  # string offset=111
+.Linfo_string2:
+	.asciz	"/data/projects/scripts/regression-suite/input/general" # string offset=124
+.Linfo_string3:
+	.asciz	"int"                           # string offset=178
+.Linfo_string4:
+	.asciz	"_Z3barf"                       # string offset=182
+.Linfo_string5:
+	.asciz	"bar"                           # string offset=190
+.Linfo_string6:
+	.asciz	"_Z3fooc"                       # string offset=194
+.Linfo_string7:
+	.asciz	"foo"                           # string offset=202
+.Linfo_string8:
+	.asciz	"unsigned int"                  # string offset=206
+.Linfo_string9:
+	.asciz	"Input"                         # string offset=219
+.Linfo_string10:
+	.asciz	"float"                         # string offset=225
+.Linfo_string11:
+	.asciz	"Param"                         # string offset=231
+.Linfo_string12:
+	.asciz	"char"                          # string offset=237
+.Linfo_string13:
+	.asciz	"Value"                         # string offset=242
+.Linfo_string14:
+	.asciz	"INT"                           # string offset=248
+.Linfo_string15:
+	.asciz	"Added"                         # string offset=252
+.Linfo_string16:
+	.asciz	"FLOAT"                         # string offset=258
+	.ident	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.section	.custom_section.producers,"",@
+	.int8	2
+	.int8	8
+	.ascii	"language"
+	.int8	1
+	.int8	14
+	.ascii	"C_plus_plus_14"
+	.int8	0
+	.int8	12
+	.ascii	"processed-by"
+	.int8	1
+	.int8	5
+	.ascii	"clang"
+	.int8	96
+	.ascii	"19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.section	.debug_str,"S",@
+	.section	.custom_section.target_features,"",@
+	.int8	2
+	.int8	43
+	.int8	15
+	.ascii	"mutable-globals"
+	.int8	43
+	.int8	8
+	.ascii	"sign-ext"
+	.section	.debug_str,"S",@
+	.section	.debug_line,"",@
+.Lline_table_start0:
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-44884.cpp b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-44884.cpp
new file mode 100644
index 0000000000000..4b47aae8e0b17
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-44884.cpp
@@ -0,0 +1,14 @@
+int bar(float Input) { return (int)Input; }
+
+unsigned foo(char Param) {
+  typedef int INT;
+  INT Value = Param;
+  {
+    typedef float FLOAT;
+    {
+      FLOAT Added = Value + Param;
+      Value = bar(Added);
+    }
+  }
+  return Value + Param;
+}
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-46466-clang.s b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-46466-clang.s
new file mode 100644
index 0000000000000..1056db0760eae
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-46466-clang.s
@@ -0,0 +1,259 @@
+	.text
+	.file	"pr-46466.cpp"
+	.file	1 "/data/projects/scripts/regression-suite/input/general" "pr-46466.cpp"
+	.functype	_Z4testv () -> (i32)
+	.section	.text._Z4testv,"",@
+	.hidden	_Z4testv                        # -- Begin function _Z4testv
+	.globl	_Z4testv
+	.type	_Z4testv,@function
+_Z4testv:                               # @_Z4testv
+.Lfunc_begin0:
+	.functype	_Z4testv () -> (i32)
+	.local  	i32
+# %bb.0:                                # %entry
+	.loc	1 10 3 prologue_end             # pr-46466.cpp:10:3
+	i32.const	1
+	local.set	0
+	local.get	0
+	return
+	end_function
+.Ltmp0:
+.Lfunc_end0:
+                                        # -- End function
+	.hidden	S                               # @S
+	.type	S,@object
+	.section	.bss.S,"",@
+	.globl	S
+S:
+	.skip	1
+	.size	S, 1
+
+	.section	.debug_abbrev,"",@
+	.int8	1                               # Abbreviation Code
+	.int8	17                              # DW_TAG_compile_unit
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	37                              # DW_AT_producer
+	.int8	14                              # DW_FORM_strp
+	.int8	19                              # DW_AT_language
+	.int8	5                               # DW_FORM_data2
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	16                              # DW_AT_stmt_list
+	.int8	23                              # DW_FORM_sec_offset
+	.int8	27                              # DW_AT_comp_dir
+	.int8	14                              # DW_FORM_strp
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	2                               # Abbreviation Code
+	.int8	52                              # DW_TAG_variable
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	63                              # DW_AT_external
+	.int8	25                              # DW_FORM_flag_present
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	3                               # Abbreviation Code
+	.int8	19                              # DW_TAG_structure_type
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	54                              # DW_AT_calling_convention
+	.int8	11                              # DW_FORM_data1
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	11                              # DW_AT_byte_size
+	.int8	11                              # DW_FORM_data1
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	4                               # Abbreviation Code
+	.int8	13                              # DW_TAG_member
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	56                              # DW_AT_data_member_location
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	5                               # Abbreviation Code
+	.int8	23                              # DW_TAG_union_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	54                              # DW_AT_calling_convention
+	.int8	11                              # DW_FORM_data1
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	11                              # DW_AT_byte_size
+	.int8	11                              # DW_FORM_data1
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	6                               # Abbreviation Code
+	.int8	46                              # DW_TAG_subprogram
+	.int8	0                               # DW_CHILDREN_no
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	64                              # DW_AT_frame_base
+	.int8	24                              # DW_FORM_exprloc
+	.int8	110                             # DW_AT_linkage_name
+	.int8	14                              # DW_FORM_strp
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	63                              # DW_AT_external
+	.int8	25                              # DW_FORM_flag_present
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	7                               # Abbreviation Code
+	.int8	36                              # DW_TAG_base_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	62                              # DW_AT_encoding
+	.int8	11                              # DW_FORM_data1
+	.int8	11                              # DW_AT_byte_size
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	0                               # EOM(3)
+	.section	.debug_info,"",@
+.Lcu_begin0:
+	.int32	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.int16	4                               # DWARF version number
+	.int32	.debug_abbrev0                  # Offset Into Abbrev. Section
+	.int8	4                               # Address Size (in bytes)
+	.int8	1                               # Abbrev [1] 0xb:0x72 DW_TAG_compile_unit
+	.int32	.Linfo_string0                  # DW_AT_producer
+	.int16	33                              # DW_AT_language
+	.int32	.Linfo_string1                  # DW_AT_name
+	.int32	.Lline_table_start0             # DW_AT_stmt_list
+	.int32	.Linfo_string2                  # DW_AT_comp_dir
+	.int32	.Lfunc_begin0                   # DW_AT_low_pc
+	.int32	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.int8	2                               # Abbrev [2] 0x26:0x11 DW_TAG_variable
+	.int32	.Linfo_string3                  # DW_AT_name
+	.int32	55                              # DW_AT_type
+                                        # DW_AT_external
+	.int8	1                               # DW_AT_decl_file
+	.int8	8                               # DW_AT_decl_line
+	.int8	5                               # DW_AT_location
+	.int8	3
+	.int32	S
+	.int8	3                               # Abbrev [3] 0x37:0x1f DW_TAG_structure_type
+	.int8	5                               # DW_AT_calling_convention
+	.int32	.Linfo_string6                  # DW_AT_name
+	.int8	1                               # DW_AT_byte_size
+	.int8	1                               # DW_AT_decl_file
+	.int8	1                               # DW_AT_decl_line
+	.int8	4                               # Abbrev [4] 0x40:0xc DW_TAG_member
+	.int32	.Linfo_string4                  # DW_AT_name
+	.int32	76                              # DW_AT_type
+	.int8	1                               # DW_AT_decl_file
+	.int8	5                               # DW_AT_decl_line
+	.int8	0                               # DW_AT_data_member_location
+	.int8	5                               # Abbrev [5] 0x4c:0x9 DW_TAG_union_type
+	.int8	5                               # DW_AT_calling_convention
+	.int32	.Linfo_string5                  # DW_AT_name
+	.int8	1                               # DW_AT_byte_size
+	.int8	1                               # DW_AT_decl_file
+	.int8	2                               # DW_AT_decl_line
+	.int8	0                               # End Of Children Mark
+	.int8	6                               # Abbrev [6] 0x56:0x1f DW_TAG_subprogram
+	.int32	.Lfunc_begin0                   # DW_AT_low_pc
+	.int32	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.int8	7                               # DW_AT_frame_base
+	.int8	237
+	.int8	3
+	.int32	__stack_pointer
+	.int8	159
+	.int32	.Linfo_string7                  # DW_AT_linkage_name
+	.int32	.Linfo_string8                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	9                               # DW_AT_decl_line
+	.int32	117                             # DW_AT_type
+                                        # DW_AT_external
+	.int8	7                               # Abbrev [7] 0x75:0x7 DW_TAG_base_type
+	.int32	.Linfo_string9                  # DW_AT_name
+	.int8	5                               # DW_AT_encoding
+	.int8	4                               # DW_AT_byte_size
+	.int8	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.section	.debug_str,"S",@
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)" # string offset=0
+.Linfo_string1:
+	.asciz	"pr-46466.cpp"                  # string offset=111
+.Linfo_string2:
+	.asciz	"/data/projects/scripts/regression-suite/input/general" # string offset=124
+.Linfo_string3:
+	.asciz	"S"                             # string offset=178
+.Linfo_string4:
+	.asciz	"U"                             # string offset=180
+.Linfo_string5:
+	.asciz	"Union"                         # string offset=182
+.Linfo_string6:
+	.asciz	"Struct"                        # string offset=188
+.Linfo_string7:
+	.asciz	"_Z4testv"                      # string offset=195
+.Linfo_string8:
+	.asciz	"test"                          # string offset=204
+.Linfo_string9:
+	.asciz	"int"                           # string offset=209
+	.ident	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.section	.custom_section.producers,"",@
+	.int8	2
+	.int8	8
+	.ascii	"language"
+	.int8	1
+	.int8	14
+	.ascii	"C_plus_plus_14"
+	.int8	0
+	.int8	12
+	.ascii	"processed-by"
+	.int8	1
+	.int8	5
+	.ascii	"clang"
+	.int8	96
+	.ascii	"19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.section	.debug_str,"S",@
+	.section	.custom_section.target_features,"",@
+	.int8	2
+	.int8	43
+	.int8	15
+	.ascii	"mutable-globals"
+	.int8	43
+	.int8	8
+	.ascii	"sign-ext"
+	.section	.debug_str,"S",@
+	.section	.debug_line,"",@
+.Lline_table_start0:
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-46466.cpp b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-46466.cpp
new file mode 100644
index 0000000000000..28be9a5d60ec3
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/pr-46466.cpp
@@ -0,0 +1,11 @@
+struct Struct {
+  union Union {
+    enum NestedEnum { RED, BLUE };
+  };
+  Union U;
+};
+
+Struct S;
+int test() {
+  return S.U.BLUE;
+}
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/test-clang.s b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/test-clang.s
new file mode 100644
index 0000000000000..02afaf70d0ea8
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/test-clang.s
@@ -0,0 +1,366 @@
+	.text
+	.file	"test.cpp"
+	.globaltype	__stack_pointer, i32
+	.functype	_Z3fooPKijb (i32, i32, i32) -> (i32)
+	.section	.text._Z3fooPKijb,"",@
+	.hidden	_Z3fooPKijb                     # -- Begin function _Z3fooPKijb
+	.globl	_Z3fooPKijb
+	.type	_Z3fooPKijb,@function
+_Z3fooPKijb:                            # @_Z3fooPKijb
+.Lfunc_begin0:
+	.file	1 "/data/projects/scripts/regression-suite/input/general" "test.cpp"
+	.loc	1 2 0                           # test.cpp:2:0
+	.functype	_Z3fooPKijb (i32, i32, i32) -> (i32)
+	.local  	i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32
+# %bb.0:                                # %entry
+	global.get	__stack_pointer
+	local.set	3
+	i32.const	32
+	local.set	4
+	local.get	3
+	local.get	4
+	i32.sub 
+	local.set	5
+	local.get	5
+	local.get	0
+	i32.store	24
+	local.get	5
+	local.get	1
+	i32.store	20
+	local.get	2
+	local.set	6
+	local.get	5
+	local.get	6
+	i32.store8	19
+.Ltmp0:
+	.loc	1 3 7 prologue_end              # test.cpp:3:7
+	local.get	5
+	i32.load8_u	19
+	local.set	7
+.Ltmp1:
+	.loc	1 3 7 is_stmt 0                 # test.cpp:3:7
+	i32.const	1
+	local.set	8
+	local.get	7
+	local.get	8
+	i32.and 
+	local.set	9
+	block   	
+	block   	
+	local.get	9
+	i32.eqz
+	br_if   	0                               # 0: down to label1
+# %bb.1:                                # %if.then
+.Ltmp2:
+	.loc	1 5 19 is_stmt 1                # test.cpp:5:19
+	i32.const	7
+	local.set	10
+	local.get	5
+	local.get	10
+	i32.store	12
+	.loc	1 6 5                           # test.cpp:6:5
+	i32.const	7
+	local.set	11
+	local.get	5
+	local.get	11
+	i32.store	28
+	br      	1                               # 1: down to label0
+.Ltmp3:
+.LBB0_2:                                # %if.end
+	.loc	1 0 5 is_stmt 0                 # test.cpp:0:5
+	end_block                               # label1:
+	.loc	1 8 10 is_stmt 1                # test.cpp:8:10
+	local.get	5
+	i32.load	20
+	local.set	12
+	.loc	1 8 3 is_stmt 0                 # test.cpp:8:3
+	local.get	5
+	local.get	12
+	i32.store	28
+.LBB0_3:                                # %return
+	.loc	1 0 3                           # test.cpp:0:3
+	end_block                               # label0:
+	.loc	1 9 1 is_stmt 1                 # test.cpp:9:1
+	local.get	5
+	i32.load	28
+	local.set	13
+	local.get	13
+	return
+	end_function
+.Ltmp4:
+.Lfunc_end0:
+                                        # -- End function
+	.section	.debug_abbrev,"",@
+	.int8	1                               # Abbreviation Code
+	.int8	17                              # DW_TAG_compile_unit
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	37                              # DW_AT_producer
+	.int8	14                              # DW_FORM_strp
+	.int8	19                              # DW_AT_language
+	.int8	5                               # DW_FORM_data2
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	16                              # DW_AT_stmt_list
+	.int8	23                              # DW_FORM_sec_offset
+	.int8	27                              # DW_AT_comp_dir
+	.int8	14                              # DW_FORM_strp
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	2                               # Abbreviation Code
+	.int8	46                              # DW_TAG_subprogram
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	64                              # DW_AT_frame_base
+	.int8	24                              # DW_FORM_exprloc
+	.int8	110                             # DW_AT_linkage_name
+	.int8	14                              # DW_FORM_strp
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	63                              # DW_AT_external
+	.int8	25                              # DW_FORM_flag_present
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	3                               # Abbreviation Code
+	.int8	5                               # DW_TAG_formal_parameter
+	.int8	0                               # DW_CHILDREN_no
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	4                               # Abbreviation Code
+	.int8	11                              # DW_TAG_lexical_block
+	.int8	1                               # DW_CHILDREN_yes
+	.int8	17                              # DW_AT_low_pc
+	.int8	1                               # DW_FORM_addr
+	.int8	18                              # DW_AT_high_pc
+	.int8	6                               # DW_FORM_data4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	5                               # Abbreviation Code
+	.int8	52                              # DW_TAG_variable
+	.int8	0                               # DW_CHILDREN_no
+	.int8	2                               # DW_AT_location
+	.int8	24                              # DW_FORM_exprloc
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	6                               # Abbreviation Code
+	.int8	22                              # DW_TAG_typedef
+	.int8	0                               # DW_CHILDREN_no
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	58                              # DW_AT_decl_file
+	.int8	11                              # DW_FORM_data1
+	.int8	59                              # DW_AT_decl_line
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	7                               # Abbreviation Code
+	.int8	36                              # DW_TAG_base_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	3                               # DW_AT_name
+	.int8	14                              # DW_FORM_strp
+	.int8	62                              # DW_AT_encoding
+	.int8	11                              # DW_FORM_data1
+	.int8	11                              # DW_AT_byte_size
+	.int8	11                              # DW_FORM_data1
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	8                               # Abbreviation Code
+	.int8	15                              # DW_TAG_pointer_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	9                               # Abbreviation Code
+	.int8	38                              # DW_TAG_const_type
+	.int8	0                               # DW_CHILDREN_no
+	.int8	73                              # DW_AT_type
+	.int8	19                              # DW_FORM_ref4
+	.int8	0                               # EOM(1)
+	.int8	0                               # EOM(2)
+	.int8	0                               # EOM(3)
+	.section	.debug_info,"",@
+.Lcu_begin0:
+	.int32	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.int16	4                               # DWARF version number
+	.int32	.debug_abbrev0                  # Offset Into Abbrev. Section
+	.int8	4                               # Address Size (in bytes)
+	.int8	1                               # Abbrev [1] 0xb:0xb5 DW_TAG_compile_unit
+	.int32	.Linfo_string0                  # DW_AT_producer
+	.int16	33                              # DW_AT_language
+	.int32	.Linfo_string1                  # DW_AT_name
+	.int32	.Lline_table_start0             # DW_AT_stmt_list
+	.int32	.Linfo_string2                  # DW_AT_comp_dir
+	.int32	.Lfunc_begin0                   # DW_AT_low_pc
+	.int32	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.int8	2                               # Abbrev [2] 0x26:0x6a DW_TAG_subprogram
+	.int32	.Lfunc_begin0                   # DW_AT_low_pc
+	.int32	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.int8	4                               # DW_AT_frame_base
+	.int8	237
+	.int8	0
+	.int8	5
+	.int8	159
+	.int32	.Linfo_string3                  # DW_AT_linkage_name
+	.int32	.Linfo_string4                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	2                               # DW_AT_decl_line
+	.int32	144                             # DW_AT_type
+                                        # DW_AT_external
+	.int8	3                               # Abbrev [3] 0x42:0xe DW_TAG_formal_parameter
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	24
+	.int32	.Linfo_string6                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	2                               # DW_AT_decl_line
+	.int32	151                             # DW_AT_type
+	.int8	3                               # Abbrev [3] 0x50:0xe DW_TAG_formal_parameter
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	20
+	.int32	.Linfo_string8                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	2                               # DW_AT_decl_line
+	.int32	172                             # DW_AT_type
+	.int8	3                               # Abbrev [3] 0x5e:0xe DW_TAG_formal_parameter
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	19
+	.int32	.Linfo_string10                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	2                               # DW_AT_decl_line
+	.int32	179                             # DW_AT_type
+	.int8	4                               # Abbrev [4] 0x6c:0x18 DW_TAG_lexical_block
+	.int32	.Ltmp2                          # DW_AT_low_pc
+	.int32	.Ltmp3-.Ltmp2                   # DW_AT_high_pc
+	.int8	5                               # Abbrev [5] 0x75:0xe DW_TAG_variable
+	.int8	2                               # DW_AT_location
+	.int8	145
+	.int8	12
+	.int32	.Linfo_string12                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	5                               # DW_AT_decl_line
+	.int32	186                             # DW_AT_type
+	.int8	0                               # End Of Children Mark
+	.int8	6                               # Abbrev [6] 0x84:0xb DW_TAG_typedef
+	.int32	144                             # DW_AT_type
+	.int32	.Linfo_string13                 # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	4                               # DW_AT_decl_line
+	.int8	0                               # End Of Children Mark
+	.int8	7                               # Abbrev [7] 0x90:0x7 DW_TAG_base_type
+	.int32	.Linfo_string5                  # DW_AT_name
+	.int8	5                               # DW_AT_encoding
+	.int8	4                               # DW_AT_byte_size
+	.int8	6                               # Abbrev [6] 0x97:0xb DW_TAG_typedef
+	.int32	162                             # DW_AT_type
+	.int32	.Linfo_string7                  # DW_AT_name
+	.int8	1                               # DW_AT_decl_file
+	.int8	1                               # DW_AT_decl_line
+	.int8	8                               # Abbrev [8] 0xa2:0x5 DW_TAG_pointer_type
+	.int32	167                             # DW_AT_type
+	.int8	9                               # Abbrev [9] 0xa7:0x5 DW_TAG_const_type
+	.int32	144                             # DW_AT_type
+	.int8	7                               # Abbrev [7] 0xac:0x7 DW_TAG_base_type
+	.int32	.Linfo_string9                  # DW_AT_name
+	.int8	7                               # DW_AT_encoding
+	.int8	4                               # DW_AT_byte_size
+	.int8	7                               # Abbrev [7] 0xb3:0x7 DW_TAG_base_type
+	.int32	.Linfo_string11                 # DW_AT_name
+	.int8	2                               # DW_AT_encoding
+	.int8	1                               # DW_AT_byte_size
+	.int8	9                               # Abbrev [9] 0xba:0x5 DW_TAG_const_type
+	.int32	132                             # DW_AT_type
+	.int8	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.section	.debug_str,"S",@
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)" # string offset=0
+.Linfo_string1:
+	.asciz	"test.cpp"                      # string offset=111
+.Linfo_string2:
+	.asciz	"/data/projects/scripts/regression-suite/input/general" # string offset=120
+.Linfo_string3:
+	.asciz	"_Z3fooPKijb"                   # string offset=174
+.Linfo_string4:
+	.asciz	"foo"                           # string offset=186
+.Linfo_string5:
+	.asciz	"int"                           # string offset=190
+.Linfo_string6:
+	.asciz	"ParamPtr"                      # string offset=194
+.Linfo_string7:
+	.asciz	"INTPTR"                        # string offset=203
+.Linfo_string8:
+	.asciz	"ParamUnsigned"                 # string offset=210
+.Linfo_string9:
+	.asciz	"unsigned int"                  # string offset=224
+.Linfo_string10:
+	.asciz	"ParamBool"                     # string offset=237
+.Linfo_string11:
+	.asciz	"bool"                          # string offset=247
+.Linfo_string12:
+	.asciz	"CONSTANT"                      # string offset=252
+.Linfo_string13:
+	.asciz	"INTEGER"                       # string offset=261
+	.ident	"clang version 19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.section	.custom_section.producers,"",@
+	.int8	2
+	.int8	8
+	.ascii	"language"
+	.int8	1
+	.int8	14
+	.ascii	"C_plus_plus_14"
+	.int8	0
+	.int8	12
+	.ascii	"processed-by"
+	.int8	1
+	.int8	5
+	.ascii	"clang"
+	.int8	96
+	.ascii	"19.0.0git (/data/projects/llvm-root/llvm-project/clang 2db6703f0c257d293df455e2dff8c1fb695c4100)"
+	.section	.debug_str,"S",@
+	.section	.custom_section.target_features,"",@
+	.int8	2
+	.int8	43
+	.int8	15
+	.ascii	"mutable-globals"
+	.int8	43
+	.int8	8
+	.ascii	"sign-ext"
+	.section	.debug_str,"S",@
+	.section	.debug_line,"",@
+.Lline_table_start0:
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/test.cpp b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/test.cpp
new file mode 100644
index 0000000000000..5cf39f4773584
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/Inputs/test.cpp
@@ -0,0 +1,9 @@
+using INTPTR = const int *;
+int foo(INTPTR ParamPtr, unsigned ParamUnsigned, bool ParamBool) {
+  if (ParamBool) {
+    typedef int INTEGER;
+    const INTEGER CONSTANT = 7;
+    return CONSTANT;
+  }
+  return ParamUnsigned;
+}
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/README.txt b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/README.txt
new file mode 100644
index 0000000000000..6937bb0f41342
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/README.txt
@@ -0,0 +1,28 @@
+Notes:
+------
+As we should avoid committing binaries (.wasm) to be used in tests,
+instead we provide the '.cpp' source files and the '.s' files.
+
+- For the tests, only the '.s' files are required.
+- We use the target 'wasm32' as the 'wasm64' is not standardized yet.
+
+How to generate .s from .cpp
+----------------------------
+Use clang to generate the '.s'.
+
+  clang --target=wasm32 -S -g Inputs/hello-world.cpp -o Inputs/hello-world-clang.s
+  clang --target=wasm32 -S -g Inputs/pr-43860.cpp    -o Inputs/pr-43860-clang.s
+  clang --target=wasm32 -S -g Inputs/pr-44884.cpp    -o Inputs/pr-44884-clang.s
+  clang --target=wasm32 -S -g Inputs/pr-46466.cpp    -o Inputs/pr-46466-clang.s
+  clang --target=wasm32 -S -g Inputs/test.cpp        -o Inputs/test-clang.s
+
+How to generate .o from .s
+--------------------------------
+Each test executes one of the following commands in order to generate
+the binary '.wasm' used by that specific test:
+
+  llvm-mc -arch=wasm32 -filetype=obj %p/Inputs/hello-world-clang.s -o hello-world-clang.o
+  llvm-mc -arch=wasm32 -filetype=obj %p/Inputs/pr-43860-clang.s    -o pr-43860-clang.o
+  llvm-mc -arch=wasm32 -filetype=obj %p/Inputs/pr-44884-clang.s    -o pr-44884-clang.o
+  llvm-mc -arch=wasm32 -filetype=obj %p/Inputs/pr-46466-clang.s    -o pr-46466-clang.o
+  llvm-mc -arch=wasm32 -filetype=obj %p/Inputs/test-clang.s        -o test-clang.o
diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
index 2257e453e0a81..d686293c9b430 100644
--- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
@@ -1023,7 +1023,7 @@
 # CHECK-NEXT:  2      2     1.00                        blxne	r2
 # CHECK-NEXT:  2      1     1.00                  U     blx	#32424576
 # CHECK-NEXT:  2      1     1.00                  U     blx	#16212288
-# CHECK-NEXT:  1      1     1.00                  U     bx	r2
+# CHECK-NEXT:  1      1     1.00                        bx	r2
 # CHECK-NEXT:  1      1     1.00                  U     bxne	r2
 # CHECK-NEXT:  1      1     1.00                  U     bxj	r2
 # CHECK-NEXT:  1      1     1.00                  U     bxjne	r2
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s
index 892a5d14e8f3a..03f7de2fe9a4c 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s
@@ -180,10 +180,10 @@ jr a0
 # CHECK-NEXT:  1      3     0.50                        sext.b	a0, a0
 # CHECK-NEXT:  1      3     0.50                        sext.h	a0, a0
 # CHECK-NEXT:  1      3     0.50                        zext.h	a0, a0
-# CHECK-NEXT:  1      3     0.50                        min	a0, a0, a0
-# CHECK-NEXT:  1      3     0.50                        minu	a0, a0, a0
-# CHECK-NEXT:  1      3     0.50                        max	a0, a0, a0
-# CHECK-NEXT:  1      3     0.50                        maxu	a0, a0, a0
+# CHECK-NEXT:  1      3     1.00                        min	a0, a0, a0
+# CHECK-NEXT:  1      3     1.00                        minu	a0, a0, a0
+# CHECK-NEXT:  1      3     1.00                        max	a0, a0, a0
+# CHECK-NEXT:  1      3     1.00                        maxu	a0, a0, a0
 # CHECK-NEXT:  1      3     1.00                        rol	a0, a0, a0
 # CHECK-NEXT:  1      3     1.00                        ror	a0, a0, a0
 # CHECK-NEXT:  1      3     1.00                        rori	a0, a0, 1
@@ -225,7 +225,7 @@ jr a0
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
-# CHECK-NEXT:  -      -     39.00  52.00   -      -      -      -
+# CHECK-NEXT:  -      -     37.00  54.00   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    Instructions:
@@ -289,9 +289,9 @@ jr a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     sext.h	a0, a0
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -     zext.h	a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     min	a0, a0, a0
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     minu	a0, a0, a0
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minu	a0, a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     max	a0, a0, a0
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     maxu	a0, a0, a0
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxu	a0, a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rol	a0, a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     ror	a0, a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rori	a0, a0, 1
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vector-integer-arithmetic.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vector-integer-arithmetic.s
index 21459bc45d453..3b6fd7e150137 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vector-integer-arithmetic.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vector-integer-arithmetic.s
@@ -399,6 +399,26 @@ vmseq.vv v4, v8, v12
 vsetvli zero, zero, e64, m8, tu, mu
 vmseq.vx v4, v8, x10
 
+# Pseudo instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vmslt.vi v4, v8, 1
+vsetvli zero, zero, e8, mf4, tu, mu
+vmsltu.vi v4, v8, 1
+vsetvli zero, zero, e8, mf2, tu, mu
+vmsltu.vi v4, v8, 0
+vsetvli zero, zero, e8, m1, tu, mu
+vmsgeu.vi v4, v8, 1
+vsetvli zero, zero, e8, m2, tu, mu
+vmsge.vi v4, v8, 1
+vsetvli zero, zero, e8, m4, tu, mu
+vmsgeu.vi v4, v8, 0
+vsetvli zero, zero, e16, mf4, tu, mu
+vmsge.vi v4, v8, 0
+vsetvli zero, zero, e16, mf2, tu, mu
+vmsge.vx v4, v8, x10
+vsetvli zero, zero, e16, m1, tu, mu
+vmsgeu.vx v4, v8, x11
+
 # Vector Integer Min/Max Instructions
 vsetvli zero, zero, e8, mf8, tu, mu
 vminu.vv v4, v8, v12
@@ -754,14 +774,14 @@ vsetvli zero, zero, e64, m8, tu, mu
 vmv.v.v v4, v12
 
 # CHECK:      Iterations:        1
-# CHECK-NEXT: Instructions:      707
-# CHECK-NEXT: Total Cycles:      11962
-# CHECK-NEXT: Total uOps:        707
+# CHECK-NEXT: Instructions:      727
+# CHECK-NEXT: Total Cycles:      12018
+# CHECK-NEXT: Total uOps:        727
 
 # CHECK:      Dispatch Width:    2
 # CHECK-NEXT: uOps Per Cycle:    0.06
 # CHECK-NEXT: IPC:               0.06
-# CHECK-NEXT: Block RThroughput: 11549.0
+# CHECK-NEXT: Block RThroughput: 11583.0
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
@@ -1144,6 +1164,26 @@ vmv.v.v v4, v12
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
 # CHECK-NEXT:  1      19    17.00                       vmseq.vx	v4, v8, a0
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     2.00                        vmsle.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     2.00                        vmsleu.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     2.00                        vmsne.vv	v4, v8, v8
+# CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      5     3.00                        vmsgtu.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      7     5.00                        vmsgt.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      11    9.00                        vmseq.vv	v4, v8, v8
+# CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     2.00                        vmsgt.vi	v4, v8, -1
+# CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     2.00                        vmslt.vx	v4, v8, a0
+# CHECK-NEXT:  1      4     2.00                        vmnot.m	v4, v4
+# CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      5     3.00                        vmsltu.vx	v4, v8, a1
+# CHECK-NEXT:  1      4     2.00                        vmnot.m	v4, v4
+# CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
 # CHECK-NEXT:  1      4     2.00                        vminu.vv	v4, v8, v12
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
 # CHECK-NEXT:  1      4     2.00                        vminu.vx	v4, v8, a0
@@ -1492,7 +1532,7 @@ vmv.v.v v4, v12
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
-# CHECK-NEXT:  -      -     333.00  -     11549.00 374.00  -    -
+# CHECK-NEXT:  -      -     342.00  -     11583.00 385.00  -    -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    Instructions:
@@ -1868,6 +1908,26 @@ vmv.v.v v4, v12
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
 # CHECK-NEXT:  -      -      -      -     17.00  1.00    -      -     vmseq.vx	v4, v8, a0
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00   1.00    -      -     vmsle.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00   1.00    -      -     vmsleu.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00   1.00    -      -     vmsne.vv	v4, v8, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00   1.00    -      -     vmsgtu.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00   1.00    -      -     vmsgt.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00   1.00    -      -     vmseq.vv	v4, v8, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00   1.00    -      -     vmsgt.vi	v4, v8, -1
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00   1.00    -      -     vmslt.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -     2.00   1.00    -      -     vmnot.m	v4, v4
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00   1.00    -      -     vmsltu.vx	v4, v8, a1
+# CHECK-NEXT:  -      -      -      -     2.00   1.00    -      -     vmnot.m	v4, v4
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
 # CHECK-NEXT:  -      -      -      -     2.00   1.00    -      -     vminu.vv	v4, v8, v12
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
 # CHECK-NEXT:  -      -      -      -     2.00   1.00    -      -     vminu.vx	v4, v8, a0
diff --git a/llvm/test/tools/llvm-objcopy/ELF/Inputs/compress-debug-sections.yaml b/llvm/test/tools/llvm-objcopy/ELF/Inputs/compress-debug-sections.yaml
index 67d8435fa486c..e2dfee9163a2b 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/Inputs/compress-debug-sections.yaml
+++ b/llvm/test/tools/llvm-objcopy/ELF/Inputs/compress-debug-sections.yaml
@@ -43,6 +43,10 @@ Sections:
     Type:            SHT_PROGBITS
     Flags:           [ SHF_GROUP ]
     Content:         '00'
+  - Name:            .debug_alloc
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Content:         000102030405060708090a0b0c0d0e0f000102030405060708090a0b0c0d0e0f000102030405060708090a0b0c0d0e0f000102030405060708090a0b0c0d0e0f
 Symbols:
   - Type:    STT_SECTION
     Section: .debug_foo
diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zlib.test b/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zlib.test
index e1ebeed8d4fcb..056ae84ce4915 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zlib.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zlib.test
@@ -12,8 +12,10 @@
 # CHECK:             Name              Type            Address          Off    Size   ES Flg Lk Inf Al
 # COMPRESSED:        .debug_foo        PROGBITS        0000000000000000 000040 {{.*}} 00   C  0   0  8
 # COMPRESSED-NEXT:   .notdebug_foo     PROGBITS        0000000000000000 {{.*}} 000008 00      0   0  0
+# COMPRESSED:        .debug_alloc      PROGBITS        0000000000000000 {{.*}} 000040 00   A  0   0  0
 # UNCOMPRESSED:      .debug_foo        PROGBITS        0000000000000000 000040 000008 00      0   0  0
 # UNCOMPRESSED-NEXT: .notdebug_foo     PROGBITS        0000000000000000 {{.*}} 000008 00      0   0  0
+# UNCOMPRESSED:      .debug_alloc      PROGBITS        0000000000000000 {{.*}} 000040 00   A  0   0  0
 
 ## Relocations do not change.
 # CHECK:             Relocation section '.rela.debug_foo' at offset {{.*}} contains 2 entries:
diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zstd.test b/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zstd.test
index d763131c4067d..bde1c2f311d06 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zstd.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/compress-debug-sections-zstd.test
@@ -12,8 +12,10 @@
 # CHECK:             Name              Type            Address          Off    Size   ES Flg Lk Inf Al
 # COMPRESSED:        .debug_foo        PROGBITS        0000000000000000 000040 {{.*}} 00   C  0   0  8
 # COMPRESSED-NEXT:   .notdebug_foo     PROGBITS        0000000000000000 {{.*}} 000008 00      0   0  0
+# COMPRESSED:        .debug_alloc      PROGBITS        0000000000000000 {{.*}} 000040 00   A  0   0  0
 # DECOMPRESSED:      .debug_foo        PROGBITS        0000000000000000 000040 000008 00      0   0  0
 # DECOMPRESSED-NEXT: .notdebug_foo     PROGBITS        0000000000000000 {{.*}} 000008 00      0   0  0
+# DECOMPRESSED:      .debug_alloc      PROGBITS        0000000000000000 {{.*}} 000040 00   A  0   0  0
 
 ## Relocations do not change.
 # CHECK:             Relocation section '.rela.debug_foo' at offset {{.*}} contains 2 entries:
diff --git a/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
new file mode 100644
index 0000000000000..4258ddbe66a3e
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
@@ -0,0 +1,36 @@
+# REQUIRES: zlib
+## Test decompression for different sections.
+
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy --decompress-debug-sections %t %t.de
+# RUN: llvm-readelf -S %t.de | FileCheck %s
+
+# CHECK:        Name              Type            Address          Off      Size     ES Flg Lk Inf Al
+# CHECK:        .debug_alloc      PROGBITS        0000000000000000 [[#%x,]] [[#%x,]] 00  AC  0   0  0
+# CHECK-NEXT:   .debug_nonalloc   PROGBITS        0000000000000000 [[#%x,]] [[#%x,]] 00      0   0  1
+# CHECK-NEXT:   .debugx           PROGBITS        0000000000000000 [[#%x,]] [[#%x,]] 00      0   0  1
+# CHECK-NEXT:   nodebug           PROGBITS        0000000000000000 [[#%x,]] [[#%x,]] 00   C  0   0  0
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name:      .debug_alloc
+    Type:      SHT_PROGBITS
+    Flags:     [ SHF_ALLOC, SHF_COMPRESSED ]
+    Content:   010000000000000040000000000000000100000000000000789cd36280002d3269002f800151
+  - Name:      .debug_nonalloc
+    Type:      SHT_PROGBITS
+    Flags:     [ SHF_COMPRESSED ]
+    Content:   010000000000000040000000000000000100000000000000789cd36280002d3269002f800151
+  - Name:      .debugx
+    Type:      SHT_PROGBITS
+    Flags:     [ SHF_COMPRESSED ]
+    Content:   010000000000000040000000000000000100000000000000789cd36280002d3269002f800151
+  - Name:      nodebug
+    Type:      SHT_PROGBITS
+    Flags:     [ SHF_COMPRESSED ]
+    Content:   010000000000000040000000000000000100000000000000789cd36280002d3269002f800151
diff --git a/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64.dylib b/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64.dylib
new file mode 100755
index 0000000000000..051e28f33d749
Binary files /dev/null and b/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64.dylib differ
diff --git a/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64_32.dylib b/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64_32.dylib
new file mode 100755
index 0000000000000..d3a339057abc3
Binary files /dev/null and b/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64_32.dylib differ
diff --git a/llvm/test/tools/llvm-objdump/MachO/AArch64/macho-relative-method-lists.test b/llvm/test/tools/llvm-objdump/MachO/AArch64/macho-relative-method-lists.test
new file mode 100644
index 0000000000000..b1b96a41a3293
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/MachO/AArch64/macho-relative-method-lists.test
@@ -0,0 +1,86 @@
+RUN: llvm-objdump --macho --objc-meta-data    %p/Inputs/rel-method-lists-arm64_32.dylib | FileCheck %s --check-prefix=CHK32
+RUN: llvm-otool -ov                           %p/Inputs/rel-method-lists-arm64_32.dylib | FileCheck %s --check-prefix=CHK32
+
+RUN: llvm-objdump --macho --objc-meta-data    %p/Inputs/rel-method-lists-arm64.dylib    | FileCheck %s --check-prefix=CHK64
+RUN: llvm-otool -ov                           %p/Inputs/rel-method-lists-arm64.dylib    | FileCheck %s --check-prefix=CHK64
+
+CHK32:                 baseMethods 0x660 (struct method_list_t *)
+CHK32-NEXT:                 entsize 12 (relative)
+CHK32-NEXT:                   count 3
+CHK32-NEXT:                    name 0x144 (0x{{[0-9a-f]*}}) instance_method_00
+CHK32-NEXT:                   types 0x91 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffff18 (0x{{[0-9a-f]*}}) -[MyClass instance_method_00]
+CHK32-NEXT:                    name 0x13c (0x{{[0-9a-f]*}}) instance_method_01
+CHK32-NEXT:                   types 0x85 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffff28 (0x{{[0-9a-f]*}}) -[MyClass instance_method_01]
+CHK32-NEXT:                    name 0x134 (0x{{[0-9a-f]*}}) instance_method_02
+CHK32-NEXT:                   types 0x79 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffff38 (0x{{[0-9a-f]*}}) -[MyClass instance_method_02]
+
+CHK32:                 baseMethods 0x630 (struct method_list_t *)
+CHK32-NEXT:                 entsize 12 (relative)
+CHK32-NEXT:                   count 3
+CHK32-NEXT:                    name 0x180 (0x{{[0-9a-f]*}}) class_method_00
+CHK32-NEXT:                   types 0xc1 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffff9c (0x{{[0-9a-f]*}}) +[MyClass class_method_00]
+CHK32-NEXT:                    name 0x178 (0x{{[0-9a-f]*}}) class_method_01
+CHK32-NEXT:                   types 0xb5 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffffac (0x{{[0-9a-f]*}}) +[MyClass class_method_01]
+CHK32-NEXT:                    name 0x170 (0x{{[0-9a-f]*}}) class_method_02
+CHK32-NEXT:                   types 0xa9 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffffbc (0x{{[0-9a-f]*}}) +[MyClass class_method_02]
+
+CHK64:                  baseMethods 0x6e0 (struct method_list_t *)
+CHK64-NEXT:                  entsize 12 (relative)
+CHK64-NEXT:                    count 3
+CHK64-NEXT:                     name 0x188 (0x{{[0-9a-f]*}}) instance_method_00
+CHK64-NEXT:                    types 0x91 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffa8 (0x{{[0-9a-f]*}}) -[MyClass instance_method_00]
+CHK64-NEXT:                     name 0x184 (0x{{[0-9a-f]*}}) instance_method_01
+CHK64-NEXT:                    types 0x85 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffa0 (0x{{[0-9a-f]*}}) -[MyClass instance_method_01]
+CHK64-NEXT:                     name 0x180 (0x{{[0-9a-f]*}}) instance_method_02
+CHK64-NEXT:                    types 0x79 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffff98 (0x{{[0-9a-f]*}}) -[MyClass instance_method_02]
+
+CHK64:                  baseMethods 0x6b0 (struct method_list_t *)
+CHK64-NEXT:                  entsize 12 (relative)
+CHK64-NEXT:                    count 3
+CHK64-NEXT:                     name 0x1d0 (0x{{[0-9a-f]*}}) class_method_00
+CHK64-NEXT:                    types 0xc1 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffe4 (0x{{[0-9a-f]*}}) +[MyClass class_method_00]
+CHK64-NEXT:                     name 0x1cc (0x{{[0-9a-f]*}}) class_method_01
+CHK64-NEXT:                    types 0xb5 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffdc (0x{{[0-9a-f]*}}) +[MyClass class_method_01]
+CHK64-NEXT:                     name 0x1c8 (0x{{[0-9a-f]*}}) class_method_02
+CHK64-NEXT:                    types 0xa9 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffd4 (0x{{[0-9a-f]*}}) +[MyClass class_method_02]
+
+######## Generate rel-method-lists-arm64.dylib ########
+// clang -c main.mm -o main.o -target arm64-apple-macos -arch arm64
+// ld64.ld64 -dylib -demangle -dynamic main.o -o rel-method-lists-arm64.dylib -syslibroot MacOSX14.2.sdk -segalign 0x10 -objc_relative_method_lists
+
+######## Generate rel-method-lists-arm64_32.dylib ########
+// clang -c main.mm -o main.o -target arm64_32-apple-watchos -arch arm64_32
+// ld64.ld64 -dylib -demangle -dynamic main.o -o rel-method-lists-arm64_32.dylib -syslibroot WatchOS.sdk -segalign 0x10 -objc_relative_method_lists
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~ main.mm ~~~~~~~~~~~~~~~~~~~~~~~~~
+__attribute__((objc_root_class))
+@interface MyClass
+- (void)instance_method_00;
+- (void)instance_method_01;
+- (void)instance_method_02;
++ (void)class_method_00;
++ (void)class_method_01;
++ (void)class_method_02;
+@end
+@implementation MyClass
+- (void)instance_method_00 {}
+- (void)instance_method_01 {}
+- (void)instance_method_02 {}
++ (void)class_method_00 {}
++ (void)class_method_01 {}
++ (void)class_method_02 {}
+@end
+void *_objc_empty_cache;
+void *_objc_empty_vtable;
diff --git a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe
new file mode 100644
index 0000000000000..309476a2437f0
Binary files /dev/null and b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe differ
diff --git a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
new file mode 100644
index 0000000000000..96eb878c31667
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
@@ -0,0 +1,13 @@
+PERF_RECORD_MMAP2 5752/0: [0x7ff70a1b0000(0x640000) @ 0x1000 00:00 0 0]: r-xp c:\Users\haohaiwe\Desktop\coff-profile.exe
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
diff --git a/llvm/test/tools/llvm-profgen/coff-profile.test b/llvm/test/tools/llvm-profgen/coff-profile.test
new file mode 100644
index 0000000000000..5578f731cac83
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/coff-profile.test
@@ -0,0 +1,79 @@
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/coff-profile.perfscript --binary=%S/Inputs/coff-profile.exe --output=%t
+; RUN: FileCheck %s --input-file %t
+
+CHECK:      main:31837:0
+CHECK-NEXT:  0: 0
+CHECK-NEXT:  3.1: 0
+CHECK-NEXT:  3.2: 0
+CHECK-NEXT:  8: 0
+CHECK-NEXT:  65501: 0
+CHECK-NEXT:  1: ??$init@HG@MyNameSpace2@@YAXHPEAG@Z:0
+CHECK-NEXT:   1: 0
+CHECK-NEXT:   1.1: 0
+CHECK-NEXT:   1.2: 0
+CHECK-NEXT:   2: 0
+CHECK-NEXT:   65514: 0
+CHECK-NEXT:  4: ?work1@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:3193
+CHECK-NEXT:   0: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:3193
+CHECK-NEXT:    1.1: 31
+CHECK-NEXT:    1.2: 31
+CHECK-NEXT:    2: 31
+CHECK-NEXT:    3: 31
+CHECK-NEXT:    65530: 0
+CHECK-NEXT:  5: ?work2@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:28644
+CHECK-NEXT:   0: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:28644
+CHECK-NEXT:    1.1: 341
+CHECK-NEXT:    1.2: 341
+CHECK-NEXT:    2: 341
+CHECK-NEXT:    3: 341
+CHECK-NEXT:    65530: 0
+CHECK-NEXT:  7: ?print@MyNameSpace2@@YAXPEAGH@Z:0
+CHECK-NEXT:   1: 0
+
+; Original code
+; clang-cl.exe -O2 -gdwarf -gline-tables-only coff-profile.cpp -fuse-ld=lld -Xclang -fdebug-info-for-profiling -link -debug:dwarf
+
+#include <stdio.h>
+
+namespace MyNameSpace1 {
+
+template <typename T1, typename T2> class MyClass {
+  void work(T1 map[], T2 n, T2 m) {
+    for (int i = 1; i < n; i++) {
+      map[i] = map[i - 1] * map[i - 1];
+      map[i] += (i * map[i - 1]) / m + i % m;
+    }
+  }
+
+public:
+  void work1(T1 map[], T2 n) { work(map, n, 7); }
+  void work2(T1 map[], T2 n) { work(map, n, 3); }
+};
+
+} // namespace MyNameSpace1
+
+namespace MyNameSpace2 {
+
+template <typename T1, typename T2> void init(T1 c, T2 *p) {
+  for (int i = 0; i < c * 1000000; i++) {
+    p[i] = i / 3 + (i * i) % 3;
+  }
+}
+
+void print(unsigned short *p, int i) {
+  printf("%d %d %d\n", p[i * i * 100], p[i * i * 100 + 1], p[i * i * 100 + 2]);
+}
+
+} // namespace MyNameSpace2
+
+unsigned short M[3000000];
+int main(int argc, char *argv[]) {
+  MyNameSpace2::init(argc, M);
+  MyNameSpace1::MyClass<unsigned short, int> Obj;
+  for (int i = 0; i <= argc * 10; i++) {
+    Obj.work1(&M[argc], argc * 100000);
+    Obj.work2(&M[argc * argc], argc * 1000000);
+  }
+  MyNameSpace2::print(M, argc);
+  return 0;
+}
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test
index f4c73de7ca6c9..083c296b8cc13 100644
--- a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test
@@ -10,6 +10,7 @@
 # CHECK-NEXT: [    18] x.c.
 # CHECK-NEXT: [    1e] .
 # CHECK-NEXT: [    20] .
+# CHECK-EMPTY:
 # CHECK-NEXT: Hex dump of section '.b':
 # CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time
 # CHECK-NEXT: 0x00000000 01000000 00000000 01000000 00000000 ................
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test
index ea7a8854eb1a0..c1d12a6d560ec 100644
--- a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test
@@ -28,6 +28,7 @@
 # COMPRESSED:      String dump of section '.not_null_terminated':
 # COMPRESSED-NEXT: [     0] no
 # COMPRESSED-NEXT: [     3] null
+# COMPRESSED-EMPTY:
 # COMPRESSED-NEXT: Hex dump of section '.strings':
 # COMPRESSED-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................
 # COMPRESSED-NEXT: 0x00000010 00000000 00000000 789ccb48 2d4a6548 ........x..H-JeH
@@ -39,6 +40,7 @@
 # INVALID:      String dump of section '.invalid1':
 # INVALID-NEXT: warning: '[[FILE]]': corrupted compressed section header
 # INVALID-NEXT: [     0] .
+# INVALID-EMPTY:
 # INVALID-NEXT: Hex dump of section '.invalid2':
 # INVALID-NEXT: warning: '[[FILE]]': zlib error: Z_DATA_ERROR
 # INVALID-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test
index 65da952687f52..98c7cb002769a 100644
--- a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test
@@ -9,6 +9,7 @@
 # CHECK-NEXT: [    10] .
 # CHECK-NEXT: [    18] (./. ..
 # CHECK-NEXT: [    21] .
+# CHECK-EMPTY:
 # CHECK-NEXT: Hex dump of section '.b':
 # CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time
 # CHECK-NEXT: 0x00000000 02000000 00000000 01000000 00000000 ................
diff --git a/llvm/test/tools/llvm-readobj/ELF/hex-dump-multi.s b/llvm/test/tools/llvm-readobj/ELF/hex-dump-multi.s
index 33ef534e81e13..942bfc4b7fb02 100644
--- a/llvm/test/tools/llvm-readobj/ELF/hex-dump-multi.s
+++ b/llvm/test/tools/llvm-readobj/ELF/hex-dump-multi.s
@@ -1,10 +1,12 @@
 # REQUIRES: x86-registered-target
 
 # RUN: llvm-mc -filetype=obj -triple x86_64 %s -o %t.o
-# RUN: llvm-readobj -x .a -x .b %t.o | FileCheck %s
+# RUN: llvm-readobj -x .a -x .b %t.o | FileCheck %s --check-prefixes=HEADER,CHECK
 # RUN: llvm-readelf -x .a -x .b %t.o | FileCheck %s
 
-# CHECK:      Hex dump of section '.a':
+# HEADER:     LoadName:
+# CHECK:      {{^$}}
+# CHECK-NEXT: Hex dump of section '.a':
 # CHECK-NEXT: 0x00000000 00
 # CHECK-EMPTY:
 # CHECK-NEXT: Hex dump of section '.b':
diff --git a/llvm/test/tools/llvm-readobj/ELF/hex-dump.test b/llvm/test/tools/llvm-readobj/ELF/hex-dump.test
index 7829944806f34..71212dee5e07a 100644
--- a/llvm/test/tools/llvm-readobj/ELF/hex-dump.test
+++ b/llvm/test/tools/llvm-readobj/ELF/hex-dump.test
@@ -46,7 +46,8 @@ FileHeader:
 # RUN: llvm-readelf --hex-dump=.sec %t2.out1 | \
 # RUN:   FileCheck %s --match-full-lines --strict-whitespace --check-prefix=SPACES1
 
-#      SPACES1:Hex dump of section '.sec':
+#      SPACES1:{{^$}}
+# SPACES1-NEXT:Hex dump of section '.sec':
 # SPACES1-NEXT:0x00000000 00000000 00000000 00000000 00000000 ................
 # SPACES1-NEXT:0x00000010 0000                                ..
 
@@ -55,7 +56,8 @@ FileHeader:
 # RUN: llvm-readelf --hex-dump=.sec %t2.out2 | \
 # RUN:   FileCheck %s --match-full-lines --strict-whitespace --check-prefix=SPACES2
 
-#      SPACES2:Hex dump of section '.sec':
+#      SPACES2:{{^$}}
+# SPACES2-NEXT:Hex dump of section '.sec':
 # SPACES2-NEXT:0x00000000 00000000 00000000 00000000 00000000 ................
 # SPACES2-NEXT:0x00000010 00000000 00000000 00000000 0000     ..............
 
@@ -64,7 +66,8 @@ FileHeader:
 # RUN: llvm-readelf --hex-dump=.sec %t2.out3 | \
 # RUN:   FileCheck %s --match-full-lines --strict-whitespace --check-prefix=SPACES3
 
-#      SPACES3:Hex dump of section '.sec':
+#      SPACES3:{{^$}}
+# SPACES3-NEXT:Hex dump of section '.sec':
 # SPACES3-NEXT:0x00000000 00000000 00000000 00000000 00000000 ................
 # SPACES3-NEXT:0x00000010 00000000 00000000 00000000          ............
 
diff --git a/llvm/test/tools/llvm-readobj/ELF/machine-specific-section-types.test b/llvm/test/tools/llvm-readobj/ELF/machine-specific-section-types.test
index f9524383e80b6..f65793c5bdc62 100644
--- a/llvm/test/tools/llvm-readobj/ELF/machine-specific-section-types.test
+++ b/llvm/test/tools/llvm-readobj/ELF/machine-specific-section-types.test
@@ -17,6 +17,10 @@
 # RUN: llvm-readobj --section-headers %t-aarch64.o | FileCheck %s --check-prefix=AARCH64-LLVM
 # RUN: llvm-readelf --section-headers %t-aarch64.o | FileCheck %s --check-prefix=AARCH64-GNU
 
+# RUN: yaml2obj %s --docnum=5 -o %t-hexagon.o
+# RUN: llvm-readobj --section-headers %t-hexagon.o | FileCheck %s --check-prefix=HEXAGON-LLVM
+# RUN: llvm-readelf --section-headers %t-hexagon.o | FileCheck %s --check-prefix=HEXAGON-GNU
+
 # ARM-LLVM: Name: exidx
 # ARM-LLVM: Type: SHT_ARM_EXIDX
 # ARM-LLVM: Name: preemptmap
@@ -64,6 +68,13 @@
 # AARCH64-GNU: .memtag.globals.dynamic AARCH64_MEMTAG_GLOBALS_DYNAMIC
 # AARCH64-GNU: .memtag.globals.static  AARCH64_MEMTAG_GLOBALS_STATIC
 
+# HEXAGON-LLVM: Name: hexagon_ordered
+# HEXAGON-LLVM: Type: SHT_HEX_ORDERED
+# HEXAGON-LLVM: Name: .hexagon.attributes
+# HEXAGON-LLVM: Type: SHT_HEXAGON_ATTRIBUTES
+
+# HEXAGON-GNU: hexagon_ordered HEX_ORDERED
+
 --- !ELF
 FileHeader:
   Class:   ELFCLASS64
@@ -122,3 +133,15 @@ Sections:
     Type:  SHT_AARCH64_MEMTAG_GLOBALS_DYNAMIC
   - Name:  .memtag.globals.static
     Type:  SHT_AARCH64_MEMTAG_GLOBALS_STATIC
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS32
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_HEXAGON
+Sections:
+  - Name: hexagon_ordered
+    Type: SHT_HEX_ORDERED
+  - Name: .hexagon.attributes
+    Type: SHT_HEXAGON_ATTRIBUTES
diff --git a/llvm/test/tools/llvm-readobj/ELF/string-dump-multi.s b/llvm/test/tools/llvm-readobj/ELF/string-dump-multi.s
index 29d7ef011005e..36a115bd259a9 100644
--- a/llvm/test/tools/llvm-readobj/ELF/string-dump-multi.s
+++ b/llvm/test/tools/llvm-readobj/ELF/string-dump-multi.s
@@ -1,10 +1,12 @@
 # REQUIRES: x86-registered-target
 
 # RUN: llvm-mc -filetype=obj -triple x86_64 %s -o %t.o
-# RUN: llvm-readobj -p .a -p .b %t.o | FileCheck %s
+# RUN: llvm-readobj -p .a -p .b %t.o | FileCheck %s --check-prefixes=HEADER,CHECK
 # RUN: llvm-readelf -p .a -p .b %t.o | FileCheck %s
 
-# CHECK:      String dump of section '.a':
+# HEADER:     LoadName:
+# CHECK:      {{^$}}
+# CHECK-NEXT: String dump of section '.a':
 # CHECK-NEXT: [     0] 0
 # CHECK-EMPTY:
 # CHECK-NEXT: String dump of section '.b':
diff --git a/llvm/test/tools/llvm-readobj/ELF/string-dump.test b/llvm/test/tools/llvm-readobj/ELF/string-dump.test
index c06b274d25288..1d7a177b32de1 100644
--- a/llvm/test/tools/llvm-readobj/ELF/string-dump.test
+++ b/llvm/test/tools/llvm-readobj/ELF/string-dump.test
@@ -3,7 +3,7 @@
 
 # RUN: llvm-readobj --string-dump=.strings \
 # RUN:   --string-dump=.not_null_terminated %t > %t.readobj.out
-# RUN: FileCheck %s --input-file=%t.readobj.out
+# RUN: FileCheck %s --input-file=%t.readobj.out --check-prefixes=HEADER,CHECK
 
 # Also test the different ways --string-dump can be specified, i.e. as a short
 # flag (-p), with different prefix modes (-p .foo, -p=.foo, -p.foo), and with
@@ -23,7 +23,9 @@
 # RUN: llvm-readelf -hp1 -p2 %t | cmp %t.readelf.out -
 # RUN: llvm-readelf -hp 1 -p.not_null_terminated %t | cmp %t.readelf.out -
 
-# CHECK:      String dump of section '.strings':
+# HEADER:     LoadName:
+# CHECK:      {{^$}}
+# CHECK-NEXT: String dump of section '.strings':
 # CHECK-NEXT: [ 0] here
 # CHECK-NEXT: [ 5] are
 # CHECK-NEXT: [ 9] some
diff --git a/llvm/test/tools/llvm-reduce/remove-dp-values.ll b/llvm/test/tools/llvm-reduce/remove-dp-values.ll
index 40ff9f32e960d..d137b279f4ea0 100644
--- a/llvm/test/tools/llvm-reduce/remove-dp-values.ll
+++ b/llvm/test/tools/llvm-reduce/remove-dp-values.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-reduce --abort-on-invalid-reduction --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t --try-experimental-debuginfo-iterators
 ; RUN: FileCheck --check-prefixes=CHECK-FINAL --input-file=%t %s --implicit-check-not=dbg.value
 
-; Test that we can, in RemoveDIs mode / DPValues mode (where variable location
+; Test that we can, in RemoveDIs mode / DbgVariableRecords mode (where variable location
 ; information isn't an instruction), remove one variable location assignment
 ; but not another.
 
diff --git a/llvm/tools/dsymutil/CMakeLists.txt b/llvm/tools/dsymutil/CMakeLists.txt
index 53f882e90b4e9..efe28bda68ebf 100644
--- a/llvm/tools/dsymutil/CMakeLists.txt
+++ b/llvm/tools/dsymutil/CMakeLists.txt
@@ -32,7 +32,6 @@ add_llvm_tool(dsymutil
   MachOUtils.cpp
   Reproducer.cpp
   RelocationMap.cpp
-  SymbolMap.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
index 5ae5ecd556adb..677dfc44c54a4 100644
--- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
+++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
@@ -634,25 +634,19 @@ bool DwarfLinkerForBinary::linkImpl(
 
   DebugMap DebugMap(Map.getTriple(), Map.getBinaryPath());
 
-  std::function<StringRef(StringRef)> TranslationLambda = [&](StringRef Input) {
-    assert(Options.Translator);
-    return Options.Translator(Input);
-  };
-
   std::unique_ptr<Linker> GeneralLinker = Linker::createLinker(
       [&](const Twine &Error, StringRef Context, const DWARFDie *DIE) {
         reportError(Error, Context, DIE);
       },
       [&](const Twine &Warning, StringRef Context, const DWARFDie *DIE) {
         reportWarning(Warning, Context, DIE);
-      },
-      Options.Translator ? TranslationLambda : nullptr);
+      });
 
   std::unique_ptr<classic::DwarfStreamer> Streamer;
   if (!Options.NoOutput) {
     if (Expected<std::unique_ptr<classic::DwarfStreamer>> StreamerOrErr =
             classic::DwarfStreamer::createStreamer(
-                Map.getTriple(), ObjectType, OutFile, Options.Translator,
+                Map.getTriple(), ObjectType, OutFile,
                 [&](const Twine &Warning, StringRef Context,
                     const DWARFDie *DIE) {
                   reportWarning(Warning, Context, DIE);
@@ -866,8 +860,8 @@ bool DwarfLinkerForBinary::linkImpl(
   if (Map.getTriple().isOSDarwin() && !Map.getBinaryPath().empty() &&
       ObjectType == Linker::OutputFileType::Object)
     return MachOUtils::generateDsymCompanion(
-        Options.VFS, Map, Options.Translator,
-        *Streamer->getAsmPrinter().OutStreamer, OutFile, RelocationsToApply);
+        Options.VFS, Map, *Streamer->getAsmPrinter().OutStreamer, OutFile,
+        RelocationsToApply);
 
   Streamer->finish();
   return true;
diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h
index fd9d985097d6e..6aa0b847eebd6 100644
--- a/llvm/tools/dsymutil/LinkUtils.h
+++ b/llvm/tools/dsymutil/LinkUtils.h
@@ -9,8 +9,6 @@
 #ifndef LLVM_TOOLS_DSYMUTIL_LINKOPTIONS_H
 #define LLVM_TOOLS_DSYMUTIL_LINKOPTIONS_H
 
-#include "SymbolMap.h"
-
 #include "llvm/ADT/Twine.h"
 #include "llvm/Remarks/RemarkFormat.h"
 #include "llvm/Support/VirtualFileSystem.h"
@@ -87,9 +85,6 @@ struct LinkOptions {
   /// The Resources directory in the .dSYM bundle.
   std::optional<std::string> ResourceDir;
 
-  /// Symbol map translator.
-  SymbolMapTranslator Translator;
-
   /// Virtual File System.
   llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS =
       vfs::getRealFileSystem();
diff --git a/llvm/tools/dsymutil/MachOUtils.cpp b/llvm/tools/dsymutil/MachOUtils.cpp
index 3efc1aff82374..8e144d640ed01 100644
--- a/llvm/tools/dsymutil/MachOUtils.cpp
+++ b/llvm/tools/dsymutil/MachOUtils.cpp
@@ -373,7 +373,7 @@ static unsigned segmentLoadCommandSize(bool Is64Bit, unsigned NumSections) {
 // \a OutFile and it must be using a MachObjectWriter object to do so.
 bool generateDsymCompanion(
     llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, const DebugMap &DM,
-    SymbolMapTranslator &Translator, MCStreamer &MS, raw_fd_ostream &OutFile,
+    MCStreamer &MS, raw_fd_ostream &OutFile,
     const std::vector<MachOUtils::DwarfRelocationApplicationInfo>
         &RelocationsToApply) {
   auto &ObjectStreamer = static_cast<MCObjectStreamer &>(MS);
@@ -509,12 +509,9 @@ bool generateDsymCompanion(
   }
 
   SmallString<0> NewSymtab;
-  std::function<StringRef(StringRef)> TranslationLambda =
-      Translator ? [&](StringRef Input) { return Translator(Input); }
-                 : static_cast<std::function<StringRef(StringRef)>>(nullptr);
   // Legacy dsymutil puts an empty string at the start of the line table.
   // thus we set NonRelocatableStringpool(,PutEmptyString=true)
-  NonRelocatableStringpool NewStrings(TranslationLambda, true);
+  NonRelocatableStringpool NewStrings(true);
   unsigned NListSize = Is64Bit ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist);
   unsigned NumSyms = 0;
   uint64_t NewStringsSize = 0;
diff --git a/llvm/tools/dsymutil/MachOUtils.h b/llvm/tools/dsymutil/MachOUtils.h
index 059d9fdd788a6..0229647b00f85 100644
--- a/llvm/tools/dsymutil/MachOUtils.h
+++ b/llvm/tools/dsymutil/MachOUtils.h
@@ -8,8 +8,6 @@
 #ifndef LLVM_TOOLS_DSYMUTIL_MACHOUTILS_H
 #define LLVM_TOOLS_DSYMUTIL_MACHOUTILS_H
 
-#include "SymbolMap.h"
-
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/VirtualFileSystem.h"
@@ -59,7 +57,7 @@ bool generateUniversalBinary(SmallVectorImpl<ArchAndFile> &ArchFiles,
                              StringRef SDKPath, bool Fat64 = false);
 bool generateDsymCompanion(
     llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, const DebugMap &DM,
-    SymbolMapTranslator &Translator, MCStreamer &MS, raw_fd_ostream &OutFile,
+    MCStreamer &MS, raw_fd_ostream &OutFile,
     const std::vector<MachOUtils::DwarfRelocationApplicationInfo>
         &RelocationsToApply);
 
diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td
index a4e4c6c4cdb9c..d8cec0cb2c410 100644
--- a/llvm/tools/dsymutil/Options.td
+++ b/llvm/tools/dsymutil/Options.td
@@ -128,12 +128,6 @@ def object_prefix_map: Separate<["--", "-"], "object-prefix-map">,
     Group<grp_general>;
 def: Joined<["--", "-"], "object-prefix-map=">, Alias<object_prefix_map>;
 
-def symbolmap: Separate<["--", "-"], "symbol-map">,
-  MetaVarName<"<bcsymbolmap>">,
-  HelpText<"Updates the existing dSYMs inplace using symbol map specified.">,
-  Group<grp_general>;
-def: Joined<["--", "-"], "symbol-map=">, Alias<symbolmap>;
-
 def arch: Separate<["--", "-"], "arch">,
   MetaVarName<"<arch>">,
   HelpText<"Link DWARF debug information only for specified CPU architecture"
diff --git a/llvm/tools/dsymutil/SymbolMap.cpp b/llvm/tools/dsymutil/SymbolMap.cpp
deleted file mode 100644
index c55362e5f7756..0000000000000
--- a/llvm/tools/dsymutil/SymbolMap.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-//===- tools/dsymutil/SymbolMap.cpp ---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SymbolMap.h"
-#include "DebugMap.h"
-#include "MachOUtils.h"
-
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/WithColor.h"
-
-#ifdef __APPLE__
-#include <CoreFoundation/CoreFoundation.h>
-#include <uuid/uuid.h>
-#endif
-
-namespace llvm {
-namespace dsymutil {
-
-StringRef SymbolMapTranslator::operator()(StringRef Input) {
-  if (!Input.starts_with("__hidden#") && !Input.starts_with("___hidden#"))
-    return Input;
-
-  StringRef Line = Input.drop_front(sizeof("__hidden#") - 1);
-  bool MightNeedUnderscore = Line.consume_front("#");
-
-  std::size_t LineNumber = std::numeric_limits<std::size_t>::max();
-  Line.split('_').first.getAsInteger(10, LineNumber);
-  if (LineNumber >= UnobfuscatedStrings.size()) {
-    WithColor::warning() << "reference to a unexisting unobfuscated string "
-                         << Input << ": symbol map mismatch?\n"
-                         << Line << '\n';
-    return Input;
-  }
-
-  const std::string &Translation = UnobfuscatedStrings[LineNumber];
-  if (!MightNeedUnderscore || !MangleNames)
-    return Translation;
-
-  // Objective-C symbols for the MachO symbol table start with a \1. Please see
-  // `MangleContext::mangleObjCMethodName` in clang.
-  if (Translation[0] == 1)
-    return StringRef(Translation).drop_front();
-
-  // We need permanent storage for the string we are about to create. Just
-  // append it to the vector containing translations. This should only happen
-  // during MachO symbol table translation, thus there should be no risk on
-  // exponential growth.
-  UnobfuscatedStrings.emplace_back("_" + Translation);
-  return UnobfuscatedStrings.back();
-}
-
-SymbolMapTranslator SymbolMapLoader::Load(StringRef InputFile,
-                                          const DebugMap &Map) const {
-  if (SymbolMap.empty())
-    return {};
-
-  std::string SymbolMapPath = SymbolMap;
-
-#if __APPLE__
-  // Look through the UUID Map.
-  if (sys::fs::is_directory(SymbolMapPath) && !Map.getUUID().empty()) {
-    uuid_string_t UUIDString;
-    uuid_unparse_upper((const uint8_t *)Map.getUUID().data(), UUIDString);
-
-    SmallString<256> PlistPath(
-        sys::path::parent_path(sys::path::parent_path(InputFile)));
-    sys::path::append(PlistPath, StringRef(UUIDString).str() + ".plist");
-
-    CFStringRef plistFile = CFStringCreateWithCString(
-        kCFAllocatorDefault, PlistPath.c_str(), kCFStringEncodingUTF8);
-    CFURLRef fileURL = CFURLCreateWithFileSystemPath(
-        kCFAllocatorDefault, plistFile, kCFURLPOSIXPathStyle, false);
-    CFReadStreamRef resourceData =
-        CFReadStreamCreateWithFile(kCFAllocatorDefault, fileURL);
-    if (resourceData) {
-      CFReadStreamOpen(resourceData);
-      CFDictionaryRef plist = (CFDictionaryRef)CFPropertyListCreateWithStream(
-          kCFAllocatorDefault, resourceData, 0, kCFPropertyListImmutable,
-          nullptr, nullptr);
-
-      if (plist) {
-        if (CFDictionaryContainsKey(plist, CFSTR("DBGOriginalUUID"))) {
-          CFStringRef OldUUID = (CFStringRef)CFDictionaryGetValue(
-              plist, CFSTR("DBGOriginalUUID"));
-
-          StringRef UUID(CFStringGetCStringPtr(OldUUID, kCFStringEncodingUTF8));
-          SmallString<256> BCSymbolMapPath(SymbolMapPath);
-          sys::path::append(BCSymbolMapPath, UUID.str() + ".bcsymbolmap");
-          SymbolMapPath = std::string(BCSymbolMapPath);
-        }
-        CFRelease(plist);
-      }
-      CFReadStreamClose(resourceData);
-      CFRelease(resourceData);
-    }
-    CFRelease(fileURL);
-    CFRelease(plistFile);
-  }
-#endif
-
-  if (sys::fs::is_directory(SymbolMapPath)) {
-    SymbolMapPath += (Twine("/") + sys::path::filename(InputFile) + "-" +
-                      MachOUtils::getArchName(Map.getTriple().getArchName()) +
-                      ".bcsymbolmap")
-                         .str();
-  }
-
-  auto ErrOrMemBuffer = MemoryBuffer::getFile(SymbolMapPath);
-  if (auto EC = ErrOrMemBuffer.getError()) {
-    WithColor::warning() << SymbolMapPath << ": " << EC.message()
-                         << ": not unobfuscating.\n";
-    return {};
-  }
-
-  std::vector<std::string> UnobfuscatedStrings;
-  auto &MemBuf = **ErrOrMemBuffer;
-  StringRef Data(MemBuf.getBufferStart(),
-                 MemBuf.getBufferEnd() - MemBuf.getBufferStart());
-  StringRef LHS;
-  std::tie(LHS, Data) = Data.split('\n');
-  bool MangleNames = false;
-
-  // Check version string first.
-  if (!LHS.starts_with("BCSymbolMap Version:")) {
-    // Version string not present, warns but try to parse it.
-    WithColor::warning() << SymbolMapPath
-                         << " is missing version string: assuming 1.0.\n";
-    UnobfuscatedStrings.emplace_back(LHS);
-  } else if (LHS.equals("BCSymbolMap Version: 1.0")) {
-    MangleNames = true;
-  } else if (LHS.equals("BCSymbolMap Version: 2.0")) {
-    MangleNames = false;
-  } else {
-    StringRef VersionNum;
-    std::tie(LHS, VersionNum) = LHS.split(':');
-    WithColor::warning() << SymbolMapPath
-                         << " has unsupported symbol map version" << VersionNum
-                         << ": not unobfuscating.\n";
-    return {};
-  }
-
-  while (!Data.empty()) {
-    std::tie(LHS, Data) = Data.split('\n');
-    UnobfuscatedStrings.emplace_back(LHS);
-  }
-
-  return SymbolMapTranslator(std::move(UnobfuscatedStrings), MangleNames);
-}
-
-} // namespace dsymutil
-} // namespace llvm
diff --git a/llvm/tools/dsymutil/SymbolMap.h b/llvm/tools/dsymutil/SymbolMap.h
deleted file mode 100644
index 977de31a5a17a..0000000000000
--- a/llvm/tools/dsymutil/SymbolMap.h
+++ /dev/null
@@ -1,53 +0,0 @@
-//=- tools/dsymutil/SymbolMap.h -----------------------------------*- C++ -*-=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_DSYMUTIL_SYMBOLMAP_H
-#define LLVM_TOOLS_DSYMUTIL_SYMBOLMAP_H
-
-#include "llvm/ADT/StringRef.h"
-
-#include <string>
-#include <vector>
-
-namespace llvm {
-namespace dsymutil {
-class DebugMap;
-
-/// Callable class to unobfuscate strings based on a BCSymbolMap.
-class SymbolMapTranslator {
-public:
-  SymbolMapTranslator() : MangleNames(false) {}
-
-  SymbolMapTranslator(std::vector<std::string> UnobfuscatedStrings,
-                      bool MangleNames)
-      : UnobfuscatedStrings(std::move(UnobfuscatedStrings)),
-        MangleNames(MangleNames) {}
-
-  StringRef operator()(StringRef Input);
-
-  operator bool() const { return !UnobfuscatedStrings.empty(); }
-
-private:
-  std::vector<std::string> UnobfuscatedStrings;
-  bool MangleNames;
-};
-
-/// Class to initialize SymbolMapTranslators from a BCSymbolMap.
-class SymbolMapLoader {
-public:
-  SymbolMapLoader(std::string SymbolMap) : SymbolMap(std::move(SymbolMap)) {}
-
-  SymbolMapTranslator Load(StringRef InputFile, const DebugMap &Map) const;
-
-private:
-  const std::string SymbolMap;
-};
-} // namespace dsymutil
-} // namespace llvm
-
-#endif // LLVM_TOOLS_DSYMUTIL_SYMBOLMAP_H
diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp
index 25e281c415e75..bc968b6387b65 100644
--- a/llvm/tools/dsymutil/dsymutil.cpp
+++ b/llvm/tools/dsymutil/dsymutil.cpp
@@ -108,7 +108,6 @@ struct DsymutilOptions {
   bool Flat = false;
   bool InputIsYAMLDebugMap = false;
   bool ForceKeepFunctionForStatic = false;
-  std::string SymbolMap;
   std::string OutputFile;
   std::string Toolchain;
   std::string ReproducerPath;
@@ -341,12 +340,6 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) {
     return DWARFLinkerType.takeError();
   }
 
-  if (opt::Arg *SymbolMap = Args.getLastArg(OPT_symbolmap))
-    Options.SymbolMap = SymbolMap->getValue();
-
-  if (Args.hasArg(OPT_symbolmap))
-    Options.LinkOpts.Update = true;
-
   if (Expected<std::vector<std::string>> InputFiles =
           getInputs(Args, Options.LinkOpts.Update)) {
     Options.InputFiles = std::move(*InputFiles);
@@ -560,8 +553,7 @@ getOutputFileName(StringRef InputFile, const DsymutilOptions &Options) {
     return OutputLocation(Options.OutputFile);
 
   // When updating, do in place replacement.
-  if (Options.OutputFile.empty() &&
-      (Options.LinkOpts.Update || !Options.SymbolMap.empty()))
+  if (Options.OutputFile.empty() && Options.LinkOpts.Update)
     return OutputLocation(std::string(InputFile));
 
   // When dumping the debug map, just return an empty output location. This
@@ -668,8 +660,6 @@ int dsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
       return EXIT_FAILURE;
     }
 
-  SymbolMapLoader SymMapLoader(Options.SymbolMap);
-
   for (auto &InputFile : Options.InputFiles) {
     // Dump the symbol table for each input file and requested arch
     if (Options.DumpStab) {
@@ -760,9 +750,6 @@ int dsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
         if (Options.DumpDebugMap)
           continue;
 
-        if (!Options.SymbolMap.empty())
-          Options.LinkOpts.Translator = SymMapLoader.Load(InputFile, *Map);
-
         if (Map->begin() == Map->end()) {
           std::lock_guard<std::mutex> Guard(ErrorHandlerMutex);
           WithColor::warning()
diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp
index 1c869e1739319..fd852563838f3 100644
--- a/llvm/tools/llvm-as/llvm-as.cpp
+++ b/llvm/tools/llvm-as/llvm-as.cpp
@@ -67,6 +67,7 @@ static cl::opt<std::string> ClDataLayout("data-layout",
                                          cl::desc("data layout string to use"),
                                          cl::value_desc("layout-string"),
                                          cl::init(""), cl::cat(AsCat));
+extern bool WriteNewDbgInfoFormatToBitcode;
 
 static void WriteOutputFile(const Module *M, const ModuleSummaryIndex *Index) {
   // Infer the output filename if needed.
@@ -139,6 +140,12 @@ int main(int argc, char **argv) {
     Err.print(argv[0], errs());
     return 1;
   }
+
+  // Convert to new debug format if requested.
+  assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug mode");
+  if (UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode)
+    M->convertToNewDbgValues();
+
   std::unique_ptr<ModuleSummaryIndex> Index = std::move(ModuleAndIndex.Index);
 
   if (!DisableVerify) {
diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c
index a3e41be12e95d..78ccaf12a380b 100644
--- a/llvm/tools/llvm-c-test/debuginfo.c
+++ b/llvm/tools/llvm-c-test/debuginfo.c
@@ -29,9 +29,10 @@ declare_objc_class(LLVMDIBuilderRef DIB, LLVMMetadataRef File) {
   return Decl;
 }
 
-int llvm_test_dibuilder(void) {
+int llvm_test_dibuilder(bool NewDebugInfoFormat) {
   const char *Filename = "debuginfo.c";
   LLVMModuleRef M = LLVMModuleCreateWithName(Filename);
+  LLVMSetIsNewDbgInfoFormat(M, NewDebugInfoFormat);
   LLVMDIBuilderRef DIB = LLVMCreateDIBuilder(M);
 
   LLVMMetadataRef File = LLVMDIBuilderCreateFile(DIB, Filename,
@@ -135,21 +136,38 @@ int llvm_test_dibuilder(void) {
   LLVMMetadataRef FooParamVar1 =
     LLVMDIBuilderCreateParameterVariable(DIB, FunctionMetadata, "a", 1, 1, File,
                                          42, Int64Ty, true, 0);
-  LLVMDIBuilderInsertDeclareAtEnd(DIB, LLVMConstInt(LLVMInt64Type(), 0, false),
-                                  FooParamVar1, FooParamExpression,
-                                  FooParamLocation, FooEntryBlock);
+  if (LLVMIsNewDbgInfoFormat(M))
+    LLVMDIBuilderInsertDeclareRecordAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar1,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
+  else
+    LLVMDIBuilderInsertDeclareAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar1,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
   LLVMMetadataRef FooParamVar2 =
     LLVMDIBuilderCreateParameterVariable(DIB, FunctionMetadata, "b", 1, 2, File,
                                          42, Int64Ty, true, 0);
-  LLVMDIBuilderInsertDeclareAtEnd(DIB, LLVMConstInt(LLVMInt64Type(), 0, false),
-                                  FooParamVar2, FooParamExpression,
-                                  FooParamLocation, FooEntryBlock);
+
+  if (LLVMIsNewDbgInfoFormat(M))
+    LLVMDIBuilderInsertDeclareRecordAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar2,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
+  else
+    LLVMDIBuilderInsertDeclareAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar2,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
+
   LLVMMetadataRef FooParamVar3 =
     LLVMDIBuilderCreateParameterVariable(DIB, FunctionMetadata, "c", 1, 3, File,
                                          42, VectorTy, true, 0);
-  LLVMDIBuilderInsertDeclareAtEnd(DIB, LLVMConstInt(LLVMInt64Type(), 0, false),
-                                  FooParamVar3, FooParamExpression,
-                                  FooParamLocation, FooEntryBlock);
+  if (LLVMIsNewDbgInfoFormat(M))
+    LLVMDIBuilderInsertDeclareRecordAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar3,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
+  else
+    LLVMDIBuilderInsertDeclareAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar3,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
 
   LLVMSetSubprogram(FooFunction, FunctionMetadata);
 
@@ -166,9 +184,12 @@ int llvm_test_dibuilder(void) {
   LLVMValueRef FooVal1 = LLVMConstInt(LLVMInt64Type(), 0, false);
   LLVMMetadataRef FooVarValueExpr =
     LLVMDIBuilderCreateConstantValueExpression(DIB, 0);
-
-  LLVMDIBuilderInsertDbgValueAtEnd(DIB, FooVal1, FooVar1, FooVarValueExpr,
-                                   FooVarsLocation, FooVarBlock);
+  if (LLVMIsNewDbgInfoFormat(M))
+    LLVMDIBuilderInsertDbgValueRecordAtEnd(
+        DIB, FooVal1, FooVar1, FooVarValueExpr, FooVarsLocation, FooVarBlock);
+  else
+    LLVMDIBuilderInsertDbgValueIntrinsicAtEnd(
+        DIB, FooVal1, FooVar1, FooVarValueExpr, FooVarsLocation, FooVarBlock);
 
   LLVMMetadataRef MacroFile =
       LLVMDIBuilderCreateTempMacroFile(DIB, NULL, 0, File);
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index bc708e2d472ed..347863638849c 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -1397,6 +1397,14 @@ static void clone_symbols(LLVMModuleRef Src, LLVMModuleRef M) {
     }
     LLVMDisposeValueMetadataEntries(AllMetadata);
 
+    // Copy any prefix data that may be on the function
+    if (LLVMHasPrefixData(Cur))
+      LLVMSetPrefixData(Fun, clone_constant(LLVMGetPrefixData(Cur), M));
+
+    // Copy any prologue data that may be on the function
+    if (LLVMHasPrologueData(Cur))
+      LLVMSetPrologueData(Fun, clone_constant(LLVMGetPrologueData(Cur), M));
+
     FunCloner FC(Cur, Fun);
     FC.CloneBBs(Cur);
 
diff --git a/llvm/tools/llvm-c-test/llvm-c-test.h b/llvm/tools/llvm-c-test/llvm-c-test.h
index 00566660257e0..c50d3cce86748 100644
--- a/llvm/tools/llvm-c-test/llvm-c-test.h
+++ b/llvm/tools/llvm-c-test/llvm-c-test.h
@@ -36,7 +36,7 @@ int llvm_calc(void);
 int llvm_disassemble(void);
 
 // debuginfo.c
-int llvm_test_dibuilder(void);
+int llvm_test_dibuilder(bool NewDebugInfoFormat);
 int llvm_get_di_tag(void);
 int llvm_di_type_get_name(void);
 
diff --git a/llvm/tools/llvm-c-test/main.c b/llvm/tools/llvm-c-test/main.c
index badbe4b13b6ba..c4748d342fba1 100644
--- a/llvm/tools/llvm-c-test/main.c
+++ b/llvm/tools/llvm-c-test/main.c
@@ -110,7 +110,7 @@ int main(int argc, char **argv) {
   } else if (argc == 2 && !strcmp(argv[1], "--test-diagnostic-handler")) {
     return llvm_test_diagnostic_handler();
   } else if (argc == 2 && !strcmp(argv[1], "--test-dibuilder")) {
-    return llvm_test_dibuilder();
+    return llvm_test_dibuilder(false) && llvm_test_dibuilder(true);
   } else {
     print_usage();
   }
diff --git a/llvm/tools/llvm-debuginfo-analyzer/README.txt b/llvm/tools/llvm-debuginfo-analyzer/README.txt
index e6c20db7cd712..ce7569d272245 100644
--- a/llvm/tools/llvm-debuginfo-analyzer/README.txt
+++ b/llvm/tools/llvm-debuginfo-analyzer/README.txt
@@ -31,11 +31,11 @@ use LIT tests to validate the logical readers.
 
 Convert the unitests:
   llvm-project/llvm/unittests/DebugInfo/LogicalView/CodeViewReaderTest.cpp
-  llvm-project/llvm/unittests/DebugInfo/LogicalView/ELFReaderTest.cpp
+  llvm-project/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp
 
 into LIT tests:
   llvm-project/llvm/test/DebugInfo/LogicalView/CodeViewReader.test
-  llvm-project/llvm/test/DebugInfo/LogicalView/ELFReader.test
+  llvm-project/llvm/test/DebugInfo/LogicalView/DWARFReader.test
 
 //===----------------------------------------------------------------------===//
 // Eliminate calls to 'getInputFileDirectory()' in the unit tests.
@@ -210,9 +210,6 @@ The following DWARF debug location operands are not supported:
 //===----------------------------------------------------------------------===//
 // Add support for additional binary formats.
 //===----------------------------------------------------------------------===//
-- WebAssembly (Wasm).
-  https://github.com/llvm/llvm-project/issues/57040#issuecomment-1211336680
-
 - Extended COFF (XCOFF)
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index 06fc669390bf1..8e443318dd7d2 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -80,6 +80,8 @@ static cl::opt<bool> PrintThinLTOIndexOnly(
     cl::desc("Only read thinlto index and print the index as LLVM assembly."),
     cl::init(false), cl::Hidden, cl::cat(DisCategory));
 
+extern cl::opt<bool> WriteNewDbgInfoFormat;
+
 namespace {
 
 static void printDebugLoc(const DebugLoc &DL, formatted_raw_ostream &OS) {
@@ -249,8 +251,14 @@ int main(int argc, char **argv) {
 
       // All that llvm-dis does is write the assembly to a file.
       if (!DontPrint) {
-        if (M)
+        if (M) {
+          bool ChangeDbgFormat = M->IsNewDbgInfoFormat != WriteNewDbgInfoFormat;
+          if (ChangeDbgFormat)
+            M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat);
           M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
+          if (ChangeDbgFormat)
+            M->setIsNewDbgInfoFormat(!WriteNewDbgInfoFormat);
+        }
         if (Index)
           Index->print(Out->os());
       }
diff --git a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp
index bd17f3c4a6595..285dcf75ecd9d 100644
--- a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp
+++ b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp
@@ -338,9 +338,9 @@ Error linkDebugInfoImpl(object::ObjectFile &File, const Options &Options,
   Triple TargetTriple = File.makeTriple();
   std::unique_ptr<classic::DwarfStreamer> Streamer;
   if (Expected<std::unique_ptr<classic::DwarfStreamer>> StreamerOrErr =
-          classic::DwarfStreamer::createStreamer(
-              TargetTriple, Linker::OutputFileType::Object, OutStream, nullptr,
-              ReportWarn))
+          classic::DwarfStreamer::createStreamer(TargetTriple,
+                                                 Linker::OutputFileType::Object,
+                                                 OutStream, ReportWarn))
     Streamer = std::move(*StreamerOrErr);
   else
     return StreamerOrErr.takeError();
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
index 1ae2565e894c6..06e1c7f3c1bbe 100644
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -423,10 +423,9 @@ static void runBenchmarkConfigurations(
         if (Err) {
           // Errors from executing the snippets are fine.
           // All other errors are a framework issue and should fail.
-          if (!Err.isA<SnippetExecutionFailure>()) {
-            llvm::errs() << "llvm-exegesis error: " << toString(std::move(Err));
-            exit(1);
-          }
+          if (!Err.isA<SnippetExecutionFailure>())
+            ExitOnErr(std::move(Err));
+
           BenchmarkResult.Error = toString(std::move(Err));
         }
         AllResults.push_back(std::move(BenchmarkResult));
diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp
index 0e6935c0ac589..5e0d69a68d69b 100644
--- a/llvm/tools/llvm-objdump/MachODump.cpp
+++ b/llvm/tools/llvm-objdump/MachODump.cpp
@@ -3661,6 +3661,10 @@ struct class_ro32_t {
 #define RO_ROOT (1 << 1)
 #define RO_HAS_CXX_STRUCTORS (1 << 2)
 
+/* Values for method_list{64,32}_t->entsize */
+#define ML_HAS_RELATIVE_PTRS (1 << 31)
+#define ML_ENTSIZE_MASK 0xFFFF
+
 struct method_list64_t {
   uint32_t entsize;
   uint32_t count;
@@ -3685,6 +3689,12 @@ struct method32_t {
   uint32_t imp;   /* IMP (32-bit pointer) */
 };
 
+struct method_relative_t {
+  int32_t name;  /* SEL (32-bit relative) */
+  int32_t types; /* const char * (32-bit relative) */
+  int32_t imp;   /* IMP (32-bit relative) */
+};
+
 struct protocol_list64_t {
   uint64_t count; /* uintptr_t (a 64-bit value) */
   /* struct protocol64_t * list[0];  These pointers follow inline */
@@ -3986,6 +3996,12 @@ inline void swapStruct(struct method32_t &m) {
   sys::swapByteOrder(m.imp);
 }
 
+inline void swapStruct(struct method_relative_t &m) {
+  sys::swapByteOrder(m.name);
+  sys::swapByteOrder(m.types);
+  sys::swapByteOrder(m.imp);
+}
+
 inline void swapStruct(struct protocol_list64_t &pl) {
   sys::swapByteOrder(pl.count);
 }
@@ -4440,6 +4456,88 @@ static void print_layout_map32(uint32_t p, struct DisassembleInfo *info) {
   print_layout_map(layout_map, left);
 }
 
+static void print_relative_method_list(uint32_t structSizeAndFlags,
+                                       uint32_t structCount, uint64_t p,
+                                       struct DisassembleInfo *info,
+                                       const char *indent,
+                                       uint32_t pointerBits) {
+  struct method_relative_t m;
+  const char *r, *name;
+  uint32_t offset, xoffset, left, i;
+  SectionRef S, xS;
+
+  assert(((structSizeAndFlags & ML_HAS_RELATIVE_PTRS) != 0) &&
+         "expected structSizeAndFlags to have ML_HAS_RELATIVE_PTRS flag");
+
+  outs() << indent << "\t\t   entsize "
+         << (structSizeAndFlags & ML_ENTSIZE_MASK) << " (relative) \n";
+  outs() << indent << "\t\t     count " << structCount << "\n";
+
+  for (i = 0; i < structCount; i++) {
+    r = get_pointer_64(p, offset, left, S, info);
+    memset(&m, '\0', sizeof(struct method_relative_t));
+    if (left < sizeof(struct method_relative_t)) {
+      memcpy(&m, r, left);
+      outs() << indent << "   (method_t extends past the end of the section)\n";
+    } else
+      memcpy(&m, r, sizeof(struct method_relative_t));
+    if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
+      swapStruct(m);
+
+    outs() << indent << "\t\t      name " << format("0x%" PRIx32, m.name);
+    uint64_t relNameRefVA = p + offsetof(struct method_relative_t, name);
+    uint64_t absNameRefVA = relNameRefVA + m.name;
+    outs() << " (" << format("0x%" PRIx32, absNameRefVA) << ")";
+
+    // since this is a relative list, absNameRefVA is the address of the
+    // __objc_selrefs entry, so a pointer, not the actual name
+    const char *nameRefPtr =
+        get_pointer_64(absNameRefVA, xoffset, left, xS, info);
+    if (nameRefPtr) {
+      uint32_t pointerSize = pointerBits / CHAR_BIT;
+      if (left < pointerSize)
+        outs() << indent << " (nameRefPtr extends past the end of the section)";
+      else {
+        if (pointerSize == 64) {
+          uint64_t nameOff_64 = *reinterpret_cast<const uint64_t *>(nameRefPtr);
+          if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
+            sys::swapByteOrder(nameOff_64);
+          name = get_pointer_64(nameOff_64, xoffset, left, xS, info);
+        } else {
+          uint32_t nameOff_32 = *reinterpret_cast<const uint32_t *>(nameRefPtr);
+          if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
+            sys::swapByteOrder(nameOff_32);
+          name = get_pointer_32(nameOff_32, xoffset, left, xS, info);
+        }
+        if (name != nullptr)
+          outs() << format(" %.*s", left, name);
+      }
+    }
+    outs() << "\n";
+
+    outs() << indent << "\t\t     types " << format("0x%" PRIx32, m.types);
+    uint64_t relTypesVA = p + offsetof(struct method_relative_t, types);
+    uint64_t absTypesVA = relTypesVA + m.types;
+    outs() << " (" << format("0x%" PRIx32, absTypesVA) << ")";
+    name = get_pointer_32(absTypesVA, xoffset, left, xS, info);
+    if (name != nullptr)
+      outs() << format(" %.*s", left, name);
+    outs() << "\n";
+
+    outs() << indent << "\t\t       imp " << format("0x%" PRIx32, m.imp);
+    uint64_t relImpVA = p + offsetof(struct method_relative_t, imp);
+    uint64_t absImpVA = relImpVA + m.imp;
+    outs() << " (" << format("0x%" PRIx32, absImpVA) << ")";
+    name = GuessSymbolName(absImpVA, info->AddrMap);
+    if (name != nullptr)
+      outs() << " " << name;
+    outs() << "\n";
+
+    p += sizeof(struct method_relative_t);
+    offset += sizeof(struct method_relative_t);
+  }
+}
+
 static void print_method_list64_t(uint64_t p, struct DisassembleInfo *info,
                                   const char *indent) {
   struct method_list64_t ml;
@@ -4461,10 +4559,17 @@ static void print_method_list64_t(uint64_t p, struct DisassembleInfo *info,
     memcpy(&ml, r, sizeof(struct method_list64_t));
   if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
     swapStruct(ml);
+  p += sizeof(struct method_list64_t);
+
+  if ((ml.entsize & ML_HAS_RELATIVE_PTRS) != 0) {
+    print_relative_method_list(ml.entsize, ml.count, p, info, indent,
+                               /*pointerBits=*/64);
+    return;
+  }
+
   outs() << indent << "\t\t   entsize " << ml.entsize << "\n";
   outs() << indent << "\t\t     count " << ml.count << "\n";
 
-  p += sizeof(struct method_list64_t);
   offset += sizeof(struct method_list64_t);
   for (i = 0; i < ml.count; i++) {
     r = get_pointer_64(p, offset, left, S, info);
@@ -4552,10 +4657,17 @@ static void print_method_list32_t(uint64_t p, struct DisassembleInfo *info,
     memcpy(&ml, r, sizeof(struct method_list32_t));
   if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
     swapStruct(ml);
+  p += sizeof(struct method_list32_t);
+
+  if ((ml.entsize & ML_HAS_RELATIVE_PTRS) != 0) {
+    print_relative_method_list(ml.entsize, ml.count, p, info, indent,
+                               /*pointerBits=*/32);
+    return;
+  }
+
   outs() << indent << "\t\t   entsize " << ml.entsize << "\n";
   outs() << indent << "\t\t     count " << ml.count << "\n";
 
-  p += sizeof(struct method_list32_t);
   offset += sizeof(struct method_list32_t);
   for (i = 0; i < ml.count; i++) {
     r = get_pointer_32(p, offset, left, S, info);
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index c6fcf7e1196ec..878147642aa6e 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -408,9 +408,22 @@ PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary,
           PerfContent::UnknownContent};
 }
 
+static StringRef filename(StringRef Path, bool UseBackSlash) {
+  llvm::sys::path::Style PathStyle =
+      UseBackSlash ? llvm::sys::path::Style::windows_backslash
+                   : llvm::sys::path::Style::native;
+  StringRef FileName = llvm::sys::path::filename(Path, PathStyle);
+
+  // In case this file use \r\n as newline.
+  if (UseBackSlash && FileName.back() == '\r')
+    return FileName.drop_back();
+
+  return FileName;
+}
+
 void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) {
   // Drop the event which doesn't belong to user-provided binary
-  StringRef BinaryName = llvm::sys::path::filename(Event.BinaryPath);
+  StringRef BinaryName = filename(Event.BinaryPath, Binary->isCOFF());
   if (Binary->getName() != BinaryName)
     return;
 
@@ -975,7 +988,7 @@ bool PerfScriptReader::extractMMap2EventForBinary(ProfiledBinary *Binary,
            << format("0x%" PRIx64 ":", MMap.Address) << " \n";
   }
 
-  StringRef BinaryName = llvm::sys::path::filename(MMap.BinaryPath);
+  StringRef BinaryName = filename(MMap.BinaryPath, Binary->isCOFF());
   return Binary->getName() == BinaryName;
 }
 
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index f62228627b8f1..1baf35820f97f 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Object/COFF.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Format.h"
@@ -211,10 +212,11 @@ void ProfiledBinary::load() {
   OwningBinary<Binary> OBinary = unwrapOrError(createBinary(Path), Path);
   Binary &ExeBinary = *OBinary.getBinary();
 
-  auto *Obj = dyn_cast<ELFObjectFileBase>(&ExeBinary);
-  if (!Obj)
-    exitWithError("not a valid Elf image", Path);
+  IsCOFF = isa<COFFObjectFile>(&ExeBinary);
+  if (!isa<ELFObjectFileBase>(&ExeBinary) && !IsCOFF)
+    exitWithError("not a valid ELF/COFF image", Path);
 
+  auto *Obj = cast<ObjectFile>(&ExeBinary);
   TheTriple = Obj->makeTriple();
 
   LLVM_DEBUG(dbgs() << "Loading " << Path << "\n");
@@ -236,13 +238,14 @@ void ProfiledBinary::load() {
   DisassembleFunctionSet.insert(DisassembleFunctions.begin(),
                                 DisassembleFunctions.end());
 
-  checkPseudoProbe(Obj);
+  if (auto *ELFObj = dyn_cast<ELFObjectFileBase>(Obj)) {
+    checkPseudoProbe(ELFObj);
+    if (UsePseudoProbes)
+      populateElfSymbolAddressList(ELFObj);
 
-  if (UsePseudoProbes)
-    populateElfSymbolAddressList(Obj);
-
-  if (ShowDisassemblyOnly)
-    decodePseudoProbe(Obj);
+    if (ShowDisassemblyOnly)
+      decodePseudoProbe(ELFObj);
+  }
 
   // Disassemble the text sections.
   disassemble(Obj);
@@ -335,18 +338,35 @@ void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile<ELFT> &Obj,
     exitWithError("no executable segment found", FileName);
 }
 
-void ProfiledBinary::setPreferredTextSegmentAddresses(
-    const ELFObjectFileBase *Obj) {
+void ProfiledBinary::setPreferredTextSegmentAddresses(const COFFObjectFile *Obj,
+                                                      StringRef FileName) {
+  uint64_t ImageBase = Obj->getImageBase();
+  if (!ImageBase)
+    exitWithError("Not a COFF image", FileName);
+
+  PreferredTextSegmentAddresses.push_back(ImageBase);
+  FirstLoadableAddress = ImageBase;
+
+  for (SectionRef Section : Obj->sections()) {
+    const coff_section *Sec = Obj->getCOFFSection(Section);
+    if (Sec->Characteristics & COFF::IMAGE_SCN_CNT_CODE)
+      TextSegmentOffsets.push_back(Sec->VirtualAddress);
+  }
+}
+
+void ProfiledBinary::setPreferredTextSegmentAddresses(const ObjectFile *Obj) {
   if (const auto *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
     setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
   else if (const auto *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
     setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
   else if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
     setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
-  else if (const auto *ELFObj = cast<ELF64BEObjectFile>(Obj))
+  else if (const auto *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
     setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
+  else if (const auto *COFFObj = dyn_cast<COFFObjectFile>(Obj))
+    setPreferredTextSegmentAddresses(COFFObj, Obj->getFileName());
   else
-    llvm_unreachable("invalid ELF object format");
+    llvm_unreachable("invalid object format");
 }
 
 void ProfiledBinary::checkPseudoProbe(const ELFObjectFileBase *Obj) {
@@ -442,7 +462,7 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) {
 void ProfiledBinary::decodePseudoProbe() {
   OwningBinary<Binary> OBinary = unwrapOrError(createBinary(Path), Path);
   Binary &ExeBinary = *OBinary.getBinary();
-  auto *Obj = dyn_cast<ELFObjectFileBase>(&ExeBinary);
+  auto *Obj = cast<ELFObjectFileBase>(&ExeBinary);
   decodePseudoProbe(Obj);
 }
 
@@ -593,7 +613,7 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
   return true;
 }
 
-void ProfiledBinary::setUpDisassembler(const ELFObjectFileBase *Obj) {
+void ProfiledBinary::setUpDisassembler(const ObjectFile *Obj) {
   const Target *TheTarget = getTarget(Obj);
   std::string TripleName = TheTriple.getTriple();
   StringRef FileName = Obj->getFileName();
@@ -635,7 +655,7 @@ void ProfiledBinary::setUpDisassembler(const ELFObjectFileBase *Obj) {
   IPrinter->setPrintBranchImmAsAddress(true);
 }
 
-void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
+void ProfiledBinary::disassemble(const ObjectFile *Obj) {
   // Set up disassembler and related components.
   setUpDisassembler(Obj);
 
@@ -687,7 +707,7 @@ void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
              << "]:\n\n";
     }
 
-    if (SectionName == ".plt")
+    if (isa<ELFObjectFileBase>(Obj) && SectionName == ".plt")
       continue;
 
     // Get the section data.
@@ -722,8 +742,7 @@ void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
 }
 
 void ProfiledBinary::checkUseFSDiscriminator(
-    const ELFObjectFileBase *Obj,
-    std::map<SectionRef, SectionSymbolsTy> &AllSymbols) {
+    const ObjectFile *Obj, std::map<SectionRef, SectionSymbolsTy> &AllSymbols) {
   const char *FSDiscriminatorVar = "__llvm_fs_discriminator__";
   for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end();
        SI != SE; ++SI) {
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index 0fd12f5acd6b4..5d2088ad7691c 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -297,22 +297,26 @@ class ProfiledBinary {
   // Use to avoid redundant warning.
   bool MissingMMapWarned = false;
 
-  void setPreferredTextSegmentAddresses(const ELFObjectFileBase *O);
+  bool IsCOFF = false;
+
+  void setPreferredTextSegmentAddresses(const ObjectFile *O);
 
   template <class ELFT>
   void setPreferredTextSegmentAddresses(const ELFFile<ELFT> &Obj,
                                         StringRef FileName);
+  void setPreferredTextSegmentAddresses(const COFFObjectFile *Obj,
+                                        StringRef FileName);
 
   void checkPseudoProbe(const ELFObjectFileBase *Obj);
 
   void decodePseudoProbe(const ELFObjectFileBase *Obj);
 
   void
-  checkUseFSDiscriminator(const ELFObjectFileBase *Obj,
+  checkUseFSDiscriminator(const ObjectFile *Obj,
                           std::map<SectionRef, SectionSymbolsTy> &AllSymbols);
 
   // Set up disassembler and related components.
-  void setUpDisassembler(const ELFObjectFileBase *Obj);
+  void setUpDisassembler(const ObjectFile *Obj);
   symbolize::LLVMSymbolizer::Options getSymbolizerOpts() const;
 
   // Load debug info of subprograms from DWARF section.
@@ -333,7 +337,7 @@ class ProfiledBinary {
   void warnNoFuncEntry();
 
   /// Dissassemble the text section and build various address maps.
-  void disassemble(const ELFObjectFileBase *O);
+  void disassemble(const ObjectFile *O);
 
   /// Helper function to dissassemble the symbol and extract info for unwinding
   bool dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
@@ -362,6 +366,8 @@ class ProfiledBinary {
   uint64_t getBaseAddress() const { return BaseAddress; }
   void setBaseAddress(uint64_t Address) { BaseAddress = Address; }
 
+  bool isCOFF() const { return IsCOFF; }
+
   // Canonicalize to use preferred load address as base address.
   uint64_t canonicalizeVirtualAddress(uint64_t Address) {
     return Address - BaseAddress + getPreferredBaseAddress();
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index e78732353cc87..d1c05f437042d 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -49,6 +49,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/HexagonAttributeParser.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/MSP430AttributeParser.h"
 #include "llvm/Support/MSP430Attributes.h"
@@ -2824,6 +2825,11 @@ template <typename ELFT> void ELFDumper<ELFT>::printLoadName() {
 
 template <class ELFT> void ELFDumper<ELFT>::printArchSpecificInfo() {
   switch (Obj.getHeader().e_machine) {
+  case EM_HEXAGON:
+    printAttributes(ELF::SHT_HEXAGON_ATTRIBUTES,
+                    std::make_unique<HexagonAttributeParser>(&W),
+                    llvm::endianness::little);
+    break;
   case EM_ARM:
     if (Obj.isLE())
       printAttributes(ELF::SHT_ARM_ATTRIBUTES,
diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp
index 0d3fea71aafd4..0980d2ad3a852 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -160,15 +160,10 @@ void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj,
                                       ArrayRef<std::string> Sections,
                                       bool Decompress) {
   SmallString<0> Out;
-  bool First = true;
   for (object::SectionRef Section :
        getSectionRefsByNameOrIndex(Obj, Sections)) {
     StringRef SectionName = unwrapOrError(Obj.getFileName(), Section.getName());
-
-    if (!First)
-      W.startLine() << '\n';
-    First = false;
-    W.startLine() << "String dump of section '" << SectionName << "':\n";
+    W.startLine() << "\nString dump of section '" << SectionName << "':\n";
 
     StringRef SectionContent =
         unwrapOrError(Obj.getFileName(), Section.getContents());
@@ -182,15 +177,10 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj,
                                    ArrayRef<std::string> Sections,
                                    bool Decompress) {
   SmallString<0> Out;
-  bool First = true;
   for (object::SectionRef Section :
        getSectionRefsByNameOrIndex(Obj, Sections)) {
     StringRef SectionName = unwrapOrError(Obj.getFileName(), Section.getName());
-
-    if (!First)
-      W.startLine() << '\n';
-    First = false;
-    W.startLine() << "Hex dump of section '" << SectionName << "':\n";
+    W.startLine() << "\nHex dump of section '" << SectionName << "':\n";
 
     StringRef SectionContent =
         unwrapOrError(Obj.getFileName(), Section.getContents());
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp
index 2f3d4cac9fa01..25de659109c9f 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp
@@ -7,13 +7,13 @@
 //===----------------------------------------------------------------------===//
 //
 // This file implements a function which calls the Generic Delta pass in order
-// to reduce uninteresting DPValues from defined functions.
+// to reduce uninteresting DbgVariableRecords from defined functions.
 //
-// DPValues store variable-location debug-info and are attached to instructions.
-// This information used to be represented by intrinsics such as dbg.value, and
-// would naturally get reduced by llvm-reduce like any other instruction. As
-// DPValues get stored elsewhere, they need to be enumerated and eliminated like
-// any other data structure in LLVM.
+// DbgVariableRecords store variable-location debug-info and are attached to
+// instructions. This information used to be represented by intrinsics such as
+// dbg.value, and would naturally get reduced by llvm-reduce like any other
+// instruction. As DbgVariableRecords get stored elsewhere, they need to be
+// enumerated and eliminated like any other data structure in LLVM.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h
index 6a8f62155ec3e..07a1e04fceaee 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h
+++ b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This file implements a function which calls the Generic Delta pass in order
-// to reduce uninteresting DPValues from defined functions.
+// to reduce uninteresting DbgVariableRecords from defined functions.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/tools/llvm-shlib/CMakeLists.txt b/llvm/tools/llvm-shlib/CMakeLists.txt
index eba1672faee7f..b20ac318e768d 100644
--- a/llvm/tools/llvm-shlib/CMakeLists.txt
+++ b/llvm/tools/llvm-shlib/CMakeLists.txt
@@ -33,10 +33,13 @@ if(LLVM_BUILD_LLVM_DYLIB)
   if (LLVM_LINK_LLVM_DYLIB)
     set(INSTALL_WITH_TOOLCHAIN INSTALL_WITH_TOOLCHAIN)
   endif()
-  add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB OUTPUT_NAME LLVM ${INSTALL_WITH_TOOLCHAIN} ${SOURCES})
-  # Add symlink for backwards compatibility with old library name
-  get_target_property(LLVM_DYLIB_SOVERSION LLVM SOVERSION)
-  llvm_install_library_symlink(LLVM-${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} LLVM SHARED COMPONENT LLVM SOVERSION ${LLVM_DYLIB_SOVERSION})
+  if (WIN32)
+    add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB SONAME ${INSTALL_WITH_TOOLCHAIN} ${SOURCES})
+  else()
+    add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB OUTPUT_NAME LLVM ${INSTALL_WITH_TOOLCHAIN} ${SOURCES})
+    # Add symlink for backwards compatibility with old library name
+    llvm_install_library_symlink(LLVM-${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} $<TARGET_FILE_NAME:LLVM> SHARED FULL_DEST COMPONENT LLVM)
+  endif()
 
   list(REMOVE_DUPLICATES LIB_NAMES)
   if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
diff --git a/llvm/tools/verify-uselistorder/verify-uselistorder.cpp b/llvm/tools/verify-uselistorder/verify-uselistorder.cpp
index 9afe6817fefb9..d929ae09958a1 100644
--- a/llvm/tools/verify-uselistorder/verify-uselistorder.cpp
+++ b/llvm/tools/verify-uselistorder/verify-uselistorder.cpp
@@ -166,6 +166,11 @@ std::unique_ptr<Module> TempFile::readBitcode(LLVMContext &Context) const {
                           "verify-uselistorder: error: ");
     return nullptr;
   }
+
+  // verify-uselistoder currently only supports old-style debug info mode.
+  // FIXME: Update mapping code for RemoveDIs.
+  assert(!ModuleOr.get()->IsNewDbgInfoFormat &&
+         "Unexpectedly in new debug info mode");
   return std::move(ModuleOr.get());
 }
 
@@ -175,6 +180,9 @@ std::unique_ptr<Module> TempFile::readAssembly(LLVMContext &Context) const {
   std::unique_ptr<Module> M = parseAssemblyFile(Filename, Err, Context);
   if (!M.get())
     Err.print("verify-uselistorder", errs());
+  // verify-uselistoder currently only supports old-style debug info mode.
+  // FIXME: Update mapping code for RemoveDIs.
+  assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug info mode");
   return M;
 }
 
@@ -541,6 +549,9 @@ int main(int argc, char **argv) {
 
   // Load the input module...
   std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+  // verify-uselistoder currently only supports old-style debug info mode.
+  // FIXME: Update mapping code for RemoveDIs.
+  assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug info mode");
 
   if (!M.get()) {
     Err.print(argv[0], errs());
diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp
index 11237d2e1602d..83cbb02e0f58b 100644
--- a/llvm/unittests/ADT/APIntTest.cpp
+++ b/llvm/unittests/ADT/APIntTest.cpp
@@ -14,6 +14,8 @@
 #include "llvm/Support/Alignment.h"
 #include "gtest/gtest.h"
 #include <array>
+#include <climits>
+#include <limits>
 #include <optional>
 
 using namespace llvm;
@@ -2911,6 +2913,91 @@ TEST(APIntTest, RoundingSDiv) {
   }
 }
 
+TEST(APIntTest, Average) {
+  APInt A0(32, 0);
+  APInt A2(32, 2);
+  APInt A100(32, 100);
+  APInt A101(32, 101);
+  APInt A200(32, 200, false);
+  APInt ApUMax = APInt::getMaxValue(32);
+
+  EXPECT_EQ(APInt(32, 150), APIntOps::avgFloorU(A100, A200));
+  EXPECT_EQ(APIntOps::RoundingUDiv(A100 + A200, A2, APInt::Rounding::DOWN),
+            APIntOps::avgFloorU(A100, A200));
+  EXPECT_EQ(APIntOps::RoundingUDiv(A100 + A200, A2, APInt::Rounding::UP),
+            APIntOps::avgCeilU(A100, A200));
+  EXPECT_EQ(APIntOps::RoundingUDiv(A100 + A101, A2, APInt::Rounding::DOWN),
+            APIntOps::avgFloorU(A100, A101));
+  EXPECT_EQ(APIntOps::RoundingUDiv(A100 + A101, A2, APInt::Rounding::UP),
+            APIntOps::avgCeilU(A100, A101));
+  EXPECT_EQ(A0, APIntOps::avgFloorU(A0, A0));
+  EXPECT_EQ(A0, APIntOps::avgCeilU(A0, A0));
+  EXPECT_EQ(ApUMax, APIntOps::avgFloorU(ApUMax, ApUMax));
+  EXPECT_EQ(ApUMax, APIntOps::avgCeilU(ApUMax, ApUMax));
+  EXPECT_EQ(APIntOps::RoundingUDiv(ApUMax, A2, APInt::Rounding::DOWN),
+            APIntOps::avgFloorU(A0, ApUMax));
+  EXPECT_EQ(APIntOps::RoundingUDiv(ApUMax, A2, APInt::Rounding::UP),
+            APIntOps::avgCeilU(A0, ApUMax));
+
+  APInt Ap100(32, +100);
+  APInt Ap101(32, +101);
+  APInt Ap200(32, +200);
+  APInt Am1(32, -1);
+  APInt Am100(32, -100);
+  APInt Am101(32, -101);
+  APInt Am200(32, -200);
+  APInt AmSMin = APInt::getSignedMinValue(32);
+  APInt ApSMax = APInt::getSignedMaxValue(32);
+
+  EXPECT_EQ(APInt(32, +150), APIntOps::avgFloorS(Ap100, Ap200));
+  EXPECT_EQ(APIntOps::RoundingSDiv(Ap100 + Ap200, A2, APInt::Rounding::DOWN),
+            APIntOps::avgFloorS(Ap100, Ap200));
+  EXPECT_EQ(APIntOps::RoundingSDiv(Ap100 + Ap200, A2, APInt::Rounding::UP),
+            APIntOps::avgCeilS(Ap100, Ap200));
+
+  EXPECT_EQ(APInt(32, -150), APIntOps::avgFloorS(Am100, Am200));
+  EXPECT_EQ(APIntOps::RoundingSDiv(Am100 + Am200, A2, APInt::Rounding::DOWN),
+            APIntOps::avgFloorS(Am100, Am200));
+  EXPECT_EQ(APIntOps::RoundingSDiv(Am100 + Am200, A2, APInt::Rounding::UP),
+            APIntOps::avgCeilS(Am100, Am200));
+
+  EXPECT_EQ(APInt(32, +100), APIntOps::avgFloorS(Ap100, Ap101));
+  EXPECT_EQ(APIntOps::RoundingSDiv(Ap100 + Ap101, A2, APInt::Rounding::DOWN),
+            APIntOps::avgFloorS(Ap100, Ap101));
+  EXPECT_EQ(APInt(32, +101), APIntOps::avgCeilS(Ap100, Ap101));
+  EXPECT_EQ(APIntOps::RoundingSDiv(Ap100 + Ap101, A2, APInt::Rounding::UP),
+            APIntOps::avgCeilS(Ap100, Ap101));
+
+  EXPECT_EQ(APInt(32, -101), APIntOps::avgFloorS(Am100, Am101));
+  EXPECT_EQ(APIntOps::RoundingSDiv(Am100 + Am101, A2, APInt::Rounding::DOWN),
+            APIntOps::avgFloorS(Am100, Am101));
+  EXPECT_EQ(APInt(32, -100), APIntOps::avgCeilS(Am100, Am101));
+  EXPECT_EQ(APIntOps::RoundingSDiv(Am100 + Am101, A2, APInt::Rounding::UP),
+            APIntOps::avgCeilS(Am100, Am101));
+
+  EXPECT_EQ(AmSMin, APIntOps::avgFloorS(AmSMin, AmSMin));
+  EXPECT_EQ(AmSMin, APIntOps::avgCeilS(AmSMin, AmSMin));
+
+  EXPECT_EQ(APIntOps::RoundingSDiv(AmSMin, A2, APInt::Rounding::DOWN),
+            APIntOps::avgFloorS(A0, AmSMin));
+  EXPECT_EQ(APIntOps::RoundingSDiv(AmSMin, A2, APInt::Rounding::UP),
+            APIntOps::avgCeilS(A0, AmSMin));
+
+  EXPECT_EQ(A0, APIntOps::avgFloorS(A0, A0));
+  EXPECT_EQ(A0, APIntOps::avgCeilS(A0, A0));
+
+  EXPECT_EQ(Am1, APIntOps::avgFloorS(AmSMin, ApSMax));
+  EXPECT_EQ(A0, APIntOps::avgCeilS(AmSMin, ApSMax));
+
+  EXPECT_EQ(APIntOps::RoundingSDiv(ApSMax, A2, APInt::Rounding::DOWN),
+            APIntOps::avgFloorS(A0, ApSMax));
+  EXPECT_EQ(APIntOps::RoundingSDiv(ApSMax, A2, APInt::Rounding::UP),
+            APIntOps::avgCeilS(A0, ApSMax));
+
+  EXPECT_EQ(ApSMax, APIntOps::avgFloorS(ApSMax, ApSMax));
+  EXPECT_EQ(ApSMax, APIntOps::avgCeilS(ApSMax, ApSMax));
+}
+
 TEST(APIntTest, umul_ov) {
   const std::pair<uint64_t, uint64_t> Overflows[] = {
       {0x8000000000000000, 2},
@@ -2962,6 +3049,69 @@ TEST(APIntTest, smul_ov) {
       }
 }
 
+TEST(APIntTest, sfloordiv_ov) {
+  // int16 test overflow
+  {
+    using IntTy = int16_t;
+    APInt divisor(8 * sizeof(IntTy), std::numeric_limits<IntTy>::lowest(),
+                  true);
+    APInt dividend(8 * sizeof(IntTy), IntTy(-1), true);
+    bool Overflow = false;
+    (void)divisor.sfloordiv_ov(dividend, Overflow);
+    EXPECT_TRUE(Overflow);
+  }
+  // int32 test overflow
+  {
+    using IntTy = int32_t;
+    APInt divisor(8 * sizeof(IntTy), std::numeric_limits<IntTy>::lowest(),
+                  true);
+    APInt dividend(8 * sizeof(IntTy), IntTy(-1), true);
+    bool Overflow = false;
+    (void)divisor.sfloordiv_ov(dividend, Overflow);
+    EXPECT_TRUE(Overflow);
+  }
+  // int64 test overflow
+  {
+    using IntTy = int64_t;
+    APInt divisor(8 * sizeof(IntTy), std::numeric_limits<IntTy>::lowest(),
+                  true);
+    APInt dividend(8 * sizeof(IntTy), IntTy(-1), true);
+    bool Overflow = false;
+    (void)divisor.sfloordiv_ov(dividend, Overflow);
+    EXPECT_TRUE(Overflow);
+  }
+  // test all of int8
+  {
+    bool Overflow = false;
+    for (int i = -128; i < 128; ++i) {
+      for (int j = -128; j < 128; ++j) {
+        if (j == 0)
+          continue;
+
+        int8_t a = static_cast<int8_t>(i);
+        int8_t b = static_cast<int8_t>(j);
+
+        APInt divisor(8, a, true);
+        APInt dividend(8, b, true);
+        APInt quotient = divisor.sfloordiv_ov(dividend, Overflow);
+
+        if (i == -128 && j == -1) {
+          EXPECT_TRUE(Overflow);
+          continue;
+        }
+
+        if (((i >= 0 && j > 0) || (i <= 0 && j < 0)) ||
+            (i % j == 0)) // if quotient >= 0 and remain == 0 floordiv
+                          // equivalent to div
+          EXPECT_EQ(quotient.getSExtValue(), a / b);
+        else
+          EXPECT_EQ(quotient.getSExtValue(), a / b - 1);
+        EXPECT_FALSE(Overflow);
+      }
+    }
+  }
+}
+
 TEST(APIntTest, SolveQuadraticEquationWrap) {
   // Verify that "Solution" is the first non-negative integer that solves
   // Ax^2 + Bx + C = "0 or overflow", i.e. that it is a correct solution
diff --git a/llvm/unittests/CodeGen/LowLevelTypeTest.cpp b/llvm/unittests/CodeGen/LowLevelTypeTest.cpp
index cb34802a5de27..b60d82b529fa3 100644
--- a/llvm/unittests/CodeGen/LowLevelTypeTest.cpp
+++ b/llvm/unittests/CodeGen/LowLevelTypeTest.cpp
@@ -18,6 +18,24 @@ using namespace llvm;
 
 namespace {
 
+TEST(LowLevelTypeTest, Token) {
+  LLVMContext C;
+  DataLayout DL("");
+
+  const LLT TTy = LLT::token();
+
+  // Test kind.
+  EXPECT_TRUE(TTy.isValid());
+  EXPECT_TRUE(TTy.isScalar());
+  EXPECT_TRUE(TTy.isToken());
+
+  EXPECT_FALSE(TTy.isPointer());
+  EXPECT_FALSE(TTy.isVector());
+
+  const LLT STy = LLT::scalar(0);
+  EXPECT_EQ(STy, TTy);
+}
+
 TEST(LowLevelTypeTest, Scalar) {
   LLVMContext C;
   DataLayout DL("");
@@ -32,6 +50,8 @@ TEST(LowLevelTypeTest, Scalar) {
     ASSERT_FALSE(Ty.isPointer());
     ASSERT_FALSE(Ty.isVector());
 
+    EXPECT_TRUE(S != 0 || Ty.isToken());
+
     // Test sizes.
     EXPECT_EQ(S, Ty.getSizeInBits());
     EXPECT_EQ(S, Ty.getScalarSizeInBits());
@@ -77,6 +97,7 @@ TEST(LowLevelTypeTest, Vector) {
 
       ASSERT_FALSE(VTy.isScalar());
       ASSERT_FALSE(VTy.isPointer());
+      ASSERT_FALSE(VTy.isToken());
 
       // Test sizes.
       EXPECT_EQ(S, VTy.getScalarSizeInBits());
@@ -300,6 +321,7 @@ TEST(LowLevelTypeTest, Invalid) {
   ASSERT_FALSE(Ty.isScalar());
   ASSERT_FALSE(Ty.isPointer());
   ASSERT_FALSE(Ty.isVector());
+  ASSERT_FALSE(Ty.isToken());
 }
 
 TEST(LowLevelTypeTest, Divide) {
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 2b764c9e1f0fd..1967a62bbf9d1 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -133,6 +133,10 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
   SDValue And = DAG->getNode(ISD::AND, DL, Int32VT, Op0, Op1);
   SDValue Xor = DAG->getNode(ISD::XOR, DL, Int32VT, Op1, Op0);
   SDValue Or  = DAG->getNode(ISD::OR, DL, Int32VT, Op0, Op1);
+  SDValue SMax = DAG->getNode(ISD::SMAX, DL, Int32VT, Op0, Op1);
+  SDValue SMin = DAG->getNode(ISD::SMIN, DL, Int32VT, Op1, Op0);
+  SDValue UMax = DAG->getNode(ISD::UMAX, DL, Int32VT, Op0, Op1);
+  SDValue UMin = DAG->getNode(ISD::UMIN, DL, Int32VT, Op1, Op0);
 
   SDValue SFAdd = DAG->getNode(ISD::STRICT_FADD, DL, {Float32VT, MVT::Other},
                                {DAG->getEntryNode(), Op2, Op2});
@@ -155,6 +159,15 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
   EXPECT_TRUE(sd_match(Or, m_c_BinOp(ISD::OR, m_Value(), m_Value())));
   EXPECT_TRUE(sd_match(Or, m_Or(m_Value(), m_Value())));
 
+  EXPECT_TRUE(sd_match(SMax, m_c_BinOp(ISD::SMAX, m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(SMax, m_SMax(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(SMin, m_c_BinOp(ISD::SMIN, m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(SMin, m_SMin(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(UMax, m_c_BinOp(ISD::UMAX, m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(UMax, m_UMax(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(UMin, m_c_BinOp(ISD::UMIN, m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(UMin, m_UMin(m_Value(), m_Value())));
+
   SDValue BindVal;
   EXPECT_TRUE(sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_Value(BindVal),
                                              m_Deferred(BindVal))));
@@ -174,10 +187,20 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   SDValue SExt = DAG->getNode(ISD::SIGN_EXTEND, DL, Int64VT, Op0);
   SDValue Trunc = DAG->getNode(ISD::TRUNCATE, DL, Int32VT, Op1);
 
+  SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Trunc, Op0);
+  SDValue Neg = DAG->getNegative(Op0, DL, Int32VT);
+  SDValue Not = DAG->getNOT(DL, Op0, Int32VT);
+
   using namespace SDPatternMatch;
   EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value())));
   EXPECT_TRUE(sd_match(SExt, m_SExt(m_Value())));
   EXPECT_TRUE(sd_match(Trunc, m_Trunc(m_Specific(Op1))));
+
+  EXPECT_TRUE(sd_match(Neg, m_Neg(m_Value())));
+  EXPECT_TRUE(sd_match(Not, m_Not(m_Value())));
+  EXPECT_FALSE(sd_match(ZExt, m_Neg(m_Value())));
+  EXPECT_FALSE(sd_match(Sub, m_Neg(m_Value())));
+  EXPECT_FALSE(sd_match(Neg, m_Not(m_Value())));
 }
 
 TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
diff --git a/llvm/unittests/DebugInfo/LogicalView/CMakeLists.txt b/llvm/unittests/DebugInfo/LogicalView/CMakeLists.txt
index 1b93d77a20bd6..1116edb212b05 100644
--- a/llvm/unittests/DebugInfo/LogicalView/CMakeLists.txt
+++ b/llvm/unittests/DebugInfo/LogicalView/CMakeLists.txt
@@ -12,7 +12,7 @@ add_llvm_unittest_with_input_files(DebugInfoLogicalViewTests
   CodeViewReaderTest.cpp
   CommandLineOptionsTest.cpp
   CompareElementsTest.cpp
-  ELFReaderTest.cpp
+  DWARFReaderTest.cpp
   SelectElementsTest.cpp
   LocationRangesTest.cpp
   LogicalElementsTest.cpp
diff --git a/llvm/unittests/DebugInfo/LogicalView/ELFReaderTest.cpp b/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp
similarity index 99%
rename from llvm/unittests/DebugInfo/LogicalView/ELFReaderTest.cpp
rename to llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp
index bd278befbc642..a1bd5dee35656 100644
--- a/llvm/unittests/DebugInfo/LogicalView/ELFReaderTest.cpp
+++ b/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp
@@ -1,4 +1,4 @@
-//===- llvm/unittest/DebugInfo/LogicalView/ELFReaderTest.cpp --------------===//
+//===- llvm/unittest/DebugInfo/LogicalView/DWARFReaderTest.cpp ------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -323,7 +323,7 @@ void compareElements(SmallString<128> &InputsDir) {
   checkElementComparison(Reference.get(), Target.get());
 }
 
-TEST(LogicalViewTest, ELFReader) {
+TEST(LogicalViewTest, DWARFReader) {
   // Initialize targets and assembly printers/parsers.
   llvm::InitializeAllTargetInfos();
   llvm::InitializeAllTargetMCs();
diff --git a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
index f102ba59e3754..8a6a26bba63c2 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_unittest(OrcJITTests
   JITTargetMachineBuilderTest.cpp
   LazyCallThroughAndReexportsTest.cpp
   LookupAndRecordAddrsTest.cpp
+  MachOPlatformTest.cpp
   MapperJITLinkMemoryManagerTest.cpp
   MemoryMapperTest.cpp
   ObjectFormatsTest.cpp
diff --git a/llvm/unittests/ExecutionEngine/Orc/MachOPlatformTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MachOPlatformTest.cpp
new file mode 100644
index 0000000000000..bf6c1042252d7
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/MachOPlatformTest.cpp
@@ -0,0 +1,56 @@
+//===---------- MachOPlatformTest.cpp - MachPlatform API Tests ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::orc;
+
+TEST(MachOPlatformTests, BuildVersionOptsFromTriple) {
+
+  auto darwinOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-darwin"), 0, 0);
+  EXPECT_FALSE(darwinOS);
+
+  auto macOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-macosx"), 0, 0);
+  EXPECT_TRUE(macOS);
+  EXPECT_EQ(macOS->Platform, MachO::PLATFORM_MACOS);
+
+  auto iOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-ios"), 0, 0);
+  EXPECT_TRUE(iOS);
+  EXPECT_EQ(iOS->Platform, MachO::PLATFORM_IOS);
+
+  auto iOSSim = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-ios-simulator"), 0, 0);
+  EXPECT_TRUE(iOSSim);
+  EXPECT_EQ(iOSSim->Platform, MachO::PLATFORM_IOSSIMULATOR);
+
+  auto tvOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-tvos"), 0, 0);
+  EXPECT_TRUE(tvOS);
+  EXPECT_EQ(tvOS->Platform, MachO::PLATFORM_TVOS);
+
+  auto tvOSSim = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-tvos-simulator"), 0, 0);
+  EXPECT_TRUE(tvOSSim);
+  EXPECT_EQ(tvOSSim->Platform, MachO::PLATFORM_TVOSSIMULATOR);
+
+  auto watchOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-watchos"), 0, 0);
+  EXPECT_TRUE(watchOS);
+  EXPECT_EQ(watchOS->Platform, MachO::PLATFORM_WATCHOS);
+
+  auto watchOSSim = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-watchos-simulator"), 0, 0);
+  EXPECT_TRUE(watchOSSim);
+  EXPECT_EQ(watchOSSim->Platform, MachO::PLATFORM_WATCHOSSIMULATOR);
+}
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index fdbe8df783b11..5c415cadcd686 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -5856,6 +5856,23 @@ TEST_F(OpenMPIRBuilderTest, TargetDataRegion) {
   EXPECT_TRUE(TargetDataCall->getOperand(2)->getType()->isIntegerTy(32));
   EXPECT_TRUE(TargetDataCall->getOperand(8)->getType()->isPointerTy());
 
+  // Check that BodyGenCB is still made when IsTargetDevice is set to true.
+  OMPBuilder.Config.setIsTargetDevice(true);
+  bool CheckDevicePassBodyGen = false;
+  auto BodyTargetCB = [&](InsertPointTy CodeGenIP, BodyGenTy BodyGenType) {
+    CheckDevicePassBodyGen = true;
+    Builder.restoreIP(CodeGenIP);
+    CallInst *TargetDataCall =
+        dyn_cast<CallInst>(BB->back().getPrevNode()->getPrevNode());
+    // Make sure no begin_mapper call is present for device pass.
+    EXPECT_EQ(TargetDataCall, nullptr);
+    return Builder.saveIP();
+  };
+  Builder.restoreIP(OMPBuilder.createTargetData(
+      Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID),
+      /* IfCond= */ nullptr, Info, GenMapInfoCB, nullptr, BodyTargetCB));
+  EXPECT_TRUE(CheckDevicePassBodyGen);
+
   Builder.CreateRetVoid();
   EXPECT_FALSE(verifyModule(*M, &errs()));
 }
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index e23c7eaa4930d..92658b7b6895f 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -41,7 +41,7 @@ namespace {
 // position that it already resides at. This is fine -- but gets complicated
 // with dbg.value intrinsics. By moving an instruction, we can end up changing
 // nothing but the location of debug-info intrinsics. That has to be modelled
-// by DPValues, the dbg.value replacement.
+// by DbgVariableRecords, the dbg.value replacement.
 TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
   LLVMContext C;
   UseNewDbgInfoFormat = true;
@@ -90,10 +90,10 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
   //    %b = add
   //    %c = add
   //    dbg.value
-  // Check that this is replicated by DPValues.
+  // Check that this is replicated by DbgVariableRecords.
   Inst2->moveAfter(Inst1);
 
-  // Inst1 should only have one DPValue on it.
+  // Inst1 should only have one DbgVariableRecord on it.
   EXPECT_TRUE(Inst1->hasDbgRecords());
   auto Range1 = Inst1->getDbgRecordRange();
   EXPECT_EQ(std::distance(Range1.begin(), Range1.end()), 1u);
@@ -149,7 +149,7 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   Instruction *Instr2 = Instr1->getNextNode();
   DPMarker *Marker1 = Instr1->DbgMarker;
   DPMarker *Marker2 = Instr2->DbgMarker;
-  // There's no TrailingDPValues marker allocated yet.
+  // There's no TrailingDbgRecords marker allocated yet.
   DPMarker *EndMarker = nullptr;
 
   // Check that the "getMarker" utilities operate as expected.
@@ -158,83 +158,83 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   EXPECT_EQ(BB.getNextMarker(Instr1), Marker2);
   EXPECT_EQ(BB.getNextMarker(Instr2), EndMarker); // Is nullptr.
 
-  // There should be two DPValues,
-  EXPECT_EQ(Marker1->StoredDPValues.size(), 1u);
-  EXPECT_EQ(Marker2->StoredDPValues.size(), 1u);
+  // There should be two DbgVariableRecords,
+  EXPECT_EQ(Marker1->StoredDbgRecords.size(), 1u);
+  EXPECT_EQ(Marker2->StoredDbgRecords.size(), 1u);
 
   // Unlink them and try to re-insert them through the basic block.
-  DbgRecord *DPV1 = &*Marker1->StoredDPValues.begin();
-  DbgRecord *DPV2 = &*Marker2->StoredDPValues.begin();
-  DPV1->removeFromParent();
-  DPV2->removeFromParent();
-  EXPECT_TRUE(Marker1->StoredDPValues.empty());
-  EXPECT_TRUE(Marker2->StoredDPValues.empty());
+  DbgRecord *DVR1 = &*Marker1->StoredDbgRecords.begin();
+  DbgRecord *DVR2 = &*Marker2->StoredDbgRecords.begin();
+  DVR1->removeFromParent();
+  DVR2->removeFromParent();
+  EXPECT_TRUE(Marker1->StoredDbgRecords.empty());
+  EXPECT_TRUE(Marker2->StoredDbgRecords.empty());
 
   // This should appear in Marker1.
-  BB.insertDbgRecordBefore(DPV1, BB.begin());
-  EXPECT_EQ(Marker1->StoredDPValues.size(), 1u);
-  EXPECT_EQ(DPV1, &*Marker1->StoredDPValues.begin());
+  BB.insertDbgRecordBefore(DVR1, BB.begin());
+  EXPECT_EQ(Marker1->StoredDbgRecords.size(), 1u);
+  EXPECT_EQ(DVR1, &*Marker1->StoredDbgRecords.begin());
 
   // This should attach to Marker2.
-  BB.insertDbgRecordAfter(DPV2, &*BB.begin());
-  EXPECT_EQ(Marker2->StoredDPValues.size(), 1u);
-  EXPECT_EQ(DPV2, &*Marker2->StoredDPValues.begin());
+  BB.insertDbgRecordAfter(DVR2, &*BB.begin());
+  EXPECT_EQ(Marker2->StoredDbgRecords.size(), 1u);
+  EXPECT_EQ(DVR2, &*Marker2->StoredDbgRecords.begin());
 
-  // Now, how about removing instructions? That should cause any DPValues to
-  // "fall down".
+  // Now, how about removing instructions? That should cause any
+  // DbgVariableRecords to "fall down".
   Instr1->removeFromParent();
   Marker1 = nullptr;
-  // DPValues should now be in Marker2.
+  // DbgVariableRecords should now be in Marker2.
   EXPECT_EQ(BB.size(), 1u);
-  EXPECT_EQ(Marker2->StoredDPValues.size(), 2u);
+  EXPECT_EQ(Marker2->StoredDbgRecords.size(), 2u);
   // They should also be in the correct order.
-  SmallVector<DbgRecord *, 2> DPVs;
-  for (DbgRecord &DPV : Marker2->getDbgRecordRange())
-    DPVs.push_back(&DPV);
-  EXPECT_EQ(DPVs[0], DPV1);
-  EXPECT_EQ(DPVs[1], DPV2);
-
-  // If we remove the end instruction, the DPValues should fall down into
-  // the trailing marker.
+  SmallVector<DbgRecord *, 2> DVRs;
+  for (DbgRecord &DVR : Marker2->getDbgRecordRange())
+    DVRs.push_back(&DVR);
+  EXPECT_EQ(DVRs[0], DVR1);
+  EXPECT_EQ(DVRs[1], DVR2);
+
+  // If we remove the end instruction, the DbgVariableRecords should fall down
+  // into the trailing marker.
   EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
   Instr2->removeFromParent();
   EXPECT_TRUE(BB.empty());
   EndMarker = BB.getTrailingDbgRecords();
   ASSERT_NE(EndMarker, nullptr);
-  EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
   // Again, these should arrive in the correct order.
 
-  DPVs.clear();
-  for (DbgRecord &DPV : EndMarker->getDbgRecordRange())
-    DPVs.push_back(&DPV);
-  EXPECT_EQ(DPVs[0], DPV1);
-  EXPECT_EQ(DPVs[1], DPV2);
+  DVRs.clear();
+  for (DbgRecord &DVR : EndMarker->getDbgRecordRange())
+    DVRs.push_back(&DVR);
+  EXPECT_EQ(DVRs[0], DVR1);
+  EXPECT_EQ(DVRs[1], DVR2);
 
   // Inserting a normal instruction at the beginning: shouldn't dislodge the
-  // DPValues. It's intended to not go at the start.
+  // DbgVariableRecords. It's intended to not go at the start.
   Instr1->insertBefore(BB, BB.begin());
-  EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
   Instr1->removeFromParent();
 
-  // Inserting at end(): should dislodge the DPValues, if they were dbg.values
-  // then they would sit "above" the new instruction.
+  // Inserting at end(): should dislodge the DbgVariableRecords, if they were
+  // dbg.values then they would sit "above" the new instruction.
   Instr1->insertBefore(BB, BB.end());
-  EXPECT_EQ(Instr1->DbgMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(Instr1->DbgMarker->StoredDbgRecords.size(), 2u);
   // We should de-allocate the trailing marker when something is inserted
   // at end().
   EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
 
-  // Remove Instr1: now the DPValues will fall down again,
+  // Remove Instr1: now the DbgVariableRecords will fall down again,
   Instr1->removeFromParent();
   EndMarker = BB.getTrailingDbgRecords();
-  EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
 
   // Inserting a terminator, however it's intended, should dislodge the
-  // trailing DPValues, as it's the clear intention of the caller that this be
-  // the final instr in the block, and DPValues aren't allowed to live off the
-  // end forever.
+  // trailing DbgVariableRecords, as it's the clear intention of the caller that
+  // this be the final instr in the block, and DbgVariableRecords aren't allowed
+  // to live off the end forever.
   Instr2->insertBefore(BB, BB.begin());
-  EXPECT_EQ(Instr2->DbgMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(Instr2->DbgMarker->StoredDbgRecords.size(), 2u);
   EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
 
   // Teardown,
@@ -298,34 +298,35 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   Instruction *DInst = CInst->getNextNode();
   // CInst should have debug-info.
   ASSERT_TRUE(CInst->DbgMarker);
-  EXPECT_FALSE(CInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_FALSE(CInst->DbgMarker->StoredDbgRecords.empty());
 
-  // If we move "c" to the start of the block, just normally, then the DPValues
-  // should fall down to "d".
+  // If we move "c" to the start of the block, just normally, then the
+  // DbgVariableRecords should fall down to "d".
   CInst->moveBefore(BB, BeginIt2);
-  EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDbgRecords.empty());
   ASSERT_TRUE(DInst->DbgMarker);
-  EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
 
   // Wheras if we move D to the start of the block with moveBeforePreserving,
-  // the DPValues should move with it.
+  // the DbgVariableRecords should move with it.
   DInst->moveBeforePreserving(BB, BB.begin());
-  EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), DInst);
 
-  // Similarly, moveAfterPreserving "D" to "C" should move DPValues with "D".
+  // Similarly, moveAfterPreserving "D" to "C" should move DbgVariableRecords
+  // with "D".
   DInst->moveAfterPreserving(CInst);
-  EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
 
   // (move back to the start...)
   DInst->moveBeforePreserving(BB, BB.begin());
 
-  // Current order of insts: "D -> C -> B -> Ret". DPValues on "D".
+  // Current order of insts: "D -> C -> B -> Ret". DbgVariableRecords on "D".
   // If we move "C" to the beginning of the block, it should go before the
-  // DPValues. They'll stay on "D".
+  // DbgVariableRecords. They'll stay on "D".
   CInst->moveBefore(BB, BB.begin());
-  EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDPValues.empty());
-  EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDbgRecords.empty());
+  EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), CInst);
   EXPECT_EQ(CInst->getNextNode(), DInst);
 
@@ -333,16 +334,16 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   CInst->moveBefore(BInst);
   EXPECT_EQ(&*BB.begin(), DInst);
 
-  // Current order of insts: "D -> C -> B -> Ret". DPValues on "D".
+  // Current order of insts: "D -> C -> B -> Ret". DbgVariableRecords on "D".
   // Now move CInst to the position of DInst, but using getIterator instead of
   // BasicBlock::begin. This signals that we want the "C" instruction to be
-  // immediately before "D", with any DPValues on "D" now moving to "C".
-  // It's the equivalent of moving an instruction to the position between a
+  // immediately before "D", with any DbgVariableRecords on "D" now moving to
+  // "C". It's the equivalent of moving an instruction to the position between a
   // run of dbg.values and the next instruction.
   CInst->moveBefore(BB, DInst->getIterator());
-  // CInst gains the DPValues.
-  EXPECT_TRUE(!DInst->DbgMarker || DInst->DbgMarker->StoredDPValues.empty());
-  EXPECT_FALSE(CInst->DbgMarker->StoredDPValues.empty());
+  // CInst gains the DbgVariableRecords.
+  EXPECT_TRUE(!DInst->DbgMarker || DInst->DbgMarker->StoredDbgRecords.empty());
+  EXPECT_FALSE(CInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), CInst);
 
   UseNewDbgInfoFormat = false;
@@ -378,8 +379,8 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
     !11 = !DILocation(line: 1, column: 1, scope: !6)
 )");
 
-  // Check that DPValues can be accessed from Instructions without digging
-  // into the depths of DPMarkers.
+  // Check that DbgVariableRecords can be accessed from Instructions without
+  // digging into the depths of DPMarkers.
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
   // Convert the module to "new" form debug-info.
   M->convertToNewDbgValues();
@@ -390,26 +391,26 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
 
   ASSERT_FALSE(BInst->DbgMarker);
   ASSERT_TRUE(CInst->DbgMarker);
-  ASSERT_EQ(CInst->DbgMarker->StoredDPValues.size(), 1u);
-  DbgRecord *DPV1 = &*CInst->DbgMarker->StoredDPValues.begin();
-  ASSERT_TRUE(DPV1);
+  ASSERT_EQ(CInst->DbgMarker->StoredDbgRecords.size(), 1u);
+  DbgRecord *DVR1 = &*CInst->DbgMarker->StoredDbgRecords.begin();
+  ASSERT_TRUE(DVR1);
   EXPECT_FALSE(BInst->hasDbgRecords());
 
-  // Clone DPValues from one inst to another. Other arguments to clone are
-  // tested in DPMarker test.
+  // Clone DbgVariableRecords from one inst to another. Other arguments to clone
+  // are tested in DPMarker test.
   auto Range1 = BInst->cloneDebugInfoFrom(CInst);
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 1u);
-  DbgRecord *DPV2 = &*BInst->DbgMarker->StoredDPValues.begin();
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
+  DbgRecord *DVR2 = &*BInst->DbgMarker->StoredDbgRecords.begin();
   EXPECT_EQ(std::distance(Range1.begin(), Range1.end()), 1u);
-  EXPECT_EQ(&*Range1.begin(), DPV2);
-  EXPECT_NE(DPV1, DPV2);
+  EXPECT_EQ(&*Range1.begin(), DVR2);
+  EXPECT_NE(DVR1, DVR2);
 
   // We should be able to get a range over exactly the same information.
   auto Range2 = BInst->getDbgRecordRange();
   EXPECT_EQ(Range1.begin(), Range2.begin());
   EXPECT_EQ(Range1.end(), Range2.end());
 
-  // We should be able to query if there are DPValues,
+  // We should be able to query if there are DbgVariableRecords,
   EXPECT_TRUE(BInst->hasDbgRecords());
   EXPECT_TRUE(CInst->hasDbgRecords());
   EXPECT_FALSE(DInst->hasDbgRecords());
@@ -417,12 +418,12 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   // Dropping should be easy,
   BInst->dropDbgRecords();
   EXPECT_FALSE(BInst->hasDbgRecords());
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 0u);
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 0u);
 
-  // And we should be able to drop individual DPValues.
-  CInst->dropOneDbgRecord(DPV1);
+  // And we should be able to drop individual DbgVariableRecords.
+  CInst->dropOneDbgRecord(DVR1);
   EXPECT_FALSE(CInst->hasDbgRecords());
-  EXPECT_EQ(CInst->DbgMarker->StoredDPValues.size(), 0u);
+  EXPECT_EQ(CInst->DbgMarker->StoredDbgRecords.size(), 0u);
 
   UseNewDbgInfoFormat = false;
 }
@@ -514,7 +515,7 @@ class DbgSpliceTest : public ::testing::Test {
   BasicBlock *BBEntry, *BBExit;
   BasicBlock::iterator Dest, First, Last;
   Instruction *BInst, *Branch, *CInst;
-  DPValue *DPVA, *DPVB, *DPVConst;
+  DbgVariableRecord *DVRA, *DVRB, *DVRConst;
 
   void SetUp() override {
     UseNewDbgInfoFormat = true;
@@ -531,18 +532,21 @@ class DbgSpliceTest : public ::testing::Test {
     Branch = &*Last;
     CInst = &*Dest;
 
-    DPVA = cast<DPValue>(&*BInst->DbgMarker->StoredDPValues.begin());
-    DPVB = cast<DPValue>(&*Branch->DbgMarker->StoredDPValues.begin());
-    DPVConst = cast<DPValue>(&*CInst->DbgMarker->StoredDPValues.begin());
+    DVRA =
+        cast<DbgVariableRecord>(&*BInst->DbgMarker->StoredDbgRecords.begin());
+    DVRB =
+        cast<DbgVariableRecord>(&*Branch->DbgMarker->StoredDbgRecords.begin());
+    DVRConst =
+        cast<DbgVariableRecord>(&*CInst->DbgMarker->StoredDbgRecords.begin());
   }
 
   void TearDown() override { UseNewDbgInfoFormat = false; }
 
-  bool InstContainsDPValue(Instruction *I, DPValue *DPV) {
+  bool InstContainsDbgVariableRecord(Instruction *I, DbgVariableRecord *DVR) {
     for (DbgRecord &D : I->getDbgRecordRange()) {
-      if (&D == DPV) {
+      if (&D == DVR) {
         // Confirm too that the links between the records are correct.
-        EXPECT_EQ(DPV->Marker, I->DbgMarker);
+        EXPECT_EQ(DVR->Marker, I->DbgMarker);
         EXPECT_EQ(I->DbgMarker->MarkedInstr, I);
         return true;
       }
@@ -550,7 +554,8 @@ class DbgSpliceTest : public ::testing::Test {
     return false;
   }
 
-  bool CheckDPVOrder(Instruction *I, SmallVector<DPValue *> CheckVals) {
+  bool CheckDVROrder(Instruction *I,
+                     SmallVector<DbgVariableRecord *> CheckVals) {
     SmallVector<DbgRecord *> Vals;
     for (DbgRecord &D : I->getDbgRecordRange())
       Vals.push_back(&D);
@@ -578,15 +583,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest0) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, not including leading dbg.value, to Last, including the
@@ -595,15 +600,14 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0, !dbg !11
         }
 
 
@@ -613,14 +617,14 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Dest, in exit block.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVB));
+  // DVRB: should be on Dest, in exit block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRB));
 
-  // DPVA, should have "fallen" onto the branch, remained in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVA));
+  // DVRA, should have "fallen" onto the branch, remained in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRA));
 
-  // DPVConst should be on the moved %b instruction.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVConst));
+  // DVRConst should be on the moved %b instruction.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRConst));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest1) {
@@ -631,15 +635,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest1) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, not including leading dbg.value, to Last, including the
@@ -648,15 +652,15 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
 First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRConst  call void @llvm.dbg.value(metadata i16 0,
+metadata !9, metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1,
+!dbg !11 ret i16 0, !dbg !11
         }
 
 
@@ -666,17 +670,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on CInst, in exit block.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVB));
+  // DVRB: should be on CInst, in exit block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRB));
 
-  // DPVA, should have "fallen" onto the branch, remained in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVA));
+  // DVRA, should have "fallen" onto the branch, remained in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRA));
 
-  // DPVConst should be behind / after the moved instructions, remain on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be behind / after the moved instructions, remain on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVB and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(CInst, {DPVB, DPVConst}));
+  // Order of DVRB and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(CInst, {DVRB, DVRConst}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest2) {
@@ -687,15 +691,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest2) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from head of First, which includes the leading dbg.value, to Last,
@@ -707,12 +711,12 @@ BBEntry entry:
 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRA      call void @llvm.dbg.value(metadata i16 %a,
+metadata !9, metadata !DIExpression()), !dbg !11 First     %b = add i16 %a, 1,
+!dbg !11 DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9,
+metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret
+i16 0, !dbg !11
         }
 
 
@@ -721,18 +725,18 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(BInst->getParent(), BBExit);
   EXPECT_EQ(CInst->getParent(), BBExit);
 
-  // DPVB: should be on CInst, in exit block.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVB));
+  // DVRB: should be on CInst, in exit block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRB));
 
-  // DPVA, should have transferred with the spliced instructions, remains on
+  // DVRA, should have transferred with the spliced instructions, remains on
   // the "b" inst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVA));
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRA));
 
-  // DPVConst should be ahead of the moved instructions, ahead of BInst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVConst));
+  // DVRConst should be ahead of the moved instructions, ahead of BInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRConst));
 
-  // Order of DPVA and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(BInst, {DPVConst, DPVA}));
+  // Order of DVRA and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(BInst, {DVRConst, DVRA}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest3) {
@@ -743,15 +747,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest3) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from head of First, which includes the leading dbg.value, to Last,
@@ -763,12 +767,12 @@ BBEntry entry:
 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9,
+metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret
+i16 0, !dbg !11
         }
 
   */
@@ -776,18 +780,18 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(BInst->getParent(), BBExit);
   EXPECT_EQ(CInst->getParent(), BBExit);
 
-  // DPVB: should be on CInst, in exit block.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVB));
+  // DVRB: should be on CInst, in exit block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRB));
 
-  // DPVA, should have transferred with the spliced instructions, remains on
+  // DVRA, should have transferred with the spliced instructions, remains on
   // the "b" inst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVA));
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRA));
 
-  // DPVConst should be behind the moved instructions, ahead of CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be behind the moved instructions, ahead of CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVB and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(CInst, {DPVB, DPVConst}));
+  // Order of DVRB and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(CInst, {DVRB, DVRConst}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest4) {
@@ -798,15 +802,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest4) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, not including the leading dbg.value, to Last, but NOT
@@ -815,15 +819,15 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRB      call void @llvm.dbg.value(metadata i16 %b,
+metadata !9, metadata !DIExpression()), !dbg !11 Last      br label %exit, !dbg
+!11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 Dest      %c =
+add i16 %b, 1, !dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -832,17 +836,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have remained in entry block, falls onto Branch inst.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVA));
+  // DVRA, should have remained in entry block, falls onto Branch inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRA));
 
-  // DPVConst should be ahead of the moved instructions, BInst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVConst));
+  // DVRConst should be ahead of the moved instructions, BInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRConst));
 
-  // Order of DPVA and DPVA should be thus:
-  EXPECT_TRUE(CheckDPVOrder(Branch, {DPVA, DPVB}));
+  // Order of DVRA and DVRA should be thus:
+  EXPECT_TRUE(CheckDVROrder(Branch, {DVRA, DVRB}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest5) {
@@ -853,15 +857,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest5) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, not including the leading dbg.value, to Last, but NOT
@@ -870,15 +874,16 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRB      call void @llvm.dbg.value(metadata i16 %b,
+metadata !9, metadata !DIExpression()), !dbg !11 Last      br label %exit, !dbg
+!11
 
 BBExit  exit:
 First     %b = add i16 %a, 1, !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
   */
@@ -887,17 +892,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have remained in entry block, falls onto Branch inst.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVA));
+  // DVRA, should have remained in entry block, falls onto Branch inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRA));
 
-  // DPVConst should be behind of the moved instructions, on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be behind of the moved instructions, on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVA and DPVB should be thus:
-  EXPECT_TRUE(CheckDPVOrder(Branch, {DPVA, DPVB}));
+  // Order of DVRA and DVRB should be thus:
+  EXPECT_TRUE(CheckDVROrder(Branch, {DVRA, DVRB}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest6) {
@@ -908,15 +913,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest6) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, including the leading dbg.value, to Last, but NOT
@@ -925,15 +930,14 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRA      call void @llvm.dbg.value(metadata i16 %a,
+metadata !9, metadata !DIExpression()), !dbg !11 First     %b = add i16 %a, 1,
+!dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -942,17 +946,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have transferred to BBExit, on B inst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVA));
+  // DVRA, should have transferred to BBExit, on B inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRA));
 
-  // DPVConst should be ahead of the moved instructions, on BInst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVConst));
+  // DVRConst should be ahead of the moved instructions, on BInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRConst));
 
-  // Order of DPVA and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(BInst, {DPVConst, DPVA}));
+  // Order of DVRA and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(BInst, {DVRConst, DVRA}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest7) {
@@ -963,15 +967,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest7) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, including the leading dbg.value, to Last, but NOT
@@ -980,15 +984,14 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRConst  call
+void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()),
+!dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -997,21 +1000,21 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have transferred to BBExit, on B inst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVA));
+  // DVRA, should have transferred to BBExit, on B inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRA));
 
-  // DPVConst should be after of the moved instructions, on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be after of the moved instructions, on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 }
 
 // But wait, there's more! What if you splice a range that is empty, but
 // implicitly contains debug-info? In the dbg.value design for debug-info,
-// this would be an explicit range, but in DPValue debug-info, it isn't.
-// Check that if we try to do that, with differing head-bit values, that
-// DPValues are transferred.
+// this would be an explicit range, but in DbgVariableRecord debug-info, it
+// isn't. Check that if we try to do that, with differing head-bit values, that
+// DbgVariableRecords are transferred.
 // Test with empty transfers to Dest, with head bit set and not set.
 
 TEST_F(DbgSpliceTest, DbgSpliceEmpty0) {
@@ -1021,31 +1024,31 @@ TEST_F(DbgSpliceTest, DbgSpliceEmpty0) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from BBEntry.getFirstInsertionPt to First -- this implicitly is a
-    splice of DPVA, but the iterators are pointing at the same instruction. The
+    splice of DVRA, but the iterators are pointing at the same instruction. The
     only difference is the setting of the head bit. Becomes;
 
         define i16 @f(i16 %a) !dbg !6 {
 First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRA      call void @llvm.dbg.value(metadata i16 %a,
+metadata !9, metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1,
+!dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -1054,17 +1057,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have transferred to BBExit, on C inst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVA));
+  // DVRA, should have transferred to BBExit, on C inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRA));
 
-  // DPVConst should be ahead of the moved DPValue, on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be ahead of the moved DbgVariableRecord, on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVA and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(CInst, {DPVConst, DPVA}));
+  // Order of DVRA and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(CInst, {DVRConst, DVRA}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceEmpty1) {
@@ -1074,32 +1077,32 @@ TEST_F(DbgSpliceTest, DbgSpliceEmpty1) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from BBEntry.getFirstInsertionPt to First -- this implicitly is a
-    splice of DPVA, but the iterators are pointing at the same instruction. The
+    splice of DVRA, but the iterators are pointing at the same instruction. The
     only difference is the setting of the head bit. Insert at head of Dest,
-    i.e. before DPVConst. Becomes;
+    i.e. before DVRConst. Becomes;
 
         define i16 @f(i16 %a) !dbg !6 {
 First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRConst  call void @llvm.dbg.value(metadata i16 0,
+metadata !9, metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1,
+!dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -1108,21 +1111,21 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have transferred to BBExit, on C inst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVA));
+  // DVRA, should have transferred to BBExit, on C inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRA));
 
-  // DPVConst should be ahead of the moved DPValue, on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be ahead of the moved DbgVariableRecord, on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVA and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(CInst, {DPVA, DPVConst}));
+  // Order of DVRA and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(CInst, {DVRA, DVRConst}));
 }
 
-// If we splice new instructions into a block with trailing DPValues, then
-// the trailing DPValues should get flushed back out.
+// If we splice new instructions into a block with trailing DbgVariableRecords,
+// then the trailing DbgVariableRecords should get flushed back out.
 TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   LLVMContext C;
   UseNewDbgInfoFormat = true;
@@ -1159,7 +1162,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   BasicBlock &Exit = *Entry.getNextNode();
   M->convertToNewDbgValues();
 
-  // Begin by forcing entry block to have dangling DPValue.
+  // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
   ASSERT_NE(Entry.getTrailingDbgRecords(), nullptr);
   EXPECT_TRUE(Entry.empty());
@@ -1167,19 +1170,19 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   // Now transfer the entire contents of the exit block into the entry.
   Entry.splice(Entry.end(), &Exit, Exit.begin(), Exit.end());
 
-  // The trailing DPValue should have been placed at the front of what's been
-  // spliced in.
+  // The trailing DbgVariableRecord should have been placed at the front of
+  // what's been spliced in.
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->DbgMarker);
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 1u);
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
 
   UseNewDbgInfoFormat = false;
 }
 
-// When we remove instructions from the program, adjacent DPValues coalesce
-// together into one DPMarker. In "old" dbg.value mode you could re-insert
-// the removed instruction back into the middle of a sequence of dbg.values.
-// Test that this can be replicated correctly by DPValues
+// When we remove instructions from the program, adjacent DbgVariableRecords
+// coalesce together into one DPMarker. In "old" dbg.value mode you could
+// re-insert the removed instruction back into the middle of a sequence of
+// dbg.values. Test that this can be replicated correctly by DbgVariableRecords
 TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   LLVMContext C;
   UseNewDbgInfoFormat = true;
@@ -1221,7 +1224,7 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   Instruction *RetInst = AddInst->getNextNode();
   ASSERT_TRUE(isa<ReturnInst>(RetInst));
 
-  // add and sub should both have one DPValue on add and ret.
+  // add and sub should both have one DbgVariableRecord on add and ret.
   EXPECT_FALSE(SubInst->hasDbgRecords());
   EXPECT_TRUE(AddInst->hasDbgRecords());
   EXPECT_TRUE(RetInst->hasDbgRecords());
@@ -1232,20 +1235,21 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
 
   // The Supported (TM) code sequence for removing then reinserting insts
   // after another instruction:
-  std::optional<DPValue::self_iterator> Pos =
+  std::optional<DbgVariableRecord::self_iterator> Pos =
       AddInst->getDbgReinsertionPosition();
   AddInst->removeFromParent();
 
   // We should have a re-insertion position.
   ASSERT_TRUE(Pos);
-  // Both DPValues should now be attached to the ret inst.
+  // Both DbgVariableRecords should now be attached to the ret inst.
   auto R3 = RetInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(R3.begin(), R3.end()), 2u);
 
   // Re-insert and re-insert.
   AddInst->insertAfter(SubInst);
   Entry.reinsertInstInDbgRecords(AddInst, Pos);
-  // We should be back into a position of having one DPValue on add and ret.
+  // We should be back into a position of having one DbgVariableRecord on add
+  // and ret.
   EXPECT_FALSE(SubInst->hasDbgRecords());
   EXPECT_TRUE(AddInst->hasDbgRecords());
   EXPECT_TRUE(RetInst->hasDbgRecords());
@@ -1257,9 +1261,9 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   UseNewDbgInfoFormat = false;
 }
 
-// Test instruction removal and re-insertion, this time with one DPValue that
-// should hop up one instruction.
-TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDPValue) {
+// Test instruction removal and re-insertion, this time with one
+// DbgVariableRecord that should hop up one instruction.
+TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
   LLVMContext C;
   UseNewDbgInfoFormat = true;
 
@@ -1299,7 +1303,7 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDPValue) {
   Instruction *RetInst = AddInst->getNextNode();
   ASSERT_TRUE(isa<ReturnInst>(RetInst));
 
-  // There should be one DPValue.
+  // There should be one DbgVariableRecord.
   EXPECT_FALSE(SubInst->hasDbgRecords());
   EXPECT_TRUE(AddInst->hasDbgRecords());
   EXPECT_FALSE(RetInst->hasDbgRecords());
@@ -1307,13 +1311,13 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDPValue) {
   EXPECT_EQ(std::distance(R1.begin(), R1.end()), 1u);
 
   // The Supported (TM) code sequence for removing then reinserting insts:
-  std::optional<DPValue::self_iterator> Pos =
+  std::optional<DbgVariableRecord::self_iterator> Pos =
       AddInst->getDbgReinsertionPosition();
   AddInst->removeFromParent();
 
-  // No re-insertion position as there were no DPValues on the ret.
+  // No re-insertion position as there were no DbgVariableRecords on the ret.
   ASSERT_FALSE(Pos);
-  // The single DPValue should now be attached to the ret inst.
+  // The single DbgVariableRecord should now be attached to the ret inst.
   EXPECT_TRUE(RetInst->hasDbgRecords());
   auto R2 = RetInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(R2.begin(), R2.end()), 1u);
@@ -1321,7 +1325,8 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDPValue) {
   // Re-insert and re-insert.
   AddInst->insertAfter(SubInst);
   Entry.reinsertInstInDbgRecords(AddInst, Pos);
-  // We should be back into a position of having one DPValue on the AddInst.
+  // We should be back into a position of having one DbgVariableRecord on the
+  // AddInst.
   EXPECT_FALSE(SubInst->hasDbgRecords());
   EXPECT_TRUE(AddInst->hasDbgRecords());
   EXPECT_FALSE(RetInst->hasDbgRecords());
@@ -1374,7 +1379,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
   BasicBlock &Exit = *Entry.getNextNode();
   M->convertToNewDbgValues();
 
-  // Begin by forcing entry block to have dangling DPValue.
+  // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
   ASSERT_NE(Entry.getTrailingDbgRecords(), nullptr);
   EXPECT_TRUE(Entry.empty());
@@ -1387,17 +1392,17 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
   // should be in the correct order of %a, then 0.
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->hasDbgRecords());
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 2u);
-  SmallVector<DPValue *, 2> DPValues;
-  for (DbgRecord &DPV : BInst->getDbgRecordRange())
-    DPValues.push_back(cast<DPValue>(&DPV));
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 2u);
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
+  for (DbgRecord &DVR : BInst->getDbgRecordRange())
+    DbgVariableRecords.push_back(cast<DbgVariableRecord>(&DVR));
 
-  EXPECT_EQ(DPValues[0]->getVariableLocationOp(0), F.getArg(0));
-  Value *SecondDPVValue = DPValues[1]->getVariableLocationOp(0);
-  ASSERT_TRUE(isa<ConstantInt>(SecondDPVValue));
-  EXPECT_EQ(cast<ConstantInt>(SecondDPVValue)->getZExtValue(), 0ull);
+  EXPECT_EQ(DbgVariableRecords[0]->getVariableLocationOp(0), F.getArg(0));
+  Value *SecondDVRValue = DbgVariableRecords[1]->getVariableLocationOp(0);
+  ASSERT_TRUE(isa<ConstantInt>(SecondDVRValue));
+  EXPECT_EQ(cast<ConstantInt>(SecondDVRValue)->getZExtValue(), 0ull);
 
-  // No trailing DPValues in the entry block now.
+  // No trailing DbgVariableRecords in the entry block now.
   EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
 
   UseNewDbgInfoFormat = false;
@@ -1444,31 +1449,31 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
   BasicBlock &Exit = *Entry.getNextNode();
   M->convertToNewDbgValues();
 
-  // Begin by forcing entry block to have dangling DPValue.
+  // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
   ASSERT_NE(Entry.getTrailingDbgRecords(), nullptr);
   EXPECT_TRUE(Entry.empty());
 
   // Now transfer into the entry block -- fetching the first instruction with
   // begin and then calling getIterator clears the "head" bit, meaning that the
-  // range to move will not include any leading DPValues.
+  // range to move will not include any leading DbgVariableRecords.
   Entry.splice(Entry.end(), &Exit, Exit.begin()->getIterator(), Exit.end());
 
   // We should now have one dbg.values on the first instruction, %a.
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->hasDbgRecords());
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 1u);
-  SmallVector<DPValue *, 2> DPValues;
-  for (DbgRecord &DPV : BInst->getDbgRecordRange())
-    DPValues.push_back(cast<DPValue>(&DPV));
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
+  for (DbgRecord &DVR : BInst->getDbgRecordRange())
+    DbgVariableRecords.push_back(cast<DbgVariableRecord>(&DVR));
 
-  EXPECT_EQ(DPValues[0]->getVariableLocationOp(0), F.getArg(0));
-  // No trailing DPValues in the entry block now.
+  EXPECT_EQ(DbgVariableRecords[0]->getVariableLocationOp(0), F.getArg(0));
+  // No trailing DbgVariableRecords in the entry block now.
   EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
 
   // We should have nothing left in the exit block...
   EXPECT_TRUE(Exit.empty());
-  // ... except for some dangling DPValues.
+  // ... except for some dangling DbgVariableRecords.
   EXPECT_NE(Exit.getTrailingDbgRecords(), nullptr);
   EXPECT_FALSE(Exit.getTrailingDbgRecords()->empty());
   Exit.getTrailingDbgRecords()->eraseFromParent();
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index 0b019c26148b5..3672f2bccfecd 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -234,8 +234,8 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
   EXPECT_TRUE(DbgDeclare->isKillLocation());
 }
 
-// Duplicate of above test, but in DPValue representation.
-TEST(MetadataTest, DeleteInstUsedByDPValue) {
+// Duplicate of above test, but in DbgVariableRecord representation.
+TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -267,26 +267,26 @@ TEST(MetadataTest, DeleteInstUsedByDPValue) {
   Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
   M->convertToNewDbgValues();
 
-  // Find the DPValues using %b.
+  // Find the DbgVariableRecords using %b.
   SmallVector<DbgValueInst *, 2> DVIs;
-  SmallVector<DPValue *, 2> DPVs;
-  findDbgValues(DVIs, &I, &DPVs);
-  ASSERT_EQ(DPVs.size(), 2u);
+  SmallVector<DbgVariableRecord *, 2> DVRs;
+  findDbgValues(DVIs, &I, &DVRs);
+  ASSERT_EQ(DVRs.size(), 2u);
 
-  // Delete %b. The DPValue should now point to undef.
+  // Delete %b. The DbgVariableRecord should now point to undef.
   I.eraseFromParent();
-  EXPECT_EQ(DPVs[0]->getNumVariableLocationOps(), 1u);
-  EXPECT_TRUE(isa<UndefValue>(DPVs[0]->getVariableLocationOp(0)));
-  EXPECT_TRUE(DPVs[0]->isKillLocation());
-  EXPECT_EQ(DPVs[1]->getNumVariableLocationOps(), 2u);
-  EXPECT_TRUE(isa<UndefValue>(DPVs[1]->getVariableLocationOp(1)));
-  EXPECT_TRUE(DPVs[1]->isKillLocation());
+  EXPECT_EQ(DVRs[0]->getNumVariableLocationOps(), 1u);
+  EXPECT_TRUE(isa<UndefValue>(DVRs[0]->getVariableLocationOp(0)));
+  EXPECT_TRUE(DVRs[0]->isKillLocation());
+  EXPECT_EQ(DVRs[1]->getNumVariableLocationOps(), 2u);
+  EXPECT_TRUE(isa<UndefValue>(DVRs[1]->getVariableLocationOp(1)));
+  EXPECT_TRUE(DVRs[1]->isKillLocation());
   UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 // Ensure that the order of dbg.value intrinsics returned by findDbgValues, and
-// their corresponding DPValue representation, are consistent.
-TEST(MetadataTest, OrderingOfDPValues) {
+// their corresponding DbgVariableRecord representation, are consistent.
+TEST(MetadataTest, OrderingOfDbgVariableRecords) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -319,10 +319,10 @@ TEST(MetadataTest, OrderingOfDPValues) {
   Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
 
   SmallVector<DbgValueInst *, 2> DVIs;
-  SmallVector<DPValue *, 2> DPVs;
-  findDbgValues(DVIs, &I, &DPVs);
+  SmallVector<DbgVariableRecord *, 2> DVRs;
+  findDbgValues(DVIs, &I, &DVRs);
   ASSERT_EQ(DVIs.size(), 2u);
-  ASSERT_EQ(DPVs.size(), 0u);
+  ASSERT_EQ(DVRs.size(), 0u);
 
   // The correct order of dbg.values is given by their use-list, which becomes
   // the reverse order of creation. Thus the dbg.values should come out as
@@ -332,17 +332,17 @@ TEST(MetadataTest, OrderingOfDPValues) {
   DILocalVariable *Var1 = DVIs[1]->getVariable();
   EXPECT_TRUE(Var1->getName() == "foo");
 
-  // Now try again, but in DPValue form.
+  // Now try again, but in DbgVariableRecord form.
   DVIs.clear();
 
   M->convertToNewDbgValues();
-  findDbgValues(DVIs, &I, &DPVs);
+  findDbgValues(DVIs, &I, &DVRs);
   ASSERT_EQ(DVIs.size(), 0u);
-  ASSERT_EQ(DPVs.size(), 2u);
+  ASSERT_EQ(DVRs.size(), 2u);
 
-  Var0 = DPVs[0]->getVariable();
+  Var0 = DVRs[0]->getVariable();
   EXPECT_TRUE(Var0->getName() == "bar");
-  Var1 = DPVs[1]->getVariable();
+  Var1 = DVRs[1]->getVariable();
   EXPECT_TRUE(Var1->getName() == "foo");
 
   M->convertFromNewDbgValues();
@@ -861,9 +861,9 @@ TEST(AssignmentTrackingTest, InstrMethods) {
   }
 }
 
-// Test some very straight-forward operations on DPValues -- these are
+// Test some very straight-forward operations on DbgVariableRecords -- these are
 // dbg.values that have been converted to a non-instruction format.
-TEST(MetadataTest, ConvertDbgToDPValue) {
+TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -900,7 +900,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
   const DIExpression *Expr = nullptr;
   const DILocation *Loc = nullptr;
   const Metadata *MLoc = nullptr;
-  DPValue *DPV1 = nullptr;
+  DbgVariableRecord *DVR1 = nullptr;
   {
     DbgValueInst *DPI = dyn_cast<DbgValueInst>(&I);
     ASSERT_TRUE(DPI);
@@ -909,17 +909,18 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
     Loc = DPI->getDebugLoc().get();
     MLoc = DPI->getRawLocation();
 
-    // Test the creation of a DPValue and it's conversion back to a dbg.value.
-    DPV1 = new DPValue(DPI);
-    EXPECT_EQ(DPV1->getVariable(), Var);
-    EXPECT_EQ(DPV1->getExpression(), Expr);
-    EXPECT_EQ(DPV1->getDebugLoc().get(), Loc);
-    EXPECT_EQ(DPV1->getRawLocation(), MLoc);
+    // Test the creation of a DbgVariableRecord and it's conversion back to a
+    // dbg.value.
+    DVR1 = new DbgVariableRecord(DPI);
+    EXPECT_EQ(DVR1->getVariable(), Var);
+    EXPECT_EQ(DVR1->getExpression(), Expr);
+    EXPECT_EQ(DVR1->getDebugLoc().get(), Loc);
+    EXPECT_EQ(DVR1->getRawLocation(), MLoc);
 
     // Erase dbg.value,
     DPI->eraseFromParent();
-    // Re-create from DPV1, inserting at front.
-    DPV1->createDebugIntrinsic(&*M,
+    // Re-create from DVR1, inserting at front.
+    DVR1->createDebugIntrinsic(&*M,
                                &M->getFunction("f")->getEntryBlock().front());
 
     Instruction *NewDPI = &M->getFunction("f")->getEntryBlock().front();
@@ -931,17 +932,17 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
     EXPECT_EQ(DPI2->getRawLocation(), MLoc);
   }
 
-  // Fetch the second dbg.value, convert it to a DPValue,
+  // Fetch the second dbg.value, convert it to a DbgVariableRecord,
   BasicBlock::iterator It = M->getFunction("f")->getEntryBlock().begin();
   It = std::next(std::next(It));
   DbgValueInst *DPI3 = dyn_cast<DbgValueInst>(It);
   ASSERT_TRUE(DPI3);
-  DPValue *DPV2 = new DPValue(DPI3);
+  DbgVariableRecord *DVR2 = new DbgVariableRecord(DPI3);
 
   // These dbg.values are supposed to refer to different values.
-  EXPECT_NE(DPV1->getRawLocation(), DPV2->getRawLocation());
+  EXPECT_NE(DVR1->getRawLocation(), DVR2->getRawLocation());
 
-  // Try manipulating DPValues and markers in the exit block.
+  // Try manipulating DbgVariableRecords and markers in the exit block.
   BasicBlock *ExitBlock = &*std::next(M->getFunction("f")->getEntryBlock().getIterator());
   Instruction *FirstInst = &ExitBlock->front();
   Instruction *RetInst = &*std::next(FirstInst->getIterator());
@@ -951,67 +952,69 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
   ExitBlock->createMarker(FirstInst);
   ExitBlock->createMarker(RetInst);
 
-  // Insert DPValues into markers, order should come out DPV2, DPV1.
-  FirstInst->DbgMarker->insertDbgRecord(DPV1, false);
-  FirstInst->DbgMarker->insertDbgRecord(DPV2, true);
+  // Insert DbgRecords into markers, order should come out DVR2, DVR1.
+  FirstInst->DbgMarker->insertDbgRecord(DVR1, false);
+  FirstInst->DbgMarker->insertDbgRecord(DVR2, true);
   unsigned int ItCount = 0;
   for (DbgRecord &Item : FirstInst->DbgMarker->getDbgRecordRange()) {
-    EXPECT_TRUE((&Item == DPV2 && ItCount == 0) ||
-              (&Item == DPV1 && ItCount == 1));
+    EXPECT_TRUE((&Item == DVR2 && ItCount == 0) ||
+                (&Item == DVR1 && ItCount == 1));
     EXPECT_EQ(Item.getMarker(), FirstInst->DbgMarker);
     ++ItCount;
   }
 
-  // Clone them onto the second marker -- should allocate new DPVs.
+  // Clone them onto the second marker -- should allocate new DVRs.
   RetInst->DbgMarker->cloneDebugInfoFrom(FirstInst->DbgMarker, std::nullopt, false);
-  EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 2u);
   ItCount = 0;
   // Check these things store the same information; but that they're not the same
   // objects.
-  for (DPValue &Item :
-       DPValue::filter(RetInst->DbgMarker->getDbgRecordRange())) {
-    EXPECT_TRUE((Item.getRawLocation() == DPV2->getRawLocation() && ItCount == 0) ||
-                (Item.getRawLocation() == DPV1->getRawLocation() && ItCount == 1));
+  for (DbgVariableRecord &Item :
+       filterDbgVars(RetInst->DbgMarker->getDbgRecordRange())) {
+    EXPECT_TRUE(
+        (Item.getRawLocation() == DVR2->getRawLocation() && ItCount == 0) ||
+        (Item.getRawLocation() == DVR1->getRawLocation() && ItCount == 1));
 
     EXPECT_EQ(Item.getMarker(), RetInst->DbgMarker);
-    EXPECT_NE(&Item, DPV1);
-    EXPECT_NE(&Item, DPV2);
+    EXPECT_NE(&Item, DVR1);
+    EXPECT_NE(&Item, DVR2);
     ++ItCount;
   }
 
   RetInst->DbgMarker->dropDbgRecords();
-  EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 0u);
+  EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 0u);
 
-  // Try cloning one single DPValue.
+  // Try cloning one single DbgVariableRecord.
   auto DIIt = std::next(FirstInst->DbgMarker->getDbgRecordRange().begin());
   RetInst->DbgMarker->cloneDebugInfoFrom(FirstInst->DbgMarker, DIIt, false);
-  EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 1u);
-  // The second DPValue should have been cloned; it should have the same values
-  // as DPV1.
-  EXPECT_EQ(cast<DPValue>(RetInst->DbgMarker->StoredDPValues.begin())
-                ->getRawLocation(),
-            DPV1->getRawLocation());
-  // We should be able to drop individual DPValues.
+  EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 1u);
+  // The second DbgVariableRecord should have been cloned; it should have the
+  // same values as DVR1.
+  EXPECT_EQ(
+      cast<DbgVariableRecord>(RetInst->DbgMarker->StoredDbgRecords.begin())
+          ->getRawLocation(),
+      DVR1->getRawLocation());
+  // We should be able to drop individual DbgRecords.
   RetInst->DbgMarker->dropOneDbgRecord(
-      &*RetInst->DbgMarker->StoredDPValues.begin());
+      &*RetInst->DbgMarker->StoredDbgRecords.begin());
 
   // "Aborb" a DPMarker: this means pretend that the instruction it's attached
   // to is disappearing so it needs to be transferred into "this" marker.
   RetInst->DbgMarker->absorbDebugValues(*FirstInst->DbgMarker, true);
-  EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 2u);
-  // Should be the DPV1 and DPV2 objects.
+  EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 2u);
+  // Should be the DVR1 and DVR2 objects.
   ItCount = 0;
   for (DbgRecord &Item : RetInst->DbgMarker->getDbgRecordRange()) {
-    EXPECT_TRUE((&Item == DPV2 && ItCount == 0) ||
-              (&Item == DPV1 && ItCount == 1));
+    EXPECT_TRUE((&Item == DVR2 && ItCount == 0) ||
+                (&Item == DVR1 && ItCount == 1));
     EXPECT_EQ(Item.getMarker(), RetInst->DbgMarker);
     ++ItCount;
   }
 
-  // Finally -- there are two DPValues left over. If we remove evrything in the
-  // basic block, then they should sink down into the "TrailingDPValues"
-  // container for dangling debug-info. Future facilities will restore them
-  // back when a terminator is inserted.
+  // Finally -- there are two DbgVariableRecords left over. If we remove
+  // evrything in the basic block, then they should sink down into the
+  // "TrailingDbgRecords" container for dangling debug-info. Future facilities
+  // will restore them back when a terminator is inserted.
   FirstInst->DbgMarker->removeMarker();
   FirstInst->eraseFromParent();
   RetInst->DbgMarker->removeMarker();
@@ -1019,25 +1022,25 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
 
   DPMarker *EndMarker = ExitBlock->getTrailingDbgRecords();
   ASSERT_NE(EndMarker, nullptr);
-  EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
-  // Test again that it's those two DPValues, DPV1 and DPV2.
+  EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
+  // Test again that it's those two DbgVariableRecords, DVR1 and DVR2.
   ItCount = 0;
   for (DbgRecord &Item : EndMarker->getDbgRecordRange()) {
-    EXPECT_TRUE((&Item == DPV2 && ItCount == 0) ||
-              (&Item == DPV1 && ItCount == 1));
+    EXPECT_TRUE((&Item == DVR2 && ItCount == 0) ||
+                (&Item == DVR1 && ItCount == 1));
     EXPECT_EQ(Item.getMarker(), EndMarker);
     ++ItCount;
   }
 
-  // Cleanup the trailing DPValue records and marker.
+  // Cleanup the trailing DbgVariableRecord records and marker.
   EndMarker->eraseFromParent();
 
-  // The record of those trailing DPValues would dangle and cause an assertion
-  // failure if it lived until the end of the LLVMContext.
+  // The record of those trailing DbgVariableRecords would dangle and cause an
+  // assertion failure if it lived until the end of the LLVMContext.
   ExitBlock->deleteTrailingDbgRecords();
 }
 
-TEST(MetadataTest, DPValueConversionRoutines) {
+TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
   LLVMContext C;
 
   // For the purpose of this test, set and un-set the command line option
@@ -1074,14 +1077,14 @@ TEST(MetadataTest, DPValueConversionRoutines) {
 )");
 
   // Check that the conversion routines and utilities between dbg.value
-  // debug-info format and DPValues works.
+  // debug-info format and DbgVariableRecords works.
   Function *F = M->getFunction("f");
   BasicBlock *BB1 = &F->getEntryBlock();
   // First instruction should be a dbg.value.
   EXPECT_TRUE(isa<DbgValueInst>(BB1->front()));
   EXPECT_FALSE(BB1->IsNewDbgInfoFormat);
-  // Validating the block for DPValues / DPMarkers shouldn't fail -- there's
-  // no data stored right now.
+  // Validating the block for DbgVariableRecords / DPMarkers shouldn't fail --
+  // there's no data stored right now.
   bool BrokenDebugInfo = false;
   bool Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
   EXPECT_FALSE(Error);
@@ -1105,7 +1108,7 @@ TEST(MetadataTest, DPValueConversionRoutines) {
       EXPECT_FALSE(isa<DbgValueInst>(I));
 
   // There should be a DPMarker on each of the two instructions in the entry
-  // block, each containing one DPValue.
+  // block, each containing one DbgVariableRecord.
   EXPECT_EQ(BB1->size(), 2u);
   Instruction *FirstInst = &BB1->front();
   Instruction *SecondInst = FirstInst->getNextNode();
@@ -1115,27 +1118,27 @@ TEST(MetadataTest, DPValueConversionRoutines) {
   EXPECT_EQ(FirstInst, FirstInst->DbgMarker->MarkedInstr);
   EXPECT_EQ(SecondInst, SecondInst->DbgMarker->MarkedInstr);
 
-  EXPECT_EQ(FirstInst->DbgMarker->StoredDPValues.size(), 1u);
-  DPValue *DPV1 =
-      cast<DPValue>(&*FirstInst->DbgMarker->getDbgRecordRange().begin());
-  EXPECT_EQ(DPV1->getMarker(), FirstInst->DbgMarker);
+  EXPECT_EQ(FirstInst->DbgMarker->StoredDbgRecords.size(), 1u);
+  DbgVariableRecord *DVR1 = cast<DbgVariableRecord>(
+      &*FirstInst->DbgMarker->getDbgRecordRange().begin());
+  EXPECT_EQ(DVR1->getMarker(), FirstInst->DbgMarker);
   // Should point at %a, an argument.
-  EXPECT_TRUE(isa<Argument>(DPV1->getVariableLocationOp(0)));
+  EXPECT_TRUE(isa<Argument>(DVR1->getVariableLocationOp(0)));
 
-  EXPECT_EQ(SecondInst->DbgMarker->StoredDPValues.size(), 1u);
-  DPValue *DPV2 =
-      cast<DPValue>(&*SecondInst->DbgMarker->getDbgRecordRange().begin());
-  EXPECT_EQ(DPV2->getMarker(), SecondInst->DbgMarker);
+  EXPECT_EQ(SecondInst->DbgMarker->StoredDbgRecords.size(), 1u);
+  DbgVariableRecord *DVR2 = cast<DbgVariableRecord>(
+      &*SecondInst->DbgMarker->getDbgRecordRange().begin());
+  EXPECT_EQ(DVR2->getMarker(), SecondInst->DbgMarker);
   // Should point at FirstInst.
-  EXPECT_EQ(DPV2->getVariableLocationOp(0), FirstInst);
+  EXPECT_EQ(DVR2->getVariableLocationOp(0), FirstInst);
 
-  // There should be no DPValues / DPMarkers in the second block, but it should
-  // be marked as being in the new format.
+  // There should be no DbgVariableRecords / DPMarkers in the second block, but
+  // it should be marked as being in the new format.
   BasicBlock *BB2 = BB1->getNextNode();
   EXPECT_TRUE(BB2->IsNewDbgInfoFormat);
   for (auto &Inst : *BB2)
     // Either there should be no marker, or it should be empty.
-    EXPECT_TRUE(!Inst.DbgMarker || Inst.DbgMarker->StoredDPValues.empty());
+    EXPECT_TRUE(!Inst.DbgMarker || Inst.DbgMarker->StoredDbgRecords.empty());
 
   // Validating the first block should continue to not be a problem,
   Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
@@ -1144,17 +1147,17 @@ TEST(MetadataTest, DPValueConversionRoutines) {
   // But if we were to break something, it should be able to fire. Don't attempt
   // to comprehensively test the validator, it's a smoke-test rather than a
   // "proper" verification pass.
-  DPV1->setMarker(nullptr);
+  DVR1->setMarker(nullptr);
   // A marker pointing the wrong way should be an error.
   Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
   EXPECT_FALSE(Error);
   EXPECT_TRUE(BrokenDebugInfo);
-  DPV1->setMarker(FirstInst->DbgMarker);
+  DVR1->setMarker(FirstInst->DbgMarker);
 
-  DILocalVariable *DLV1 = DPV1->getVariable();
-  DIExpression *Expr1 = DPV1->getExpression();
-  DILocalVariable *DLV2 = DPV2->getVariable();
-  DIExpression *Expr2 = DPV2->getExpression();
+  DILocalVariable *DLV1 = DVR1->getVariable();
+  DIExpression *Expr1 = DVR1->getExpression();
+  DILocalVariable *DLV2 = DVR2->getVariable();
+  DIExpression *Expr2 = DVR2->getExpression();
 
   // Convert everything back to the "old" format and ensure it's right.
   M->convertFromNewDbgValues();
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 139e8832c97b9..ec30598212304 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -57,7 +57,7 @@ TEST_F(IRBuilderTest, Intrinsics) {
   IRBuilder<> Builder(BB);
   Value *V;
   Instruction *I;
-  CallInst *Call;
+  Value *Result;
   IntrinsicInst *II;
 
   V = Builder.CreateLoad(GV->getValueType(), GV);
@@ -65,78 +65,80 @@ TEST_F(IRBuilderTest, Intrinsics) {
   I->setHasNoInfs(true);
   I->setHasNoNaNs(false);
 
-  Call = Builder.CreateMinNum(V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateMinNum(V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::minnum);
 
-  Call = Builder.CreateMaxNum(V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateMaxNum(V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maxnum);
 
-  Call = Builder.CreateMinimum(V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateMinimum(V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::minimum);
 
-  Call = Builder.CreateMaximum(V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateMaximum(V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maximum);
 
-  Call = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, {}, {});
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, {}, {});
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::readcyclecounter);
 
-  Call = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fabs);
   EXPECT_FALSE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, V, I);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, V, I);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fabs);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateBinaryIntrinsic(Intrinsic::pow, V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateBinaryIntrinsic(Intrinsic::pow, V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::pow);
   EXPECT_FALSE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateBinaryIntrinsic(Intrinsic::pow, V, V, I);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateBinaryIntrinsic(Intrinsic::pow, V, V, I);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::pow);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V});
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V});
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_FALSE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
-  II = cast<IntrinsicInst>(Call);
+  Result =
+      Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
-  II = cast<IntrinsicInst>(Call);
+  Result =
+      Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateUnaryIntrinsic(Intrinsic::roundeven, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateUnaryIntrinsic(Intrinsic::roundeven, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::roundeven);
   EXPECT_FALSE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateIntrinsic(
+  Result = Builder.CreateIntrinsic(
       Intrinsic::set_rounding, {},
       {Builder.getInt32(static_cast<uint32_t>(RoundingMode::TowardZero))});
-  II = cast<IntrinsicInst>(Call);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::set_rounding);
 }
 
@@ -942,7 +944,7 @@ TEST_F(IRBuilderTest, DIBuilder) {
         DIB.createAutoVariable(BarSP, "X", File, 2, IntType, true);
     DILocalVariable *VarY =
         DIB.createAutoVariable(BarSP, "Y", File, 2, IntType, true);
-    { /* dbg.value | DPValue::Value */
+    { /* dbg.value | DbgVariableRecord::Value */
       ExpectOrder(DIB.insertDbgValueIntrinsic(I, VarX, DIB.createExpression(),
                                               VarLoc, I),
                   I->getIterator());
@@ -953,7 +955,7 @@ TEST_F(IRBuilderTest, DIBuilder) {
       ExpectOrder(VarXValue, I->getIterator());
       EXPECT_EQ(BB->getTrailingDbgRecords(), nullptr);
     }
-    { /* dbg.declare | DPValue::Declare */
+    { /* dbg.declare | DbgVariableRecord::Declare */
       ExpectOrder(DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc, I),
                   I->getIterator());
       // Check inserting at end of the block works as with labels.
@@ -963,7 +965,7 @@ TEST_F(IRBuilderTest, DIBuilder) {
       ExpectOrder(VarYDeclare, I->getIterator());
       EXPECT_EQ(BB->getTrailingDbgRecords(), nullptr);
     }
-    { /* dbg.assign | DPValue::Assign */
+    { /* dbg.assign | DbgVariableRecord::Assign */
       I = Builder.CreateAlloca(Builder.getInt32Ty());
       I->setMetadata(LLVMContext::MD_DIAssignID, DIAssignID::getDistinct(Ctx));
       // DbgAssign interface is slightly different - it always inserts after the
diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp
index 767dd1a59d2b9..4c2e5f77a5403 100644
--- a/llvm/unittests/IR/MetadataTest.cpp
+++ b/llvm/unittests/IR/MetadataTest.cpp
@@ -106,7 +106,7 @@ class MetadataTest : public testing::Test {
   DIType *getDerivedType() {
     return DIDerivedType::getDistinct(
         Context, dwarf::DW_TAG_pointer_type, "", nullptr, 0, nullptr,
-        getBasicType("basictype"), 1, 2, 0, std::nullopt, DINode::FlagZero);
+        getBasicType("basictype"), 1, 2, 0, std::nullopt, {}, DINode::FlagZero);
   }
   Constant *getConstant() {
     return ConstantInt::get(Type::getInt32Ty(Context), Counter++);
@@ -461,7 +461,7 @@ TEST_F(MDNodeTest, PrintTree) {
     auto *StructTy = cast<DICompositeType>(getCompositeType());
     DIType *PointerTy = DIDerivedType::getDistinct(
         Context, dwarf::DW_TAG_pointer_type, "", nullptr, 0, nullptr, StructTy,
-        1, 2, 0, std::nullopt, DINode::FlagZero);
+        1, 2, 0, std::nullopt, {}, DINode::FlagZero);
     StructTy->replaceElements(MDTuple::get(Context, PointerTy));
 
     auto *Var = DILocalVariable::get(Context, Scope, "foo", File,
@@ -1864,13 +1864,17 @@ TEST_F(DIDerivedTypeTest, get) {
   DIType *BaseType = getBasicType("basic");
   MDTuple *ExtraData = getTuple();
   unsigned DWARFAddressSpace = 8;
+  DIDerivedType::PtrAuthData PtrAuthData(1, false, 1234, true, true);
+  DIDerivedType::PtrAuthData PtrAuthData2(1, false, 1234, true, false);
   DINode::DIFlags Flags5 = static_cast<DINode::DIFlags>(5);
   DINode::DIFlags Flags4 = static_cast<DINode::DIFlags>(4);
 
-  auto *N =
-      DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something", File,
-                         1, Scope, BaseType, 2, 3, 4, DWARFAddressSpace, Flags5,
-                         ExtraData);
+  auto *N = DIDerivedType::get(
+      Context, dwarf::DW_TAG_pointer_type, "something", File, 1, Scope,
+      BaseType, 2, 3, 4, DWARFAddressSpace, std::nullopt, Flags5, ExtraData);
+  auto *N1 = DIDerivedType::get(Context, dwarf::DW_TAG_LLVM_ptrauth_type, "",
+                                File, 1, Scope, N, 2, 3, 4, DWARFAddressSpace,
+                                PtrAuthData, Flags5, ExtraData);
   EXPECT_EQ(dwarf::DW_TAG_pointer_type, N->getTag());
   EXPECT_EQ("something", N->getName());
   EXPECT_EQ(File, N->getFile());
@@ -1881,53 +1885,73 @@ TEST_F(DIDerivedTypeTest, get) {
   EXPECT_EQ(3u, N->getAlignInBits());
   EXPECT_EQ(4u, N->getOffsetInBits());
   EXPECT_EQ(DWARFAddressSpace, *N->getDWARFAddressSpace());
+  EXPECT_EQ(std::nullopt, N->getPtrAuthData());
+  EXPECT_EQ(PtrAuthData, N1->getPtrAuthData());
+  EXPECT_NE(PtrAuthData2, N1->getPtrAuthData());
   EXPECT_EQ(5u, N->getFlags());
   EXPECT_EQ(ExtraData, N->getExtraData());
   EXPECT_EQ(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
 
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_reference_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "else",
-                                  File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  File, 1, Scope, BaseType, 2, 3, 4,
+                                  DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", getFile(), 1, Scope, BaseType, 2,
-                                  3, 4, DWARFAddressSpace, Flags5, ExtraData));
+                                  3, 4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 2, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, getSubprogram(),
-                                  BaseType, 2, 3, 4, DWARFAddressSpace, Flags5,
-                                  ExtraData));
+                                  BaseType, 2, 3, 4, DWARFAddressSpace,
+                                  std::nullopt, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(
                    Context, dwarf::DW_TAG_pointer_type, "something", File, 1,
                    Scope, getBasicType("basic2"), 2, 3, 4, DWARFAddressSpace,
-                   Flags5, ExtraData));
+                   std::nullopt, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 3, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 2,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  5, DWARFAddressSpace, Flags5, ExtraData));
+                                  5, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace + 1, Flags5, ExtraData));
+                                  4, DWARFAddressSpace + 1, std::nullopt,
+                                  Flags5, ExtraData));
+  EXPECT_NE(N1,
+            DIDerivedType::get(Context, dwarf::DW_TAG_LLVM_ptrauth_type, "",
+                               File, 1, Scope, N, 2, 3, 4, DWARFAddressSpace,
+                               std::nullopt, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags4, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags4,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, getTuple()));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  getTuple()));
 
   TempDIDerivedType Temp = N->clone();
   EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+  TempDIDerivedType Temp1 = N1->clone();
+  EXPECT_EQ(N1, MDNode::replaceWithUniqued(std::move(Temp1)));
 }
 
 TEST_F(DIDerivedTypeTest, getWithLargeValues) {
@@ -1937,14 +1961,23 @@ TEST_F(DIDerivedTypeTest, getWithLargeValues) {
   MDTuple *ExtraData = getTuple();
   DINode::DIFlags Flags = static_cast<DINode::DIFlags>(5);
 
-  auto *N = DIDerivedType::get(
-      Context, dwarf::DW_TAG_pointer_type, "something", File, 1, Scope,
-      BaseType, UINT64_MAX, UINT32_MAX - 1, UINT64_MAX - 2, UINT32_MAX - 3,
-      Flags, ExtraData);
+  auto *N = DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something",
+                               File, 1, Scope, BaseType, UINT64_MAX,
+                               UINT32_MAX - 1, UINT64_MAX - 2, UINT32_MAX - 3,
+                               std::nullopt, Flags, ExtraData);
   EXPECT_EQ(UINT64_MAX, N->getSizeInBits());
   EXPECT_EQ(UINT32_MAX - 1, N->getAlignInBits());
   EXPECT_EQ(UINT64_MAX - 2, N->getOffsetInBits());
   EXPECT_EQ(UINT32_MAX - 3, *N->getDWARFAddressSpace());
+
+  auto *N1 = DIDerivedType::get(
+      Context, dwarf::DW_TAG_LLVM_ptrauth_type, "", File, 1, Scope, N,
+      UINT64_MAX, UINT32_MAX - 1, UINT64_MAX - 2, UINT32_MAX - 3,
+      DIDerivedType::PtrAuthData(7, true, 0xffff, true, false), Flags,
+      ExtraData);
+  EXPECT_EQ(7U, N1->getPtrAuthData()->key());
+  EXPECT_EQ(true, N1->getPtrAuthData()->isAddressDiscriminated());
+  EXPECT_EQ(0xffffU, N1->getPtrAuthData()->extraDiscriminator());
 }
 
 typedef MetadataTest DICompositeTypeTest;
@@ -3560,6 +3593,27 @@ TEST_F(DIExpressionTest, foldConstant) {
 #undef EXPECT_FOLD_CONST
 }
 
+TEST_F(DIExpressionTest, appendToStackAssert) {
+  DIExpression *Expr = DIExpression::get(Context, {});
+
+  // Verify that the DW_OP_LLVM_convert operands, which have the same values as
+  // DW_OP_stack_value and DW_OP_LLVM_fragment, do not get interpreted as such
+  // operations. This previously triggered an assert.
+  uint64_t FromSize = dwarf::DW_OP_stack_value;
+  uint64_t ToSize = dwarf::DW_OP_LLVM_fragment;
+  uint64_t Ops[] = {
+      dwarf::DW_OP_LLVM_convert, FromSize, dwarf::DW_ATE_signed,
+      dwarf::DW_OP_LLVM_convert, ToSize,   dwarf::DW_ATE_signed,
+  };
+  Expr = DIExpression::appendToStack(Expr, Ops);
+
+  uint64_t Expected[] = {
+      dwarf::DW_OP_LLVM_convert, FromSize, dwarf::DW_ATE_signed,
+      dwarf::DW_OP_LLVM_convert, ToSize,   dwarf::DW_ATE_signed,
+      dwarf::DW_OP_stack_value};
+  EXPECT_EQ(Expr->getElements(), ArrayRef<uint64_t>(Expected));
+}
+
 typedef MetadataTest DIObjCPropertyTest;
 
 TEST_F(DIObjCPropertyTest, get) {
@@ -4268,7 +4322,7 @@ TEST_F(MDTupleAllocationTest, Tracking2) {
 #if defined(GTEST_HAS_DEATH_TEST) && !defined(NDEBUG) && !defined(GTEST_HAS_SEH)
 typedef MetadataTest MDTupleAllocationDeathTest;
 TEST_F(MDTupleAllocationDeathTest, ResizeRejected) {
-  MDTuple *A = MDTuple::get(Context, None);
+  MDTuple *A = MDTuple::get(Context, std::nullopt);
   auto *Value1 = getConstantAsMetadata();
   EXPECT_DEATH(A->push_back(Value1),
                "Resizing is not supported for uniqued nodes");
diff --git a/llvm/unittests/IR/ValueTest.cpp b/llvm/unittests/IR/ValueTest.cpp
index 22d6764c57514..f9da29390d2dd 100644
--- a/llvm/unittests/IR/ValueTest.cpp
+++ b/llvm/unittests/IR/ValueTest.cpp
@@ -319,9 +319,9 @@ TEST(ValueTest, replaceUsesOutsideBlock) {
   ASSERT_TRUE(Ret->getOperand(0) == cast<Value>(B));
 }
 
-TEST(ValueTest, replaceUsesOutsideBlockDPValue) {
+TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
   // Check that Value::replaceUsesOutsideBlock(New, BB) replaces uses outside
-  // BB, including DPValues.
+  // BB, including DbgVariableRecords.
   const auto *IR = R"(
     define i32 @f() !dbg !6 {
     entry:
@@ -381,14 +381,16 @@ TEST(ValueTest, replaceUsesOutsideBlockDPValue) {
   EXPECT_TRUE(Branch->hasDbgRecords());
   EXPECT_TRUE(Ret->hasDbgRecords());
 
-  DPValue *DPV1 = cast<DPValue>(&*Branch->getDbgRecordRange().begin());
-  DPValue *DPV2 = cast<DPValue>(&*Ret->getDbgRecordRange().begin());
+  DbgVariableRecord *DVR1 =
+      cast<DbgVariableRecord>(&*Branch->getDbgRecordRange().begin());
+  DbgVariableRecord *DVR2 =
+      cast<DbgVariableRecord>(&*Ret->getDbgRecordRange().begin());
 
   A->replaceUsesOutsideBlock(B, Entry);
   // These users are in Entry so shouldn't be changed.
-  EXPECT_TRUE(DPV1->getVariableLocationOp(0) == cast<Value>(A));
+  EXPECT_TRUE(DVR1->getVariableLocationOp(0) == cast<Value>(A));
   // These users are outside Entry so should be changed.
-  EXPECT_TRUE(DPV2->getVariableLocationOp(0) == cast<Value>(B));
+  EXPECT_TRUE(DVR2->getVariableLocationOp(0) == cast<Value>(B));
   UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp
index d3177ce7e9837..48de0889fdf33 100644
--- a/llvm/unittests/Support/KnownBitsTest.cpp
+++ b/llvm/unittests/Support/KnownBitsTest.cpp
@@ -362,39 +362,14 @@ TEST(KnownBitsTest, BinaryExhaustive) {
         return Known1 ^ Known2;
       },
       [](const APInt &N1, const APInt &N2) { return N1 ^ N2; });
-
-  testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::umax(Known1, Known2);
-      },
-      [](const APInt &N1, const APInt &N2) { return APIntOps::umax(N1, N2); });
-  testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::umin(Known1, Known2);
-      },
-      [](const APInt &N1, const APInt &N2) { return APIntOps::umin(N1, N2); });
-  testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::smax(Known1, Known2);
-      },
-      [](const APInt &N1, const APInt &N2) { return APIntOps::smax(N1, N2); });
-  testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::smin(Known1, Known2);
-      },
-      [](const APInt &N1, const APInt &N2) { return APIntOps::smin(N1, N2); });
-  testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::abdu(Known1, Known2);
-      },
-      [](const APInt &N1, const APInt &N2) { return APIntOps::abdu(N1, N2); },
-      checkCorrectnessOnlyBinary);
-  testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::abds(Known1, Known2);
-      },
-      [](const APInt &N1, const APInt &N2) { return APIntOps::abds(N1, N2); },
-      checkCorrectnessOnlyBinary);
+  testBinaryOpExhaustive(KnownBits::umax, APIntOps::umax);
+  testBinaryOpExhaustive(KnownBits::umin, APIntOps::umin);
+  testBinaryOpExhaustive(KnownBits::smax, APIntOps::smax);
+  testBinaryOpExhaustive(KnownBits::smin, APIntOps::smin);
+  testBinaryOpExhaustive(KnownBits::abdu, APIntOps::abdu,
+                         checkCorrectnessOnlyBinary);
+  testBinaryOpExhaustive(KnownBits::abds, APIntOps::abds,
+                         checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
       [](const KnownBits &Known1, const KnownBits &Known2) {
         return KnownBits::udiv(Known1, Known2);
@@ -437,9 +412,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::urem(Known1, Known2);
-      },
+      KnownBits::urem,
       [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
         if (N2.isZero())
           return std::nullopt;
@@ -447,9 +420,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::srem(Known1, Known2);
-      },
+      KnownBits::srem,
       [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
         if (N2.isZero())
           return std::nullopt;
@@ -457,33 +428,25 @@ TEST(KnownBitsTest, BinaryExhaustive) {
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::sadd_sat(Known1, Known2);
-      },
+      KnownBits::sadd_sat,
       [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
         return N1.sadd_sat(N2);
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::uadd_sat(Known1, Known2);
-      },
+      KnownBits::uadd_sat,
       [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
         return N1.uadd_sat(N2);
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::ssub_sat(Known1, Known2);
-      },
+      KnownBits::ssub_sat,
       [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
         return N1.ssub_sat(N2);
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::usub_sat(Known1, Known2);
-      },
+      KnownBits::usub_sat,
       [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
         return N1.usub_sat(N2);
       },
@@ -582,7 +545,6 @@ TEST(KnownBitsTest, BinaryExhaustive) {
         return N1.ashr(N2);
       },
       checkOptimalityBinary, /* RefinePoisonToZero */ true);
-
   testBinaryOpExhaustive(
       [](const KnownBits &Known1, const KnownBits &Known2) {
         return KnownBits::mul(Known1, Known2);
@@ -590,18 +552,14 @@ TEST(KnownBitsTest, BinaryExhaustive) {
       [](const APInt &N1, const APInt &N2) { return N1 * N2; },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::mulhs(Known1, Known2);
-      },
+      KnownBits::mulhs,
       [](const APInt &N1, const APInt &N2) {
         unsigned Bits = N1.getBitWidth();
         return (N1.sext(2 * Bits) * N2.sext(2 * Bits)).extractBits(Bits, Bits);
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::mulhu(Known1, Known2);
-      },
+      KnownBits::mulhu,
       [](const APInt &N1, const APInt &N2) {
         unsigned Bits = N1.getBitWidth();
         return (N1.zext(2 * Bits) * N2.zext(2 * Bits)).extractBits(Bits, Bits);
diff --git a/llvm/unittests/Support/LEB128Test.cpp b/llvm/unittests/Support/LEB128Test.cpp
index 21523e5f7a08c..08b8c5573ce63 100644
--- a/llvm/unittests/Support/LEB128Test.cpp
+++ b/llvm/unittests/Support/LEB128Test.cpp
@@ -242,6 +242,21 @@ TEST(LEB128Test, DecodeInvalidSLEB128) {
 #undef EXPECT_INVALID_SLEB128
 }
 
+TEST(LEB128Test, DecodeAndInc) {
+#define EXPECT_LEB128(FUN, VALUE, SIZE)                                        \
+  do {                                                                         \
+    const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE), *P = V;       \
+    auto Expected = FUN(P), Actual = FUN##AndInc(P);                           \
+    EXPECT_EQ(Actual, Expected);                                               \
+    EXPECT_EQ(P - V, SIZE);                                                    \
+  } while (0)
+  EXPECT_LEB128(decodeULEB128, "\x7f", 1);
+  EXPECT_LEB128(decodeULEB128, "\x80\x01", 2);
+  EXPECT_LEB128(decodeSLEB128, "\x7f", 1);
+  EXPECT_LEB128(decodeSLEB128, "\x80\x01", 2);
+#undef EXPECT_LEB128
+}
+
 TEST(LEB128Test, SLEB128Size) {
   // Positive Value Testing Plan:
   // (1) 128 ^ n - 1 ........ need (n+1) bytes
diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp
index 82cf4c639b616..a331e6a74ceb6 100644
--- a/llvm/unittests/Support/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp
@@ -739,170 +739,173 @@ TEST(RiscvExtensionsHelp, CheckExtensions) {
   std::string ExpectedOutput =
 R"(All available -march extensions for RISC-V
 
-    Name                Version   Description
-    i                   2.1       This is a long dummy description
-    e                   2.0
-    m                   2.0
-    a                   2.1
-    f                   2.2
-    d                   2.2
-    c                   2.0
-    v                   1.0
-    h                   1.0
-    zic64b              1.0
-    zicbom              1.0
-    zicbop              1.0
-    zicboz              1.0
-    ziccamoa            1.0
-    ziccif              1.0
-    zicclsm             1.0
-    ziccrse             1.0
-    zicntr              2.0
-    zicond              1.0
-    zicsr               2.0
-    zifencei            2.0
-    zihintntl           1.0
-    zihintpause         2.0
-    zihpm               2.0
-    zmmul               1.0
-    za128rs             1.0
-    za64rs              1.0
-    zacas               1.0
-    zawrs               1.0
-    zfa                 1.0
-    zfh                 1.0
-    zfhmin              1.0
-    zfinx               1.0
-    zdinx               1.0
-    zca                 1.0
-    zcb                 1.0
-    zcd                 1.0
-    zce                 1.0
-    zcf                 1.0
-    zcmp                1.0
-    zcmt                1.0
-    zba                 1.0
-    zbb                 1.0
-    zbc                 1.0
-    zbkb                1.0
-    zbkc                1.0
-    zbkx                1.0
-    zbs                 1.0
-    zk                  1.0
-    zkn                 1.0
-    zknd                1.0
-    zkne                1.0
-    zknh                1.0
-    zkr                 1.0
-    zks                 1.0
-    zksed               1.0
-    zksh                1.0
-    zkt                 1.0
-    zvbb                1.0
-    zvbc                1.0
-    zve32f              1.0
-    zve32x              1.0
-    zve64d              1.0
-    zve64f              1.0
-    zve64x              1.0
-    zvfh                1.0
-    zvfhmin             1.0
-    zvkb                1.0
-    zvkg                1.0
-    zvkn                1.0
-    zvknc               1.0
-    zvkned              1.0
-    zvkng               1.0
-    zvknha              1.0
-    zvknhb              1.0
-    zvks                1.0
-    zvksc               1.0
-    zvksed              1.0
-    zvksg               1.0
-    zvksh               1.0
-    zvkt                1.0
-    zvl1024b            1.0
-    zvl128b             1.0
-    zvl16384b           1.0
-    zvl2048b            1.0
-    zvl256b             1.0
-    zvl32768b           1.0
-    zvl32b              1.0
-    zvl4096b            1.0
-    zvl512b             1.0
-    zvl64b              1.0
-    zvl65536b           1.0
-    zvl8192b            1.0
-    zhinx               1.0
-    zhinxmin            1.0
-    shcounterenw        1.0
-    shgatpa             1.0
-    shtvala             1.0
-    shvsatpa            1.0
-    shvstvala           1.0
-    shvstvecd           1.0
-    smaia               1.0
-    smepmp              1.0
-    ssaia               1.0
-    ssccptr             1.0
-    sscofpmf            1.0
-    sscounterenw        1.0
-    ssstateen           1.0
-    ssstrict            1.0
-    sstc                1.0
-    sstvala             1.0
-    sstvecd             1.0
-    ssu64xl             1.0
-    svade               1.0
-    svadu               1.0
-    svbare              1.0
-    svinval             1.0
-    svnapot             1.0
-    svpbmt              1.0
-    xcvalu              1.0
-    xcvbi               1.0
-    xcvbitmanip         1.0
-    xcvelw              1.0
-    xcvmac              1.0
-    xcvmem              1.0
-    xcvsimd             1.0
-    xsfvcp              1.0
-    xsfvfnrclipxfqf     1.0
-    xsfvfwmaccqqq       1.0
-    xsfvqmaccdod        1.0
-    xsfvqmaccqoq        1.0
-    xtheadba            1.0
-    xtheadbb            1.0
-    xtheadbs            1.0
-    xtheadcmo           1.0
-    xtheadcondmov       1.0
-    xtheadfmemidx       1.0
-    xtheadmac           1.0
-    xtheadmemidx        1.0
-    xtheadmempair       1.0
-    xtheadsync          1.0
-    xtheadvdot          1.0
-    xventanacondops     1.0
+    Name                 Version   Description
+    i                    2.1       This is a long dummy description
+    e                    2.0
+    m                    2.0
+    a                    2.1
+    f                    2.2
+    d                    2.2
+    c                    2.0
+    v                    1.0
+    h                    1.0
+    zic64b               1.0
+    zicbom               1.0
+    zicbop               1.0
+    zicboz               1.0
+    ziccamoa             1.0
+    ziccif               1.0
+    zicclsm              1.0
+    ziccrse              1.0
+    zicntr               2.0
+    zicond               1.0
+    zicsr                2.0
+    zifencei             2.0
+    zihintntl            1.0
+    zihintpause          2.0
+    zihpm                2.0
+    zmmul                1.0
+    za128rs              1.0
+    za64rs               1.0
+    zacas                1.0
+    zawrs                1.0
+    zfa                  1.0
+    zfh                  1.0
+    zfhmin               1.0
+    zfinx                1.0
+    zdinx                1.0
+    zca                  1.0
+    zcb                  1.0
+    zcd                  1.0
+    zce                  1.0
+    zcf                  1.0
+    zcmp                 1.0
+    zcmt                 1.0
+    zba                  1.0
+    zbb                  1.0
+    zbc                  1.0
+    zbkb                 1.0
+    zbkc                 1.0
+    zbkx                 1.0
+    zbs                  1.0
+    zk                   1.0
+    zkn                  1.0
+    zknd                 1.0
+    zkne                 1.0
+    zknh                 1.0
+    zkr                  1.0
+    zks                  1.0
+    zksed                1.0
+    zksh                 1.0
+    zkt                  1.0
+    zvbb                 1.0
+    zvbc                 1.0
+    zve32f               1.0
+    zve32x               1.0
+    zve64d               1.0
+    zve64f               1.0
+    zve64x               1.0
+    zvfh                 1.0
+    zvfhmin              1.0
+    zvkb                 1.0
+    zvkg                 1.0
+    zvkn                 1.0
+    zvknc                1.0
+    zvkned               1.0
+    zvkng                1.0
+    zvknha               1.0
+    zvknhb               1.0
+    zvks                 1.0
+    zvksc                1.0
+    zvksed               1.0
+    zvksg                1.0
+    zvksh                1.0
+    zvkt                 1.0
+    zvl1024b             1.0
+    zvl128b              1.0
+    zvl16384b            1.0
+    zvl2048b             1.0
+    zvl256b              1.0
+    zvl32768b            1.0
+    zvl32b               1.0
+    zvl4096b             1.0
+    zvl512b              1.0
+    zvl64b               1.0
+    zvl65536b            1.0
+    zvl8192b             1.0
+    zhinx                1.0
+    zhinxmin             1.0
+    shcounterenw         1.0
+    shgatpa              1.0
+    shtvala              1.0
+    shvsatpa             1.0
+    shvstvala            1.0
+    shvstvecd            1.0
+    smaia                1.0
+    smepmp               1.0
+    ssaia                1.0
+    ssccptr              1.0
+    sscofpmf             1.0
+    sscounterenw         1.0
+    ssstateen            1.0
+    ssstrict             1.0
+    sstc                 1.0
+    sstvala              1.0
+    sstvecd              1.0
+    ssu64xl              1.0
+    svade                1.0
+    svadu                1.0
+    svbare               1.0
+    svinval              1.0
+    svnapot              1.0
+    svpbmt               1.0
+    xcvalu               1.0
+    xcvbi                1.0
+    xcvbitmanip          1.0
+    xcvelw               1.0
+    xcvmac               1.0
+    xcvmem               1.0
+    xcvsimd              1.0
+    xsfcease             1.0
+    xsfvcp               1.0
+    xsfvfnrclipxfqf      1.0
+    xsfvfwmaccqqq        1.0
+    xsfvqmaccdod         1.0
+    xsfvqmaccqoq         1.0
+    xsifivecdiscarddlone 1.0
+    xsifivecflushdlone   1.0
+    xtheadba             1.0
+    xtheadbb             1.0
+    xtheadbs             1.0
+    xtheadcmo            1.0
+    xtheadcondmov        1.0
+    xtheadfmemidx        1.0
+    xtheadmac            1.0
+    xtheadmemidx         1.0
+    xtheadmempair        1.0
+    xtheadsync           1.0
+    xtheadvdot           1.0
+    xventanacondops      1.0
 
 Experimental extensions
-    zicfilp             0.4       This is a long dummy description
-    zicfiss             0.4
-    zimop               0.1
-    zaamo               0.2
-    zabha               1.0
-    zalasr              0.1
-    zalrsc              0.2
-    zfbfmin             1.0
-    zcmop               0.2
-    ztso                0.1
-    zvfbfmin            1.0
-    zvfbfwma            1.0
-    smmpm               0.8
-    smnpm               0.8
-    ssnpm               0.8
-    sspm                0.8
-    ssqosid             1.0
-    supm                0.8
+    zicfilp              0.4       This is a long dummy description
+    zicfiss              0.4
+    zimop                0.1
+    zaamo                0.2
+    zabha                1.0
+    zalasr               0.1
+    zalrsc               0.2
+    zfbfmin              1.0
+    zcmop                0.2
+    ztso                 0.1
+    zvfbfmin             1.0
+    zvfbfwma             1.0
+    smmpm                0.8
+    smnpm                0.8
+    ssnpm                0.8
+    sspm                 0.8
+    ssqosid              1.0
+    supm                 0.8
 
 Use -march to specify the target's extension.
 For example, clang -march=rv32i_v1p0)";
diff --git a/llvm/unittests/Support/TimeProfilerTest.cpp b/llvm/unittests/Support/TimeProfilerTest.cpp
index 6be716bae6b3f..bb820ec99a393 100644
--- a/llvm/unittests/Support/TimeProfilerTest.cpp
+++ b/llvm/unittests/Support/TimeProfilerTest.cpp
@@ -54,6 +54,17 @@ TEST(TimeProfiler, Begin_End_Smoke) {
   ASSERT_TRUE(json.find(R"("detail":"detail")") != std::string::npos);
 }
 
+TEST(TimeProfiler, Async_Begin_End_Smoke) {
+  setupProfiler();
+
+  auto *Profiler = timeTraceAsyncProfilerBegin("event", "detail");
+  timeTraceProfilerEnd(Profiler);
+
+  std::string json = teardownProfiler();
+  ASSERT_TRUE(json.find(R"("name":"event")") != std::string::npos);
+  ASSERT_TRUE(json.find(R"("detail":"detail")") != std::string::npos);
+}
+
 TEST(TimeProfiler, Begin_End_Disabled) {
   // Nothing should be observable here. The test is really just making sure
   // we've not got a stray nullptr deref.
diff --git a/llvm/unittests/Support/VirtualFileSystemTest.cpp b/llvm/unittests/Support/VirtualFileSystemTest.cpp
index d4abbb4345873..695b09343257f 100644
--- a/llvm/unittests/Support/VirtualFileSystemTest.cpp
+++ b/llvm/unittests/Support/VirtualFileSystemTest.cpp
@@ -7,9 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
@@ -3330,3 +3332,66 @@ TEST(RedirectingFileSystemTest, Used) {
   EXPECT_TRUE(Redirecting1->hasBeenUsed());
   EXPECT_FALSE(Redirecting2->hasBeenUsed());
 }
+
+// Check that paths looked up in the external filesystem are unmodified, except
+// potentially to add the working directory. We cannot canonicalize away ..
+// in the presence of symlinks in the external filesystem.
+TEST(RedirectingFileSystemTest, ExternalPaths) {
+  struct InterceptorFS : llvm::vfs::ProxyFileSystem {
+    std::vector<std::string> SeenPaths;
+
+    InterceptorFS(IntrusiveRefCntPtr<FileSystem> UnderlyingFS)
+        : ProxyFileSystem(UnderlyingFS) {}
+
+    llvm::ErrorOr<llvm::vfs::Status> status(const Twine &Path) override {
+      SeenPaths.push_back(Path.str());
+      return ProxyFileSystem::status(Path);
+    }
+
+    llvm::ErrorOr<std::unique_ptr<llvm::vfs::File>>
+    openFileForRead(const Twine &Path) override {
+      SeenPaths.push_back(Path.str());
+      return ProxyFileSystem::openFileForRead(Path);
+    }
+
+    std::error_code isLocal(const Twine &Path, bool &Result) override {
+      SeenPaths.push_back(Path.str());
+      return ProxyFileSystem::isLocal(Path, Result);
+    }
+
+    vfs::directory_iterator dir_begin(const Twine &Dir,
+                                      std::error_code &EC) override {
+      SeenPaths.push_back(Dir.str());
+      return ProxyFileSystem::dir_begin(Dir, EC);
+    }
+  };
+
+  std::error_code EC;
+  auto BaseFS = makeIntrusiveRefCnt<DummyFileSystem>();
+  BaseFS->setCurrentWorkingDirectory("/cwd");
+  auto CheckFS = makeIntrusiveRefCnt<InterceptorFS>(BaseFS);
+  auto FS = vfs::RedirectingFileSystem::create({}, /*UseExternalNames=*/false,
+                                               *CheckFS);
+
+  FS->status("/a/../b");
+  FS->openFileForRead("c");
+  FS->exists("./d");
+  bool IsLocal = false;
+  FS->isLocal("/e/./../f", IsLocal);
+  FS->dir_begin(".././g", EC);
+
+  std::vector<std::string> Expected{"/a/../b", "/cwd/c", "/cwd/./d",
+                                    "/e/./../f", "/cwd/.././g"};
+
+  EXPECT_EQ(CheckFS->SeenPaths, Expected);
+
+  CheckFS->SeenPaths.clear();
+  FS->setRedirection(vfs::RedirectingFileSystem::RedirectKind::Fallback);
+  FS->status("/a/../b");
+  FS->openFileForRead("c");
+  FS->exists("./d");
+  FS->isLocal("/e/./../f", IsLocal);
+  FS->dir_begin(".././g", EC);
+
+  EXPECT_EQ(CheckFS->SeenPaths, Expected);
+}
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 3773f59a3c5af..a7d0b1687a7f9 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1140,6 +1140,22 @@ INSTANTIATE_TEST_SUITE_P(
                  AArch64::AEK_PERFMON,     AArch64::AEK_PREDRES,
                  AArch64::AEK_JSCVT,       AArch64::AEK_FCMA}),
             "9.2-A"),
+        ARMCPUTestParams<AArch64::ExtensionBitset>(
+            "cortex-a520ae", "armv9.2-a", "crypto-neon-fp-armv8",
+            AArch64::ExtensionBitset(
+                {AArch64::AEK_BF16,        AArch64::AEK_I8MM,
+                 AArch64::AEK_SVE,         AArch64::AEK_SVE2,
+                 AArch64::AEK_FP16,        AArch64::AEK_DOTPROD,
+                 AArch64::AEK_LSE,         AArch64::AEK_RDM,
+                 AArch64::AEK_SIMD,        AArch64::AEK_RCPC,
+                 AArch64::AEK_RAS,         AArch64::AEK_CRC,
+                 AArch64::AEK_FP,          AArch64::AEK_SB,
+                 AArch64::AEK_SSBS,        AArch64::AEK_MTE,
+                 AArch64::AEK_FP16FML,     AArch64::AEK_PAUTH,
+                 AArch64::AEK_SVE2BITPERM, AArch64::AEK_FLAGM,
+                 AArch64::AEK_PERFMON,     AArch64::AEK_PREDRES,
+                 AArch64::AEK_JSCVT,       AArch64::AEK_FCMA}),
+            "9.2-A"),
         ARMCPUTestParams<AArch64::ExtensionBitset>(
             "cortex-a57", "armv8-a", "crypto-neon-fp-armv8",
             AArch64::ExtensionBitset({AArch64::AEK_CRC, AArch64::AEK_AES,
@@ -1283,6 +1299,23 @@ INSTANTIATE_TEST_SUITE_P(
                  AArch64::AEK_PROFILE,     AArch64::AEK_JSCVT,
                  AArch64::AEK_FCMA}),
             "9.2-A"),
+        ARMCPUTestParams<AArch64::ExtensionBitset>(
+            "cortex-a720ae", "armv9.2-a", "crypto-neon-fp-armv8",
+            AArch64::ExtensionBitset(
+                {AArch64::AEK_BF16,        AArch64::AEK_I8MM,
+                 AArch64::AEK_SVE,         AArch64::AEK_SVE2,
+                 AArch64::AEK_FP16,        AArch64::AEK_DOTPROD,
+                 AArch64::AEK_LSE,         AArch64::AEK_RDM,
+                 AArch64::AEK_SIMD,        AArch64::AEK_RCPC,
+                 AArch64::AEK_RAS,         AArch64::AEK_CRC,
+                 AArch64::AEK_FP,          AArch64::AEK_SB,
+                 AArch64::AEK_SSBS,        AArch64::AEK_MTE,
+                 AArch64::AEK_FP16FML,     AArch64::AEK_PAUTH,
+                 AArch64::AEK_SVE2BITPERM, AArch64::AEK_FLAGM,
+                 AArch64::AEK_PERFMON,     AArch64::AEK_PREDRES,
+                 AArch64::AEK_PROFILE,     AArch64::AEK_JSCVT,
+                 AArch64::AEK_FCMA}),
+            "9.2-A"),
         ARMCPUTestParams<AArch64::ExtensionBitset>(
             "neoverse-v1", "armv8.4-a", "crypto-neon-fp-armv8",
             AArch64::ExtensionBitset(
@@ -1717,7 +1750,7 @@ INSTANTIATE_TEST_SUITE_P(
     ARMCPUTestParams<AArch64::ExtensionBitset>::PrintToStringParamName);
 
 // Note: number of CPUs includes aliases.
-static constexpr unsigned NumAArch64CPUArchs = 70;
+static constexpr unsigned NumAArch64CPUArchs = 72;
 
 TEST(TargetParserTest, testAArch64CPUArchList) {
   SmallVector<StringRef, NumAArch64CPUArchs> List;
diff --git a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
index 89fa1334b427e..0b00734fc4d75 100644
--- a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
+++ b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
@@ -60,7 +60,7 @@ struct DebugValueDrop : public FunctionPass {
       for (Instruction &I : BB) {
         if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
           Dbgs.push_back(DVI);
-        // If there are any non-intrinsic records (DPValues), drop those too.
+        // If there are any non-intrinsic records (DbgRecords), drop those too.
         I.dropDbgRecords();
       }
     }
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index 87a2a2ae47005..4258f218794f8 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -1279,11 +1279,11 @@ TEST(Local, ExpressionForConstant) {
   EXPECT_EQ(Expr, nullptr);
 }
 
-TEST(Local, ReplaceDPValue) {
+TEST(Local, ReplaceDbgVariableRecord) {
   LLVMContext C;
 
-  // Test that RAUW also replaces the operands of DPValue objects, i.e.
-  // non-instruction stored debugging information.
+  // Test that RAUW also replaces the operands of DbgVariableRecord objects,
+  // i.e. non-instruction stored debugging information.
   std::unique_ptr<Module> M = parseIR(C,
                                       R"(
       declare void @llvm.dbg.value(metadata, metadata, metadata)
@@ -1323,24 +1323,23 @@ TEST(Local, ReplaceDPValue) {
   It = std::next(It);
   Instruction *RetInst = &*It;
 
-  // Convert DVI into a DPValue.
+  // Convert DVI into a DbgVariableRecord.
   RetInst->DbgMarker = new DPMarker();
   RetInst->DbgMarker->MarkedInstr = RetInst;
-  DPValue *DPV = new DPValue(DVI);
-  RetInst->DbgMarker->insertDbgRecord(DPV, false);
+  DbgVariableRecord *DVR = new DbgVariableRecord(DVI);
+  RetInst->DbgMarker->insertDbgRecord(DVR, false);
   // ... and erase the dbg.value.
   DVI->eraseFromParent();
 
-  // DPV should originally refer to %bar,
-  EXPECT_EQ(DPV->getVariableLocationOp(0), BarInst);
+  // DVR should originally refer to %bar,
+  EXPECT_EQ(DVR->getVariableLocationOp(0), BarInst);
 
   // Now try to replace the computation of %bar with %foo -- this should cause
-  // the DPValue's to have it's operand updated beneath it.
+  // the DbgVariableRecord's to have it's operand updated beneath it.
   BarInst->replaceAllUsesWith(FooInst);
-  // Check DPV now points at %foo.
-  EXPECT_EQ(DPV->getVariableLocationOp(0), FooInst);
+  // Check DVR now points at %foo.
+  EXPECT_EQ(DVR->getVariableLocationOp(0), FooInst);
 
   // Teardown.
   RetInst->DbgMarker->eraseFromParent();
 }
-
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index febd96086df27..5df7990d8fc2c 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -375,6 +375,10 @@ class AsmVariantInfo {
   int AsmVariantNo;
 };
 
+bool getPreferSmallerInstructions(CodeGenTarget const &Target) {
+  return Target.getAsmParser()->getValueAsBit("PreferSmallerInstructions");
+}
+
 /// MatchableInfo - Helper class for storing the necessary information for an
 /// instruction or alias which is capable of being matched.
 struct MatchableInfo {
@@ -502,6 +506,9 @@ struct MatchableInfo {
   /// matchable came from.
   Record *const TheDef;
 
+  // ResInstSize - The size of the resulting instruction for this matchable.
+  unsigned ResInstSize;
+
   /// DefRec - This is the definition that it came from.
   PointerUnion<const CodeGenInstruction *, const CodeGenInstAlias *> DefRec;
 
@@ -543,10 +550,12 @@ struct MatchableInfo {
 
   MatchableInfo(const CodeGenInstruction &CGI)
       : AsmVariantID(0), AsmString(CGI.AsmString), TheDef(CGI.TheDef),
-        DefRec(&CGI), UseInstAsmMatchConverter(true) {}
+        ResInstSize(TheDef->getValueAsInt("Size")), DefRec(&CGI),
+        UseInstAsmMatchConverter(true) {}
 
   MatchableInfo(std::unique_ptr<const CodeGenInstAlias> Alias)
       : AsmVariantID(0), AsmString(Alias->AsmString), TheDef(Alias->TheDef),
+        ResInstSize(Alias->ResultInst->TheDef->getValueAsInt("Size")),
         DefRec(Alias.release()), UseInstAsmMatchConverter(TheDef->getValueAsBit(
                                      "UseInstAsmMatchConverter")) {}
 
@@ -555,9 +564,9 @@ struct MatchableInfo {
   // where it was copied while being in an owning state.
   MatchableInfo(const MatchableInfo &RHS)
       : AsmVariantID(RHS.AsmVariantID), AsmString(RHS.AsmString),
-        TheDef(RHS.TheDef), DefRec(RHS.DefRec), ResOperands(RHS.ResOperands),
-        Mnemonic(RHS.Mnemonic), AsmOperands(RHS.AsmOperands),
-        RequiredFeatures(RHS.RequiredFeatures),
+        TheDef(RHS.TheDef), ResInstSize(RHS.ResInstSize), DefRec(RHS.DefRec),
+        ResOperands(RHS.ResOperands), Mnemonic(RHS.Mnemonic),
+        AsmOperands(RHS.AsmOperands), RequiredFeatures(RHS.RequiredFeatures),
         ConversionFnKind(RHS.ConversionFnKind),
         HasDeprecation(RHS.HasDeprecation),
         UseInstAsmMatchConverter(RHS.UseInstAsmMatchConverter) {
@@ -608,12 +617,18 @@ struct MatchableInfo {
   void buildInstructionResultOperands();
   void buildAliasResultOperands(bool AliasConstraintsAreChecked);
 
-  /// operator< - Compare two matchables.
-  bool operator<(const MatchableInfo &RHS) const {
+  /// shouldBeMatchedBefore - Compare two matchables for ordering.
+  bool shouldBeMatchedBefore(const MatchableInfo &RHS,
+                             bool PreferSmallerInstructions) const {
     // The primary comparator is the instruction mnemonic.
     if (int Cmp = Mnemonic.compare_insensitive(RHS.Mnemonic))
       return Cmp == -1;
 
+    // (Optionally) Order by the resultant instuctions size.
+    // eg. for ARM thumb instructions smaller encodings should be preferred.
+    if (PreferSmallerInstructions && ResInstSize != RHS.ResInstSize)
+      return ResInstSize < RHS.ResInstSize;
+
     if (AsmOperands.size() != RHS.AsmOperands.size())
       return AsmOperands.size() < RHS.AsmOperands.size();
 
@@ -652,7 +667,8 @@ struct MatchableInfo {
   /// couldMatchAmbiguouslyWith - Check whether this matchable could
   /// ambiguously match the same set of operands as \p RHS (without being a
   /// strictly superior match).
-  bool couldMatchAmbiguouslyWith(const MatchableInfo &RHS) const {
+  bool couldMatchAmbiguouslyWith(const MatchableInfo &RHS,
+                                 bool PreferSmallerInstructions) const {
     // The primary comparator is the instruction mnemonic.
     if (Mnemonic != RHS.Mnemonic)
       return false;
@@ -661,6 +677,10 @@ struct MatchableInfo {
     if (AsmVariantID != RHS.AsmVariantID)
       return false;
 
+    // The size of instruction is unambiguous.
+    if (PreferSmallerInstructions && ResInstSize != RHS.ResInstSize)
+      return false;
+
     // The number of operands is unambiguous.
     if (AsmOperands.size() != RHS.AsmOperands.size())
       return false;
@@ -1986,16 +2006,16 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
   }
   CvtOS << "  assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n";
   CvtOS << "  const uint8_t *Converter = ConversionTable[Kind];\n";
-  CvtOS << "  unsigned OpIdx;\n";
   CvtOS << "  Inst.setOpcode(Opcode);\n";
   CvtOS << "  for (const uint8_t *p = Converter; *p; p += 2) {\n";
   if (HasOptionalOperands) {
     // When optional operands are involved, formal and actual operand indices
     // may differ. Map the former to the latter by subtracting the number of
     // absent optional operands.
-    CvtOS << "    OpIdx = *(p + 1) - DefaultsOffset[*(p + 1)];\n";
+    // FIXME: This is not an operand index in the CVT_Tied case
+    CvtOS << "    unsigned OpIdx = *(p + 1) - DefaultsOffset[*(p + 1)];\n";
   } else {
-    CvtOS << "    OpIdx = *(p + 1);\n";
+    CvtOS << "    unsigned OpIdx = *(p + 1);\n";
   }
   CvtOS << "    switch (*p) {\n";
   CvtOS << "    default: llvm_unreachable(\"invalid conversion entry!\");\n";
@@ -2004,11 +2024,11 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
         << " &>(*Operands[OpIdx]).addRegOperands(Inst, 1);\n";
   CvtOS << "      break;\n";
   CvtOS << "    case CVT_Tied: {\n";
-  CvtOS << "      assert(OpIdx < (size_t)(std::end(TiedAsmOperandTable) -\n";
+  CvtOS << "      assert(*(p + 1) < (size_t)(std::end(TiedAsmOperandTable) -\n";
   CvtOS
       << "                              std::begin(TiedAsmOperandTable)) &&\n";
   CvtOS << "             \"Tied operand not found\");\n";
-  CvtOS << "      unsigned TiedResOpnd = TiedAsmOperandTable[OpIdx][0];\n";
+  CvtOS << "      unsigned TiedResOpnd = TiedAsmOperandTable[*(p + 1)][0];\n";
   CvtOS << "      if (TiedResOpnd != (uint8_t)-1)\n";
   CvtOS << "        Inst.addOperand(Inst.getOperand(TiedResOpnd));\n";
   CvtOS << "      break;\n";
@@ -3221,20 +3241,23 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   AsmMatcherInfo Info(AsmParser, Target, Records);
   Info.buildInfo();
 
+  bool PreferSmallerInstructions = getPreferSmallerInstructions(Target);
   // Sort the instruction table using the partial order on classes. We use
   // stable_sort to ensure that ambiguous instructions are still
   // deterministically ordered.
   llvm::stable_sort(
       Info.Matchables,
-      [](const std::unique_ptr<MatchableInfo> &a,
-         const std::unique_ptr<MatchableInfo> &b) { return *a < *b; });
+      [PreferSmallerInstructions](const std::unique_ptr<MatchableInfo> &A,
+                                  const std::unique_ptr<MatchableInfo> &B) {
+        return A->shouldBeMatchedBefore(*B, PreferSmallerInstructions);
+      });
 
 #ifdef EXPENSIVE_CHECKS
   // Verify that the table is sorted and operator < works transitively.
   for (auto I = Info.Matchables.begin(), E = Info.Matchables.end(); I != E;
        ++I) {
     for (auto J = I; J != E; ++J) {
-      assert(!(**J < **I));
+      assert(!(*J)->shouldBeMatchedBefore(**I, PreferSmallerInstructions));
     }
   }
 #endif
@@ -3253,7 +3276,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
         const MatchableInfo &A = **I;
         const MatchableInfo &B = **J;
 
-        if (A.couldMatchAmbiguouslyWith(B)) {
+        if (A.couldMatchAmbiguouslyWith(B, PreferSmallerInstructions)) {
           errs() << "warning: ambiguous matchables:\n";
           A.dump();
           errs() << "\nis incomparable with:\n";
@@ -3729,6 +3752,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
     OS << "        } else {\n";
     OS << "          DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"but formal "
           "operand not required\\n\");\n";
+    OS << "          if (isSubclass(Formal, OptionalMatchClass)) {\n";
+    OS << "            OptionalOperandsMask.set(FormalIdx);\n";
+    OS << "          }\n";
     OS << "        }\n";
     OS << "        continue;\n";
   } else {
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 628bff520a12e..732f34ed04c57 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2283,10 +2283,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_CheckField: {
       // Decode the start value.
-      unsigned Len;
-      unsigned Start = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      Len = *Ptr;)";
+      unsigned Start = decodeULEB128AndInc(++Ptr);
+      unsigned Len = *Ptr;)";
   if (IsVarLenInst)
     OS << "\n      makeUp(insn, Start + Len);";
   OS << R"(
@@ -2311,10 +2309,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       break;
     }
     case MCD::OPC_CheckPredicate: {
-      unsigned Len;
       // Decode the Predicate Index value.
-      unsigned PIdx = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
+      unsigned PIdx = decodeULEB128AndInc(++Ptr);
       // NumToSkip is a plain 24-bit integer.
       unsigned NumToSkip = *Ptr++;
       NumToSkip |= (*Ptr++) << 8;
@@ -2330,18 +2326,15 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       break;
     }
     case MCD::OPC_Decode: {
-      unsigned Len;
       // Decode the Opcode value.
-      unsigned Opc = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      unsigned DecodeIdx = decodeULEB128(Ptr, &Len);
-      Ptr += Len;
+      unsigned Opc = decodeULEB128AndInc(++Ptr);
+      unsigned DecodeIdx = decodeULEB128AndInc(Ptr);
 
       MI.clear();
       MI.setOpcode(Opc);
       bool DecodeComplete;)";
   if (IsVarLenInst) {
-    OS << "\n      Len = InstrLenTable[Opc];\n"
+    OS << "\n      unsigned Len = InstrLenTable[Opc];\n"
        << "      makeUp(insn, Len);";
   }
   OS << R"(
@@ -2354,12 +2347,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       return S;
     }
     case MCD::OPC_TryDecode: {
-      unsigned Len;
       // Decode the Opcode value.
-      unsigned Opc = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      unsigned DecodeIdx = decodeULEB128(Ptr, &Len);
-      Ptr += Len;
+      unsigned Opc = decodeULEB128AndInc(++Ptr);
+      unsigned DecodeIdx = decodeULEB128AndInc(Ptr);
       // NumToSkip is a plain 24-bit integer.
       unsigned NumToSkip = *Ptr++;
       NumToSkip |= (*Ptr++) << 8;
@@ -2391,11 +2381,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_SoftFail: {
       // Decode the mask values.
-      unsigned Len;
-      uint64_t PositiveMask = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      uint64_t NegativeMask = decodeULEB128(Ptr, &Len);
-      Ptr += Len;
+      uint64_t PositiveMask = decodeULEB128AndInc(++Ptr);
+      uint64_t NegativeMask = decodeULEB128AndInc(Ptr);
       bool Fail = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0;
       if (Fail)
         S = MCDisassembler::SoftFail;
diff --git a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp
index 7f494e532b1f4..91c3b0b4359cf 100644
--- a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp
+++ b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp
@@ -40,12 +40,10 @@
 
 #include "CodeGenTarget.h"
 #include "PredicateExpander.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
-#include <set>
 #include <vector>
 
 using namespace llvm;
@@ -61,14 +59,14 @@ class MacroFusionPredicatorEmitter {
                            raw_ostream &OS);
   void emitMacroFusionImpl(std::vector<Record *> Fusions, PredicateExpander &PE,
                            raw_ostream &OS);
-  void emitPredicates(std::vector<Record *> &FirstPredicate,
+  void emitPredicates(std::vector<Record *> &FirstPredicate, bool IsCommutable,
                       PredicateExpander &PE, raw_ostream &OS);
-  void emitFirstPredicate(Record *SecondPredicate, PredicateExpander &PE,
-                          raw_ostream &OS);
-  void emitSecondPredicate(Record *SecondPredicate, PredicateExpander &PE,
-                           raw_ostream &OS);
-  void emitBothPredicate(Record *Predicates, PredicateExpander &PE,
-                         raw_ostream &OS);
+  void emitFirstPredicate(Record *SecondPredicate, bool IsCommutable,
+                          PredicateExpander &PE, raw_ostream &OS);
+  void emitSecondPredicate(Record *SecondPredicate, bool IsCommutable,
+                           PredicateExpander &PE, raw_ostream &OS);
+  void emitBothPredicate(Record *Predicates, bool IsCommutable,
+                         PredicateExpander &PE, raw_ostream &OS);
 
 public:
   MacroFusionPredicatorEmitter(RecordKeeper &R) : Records(R), Target(R) {}
@@ -103,6 +101,7 @@ void MacroFusionPredicatorEmitter::emitMacroFusionImpl(
   for (Record *Fusion : Fusions) {
     std::vector<Record *> Predicates =
         Fusion->getValueAsListOfDefs("Predicates");
+    bool IsCommutable = Fusion->getValueAsBit("IsCommutable");
 
     OS << "bool is" << Fusion->getName() << "(\n";
     OS.indent(4) << "const TargetInstrInfo &TII,\n";
@@ -111,7 +110,7 @@ void MacroFusionPredicatorEmitter::emitMacroFusionImpl(
     OS.indent(4) << "const MachineInstr &SecondMI) {\n";
     OS.indent(2) << "auto &MRI = SecondMI.getMF()->getRegInfo();\n";
 
-    emitPredicates(Predicates, PE, OS);
+    emitPredicates(Predicates, IsCommutable, PE, OS);
 
     OS.indent(2) << "return true;\n";
     OS << "}\n";
@@ -122,15 +121,16 @@ void MacroFusionPredicatorEmitter::emitMacroFusionImpl(
 }
 
 void MacroFusionPredicatorEmitter::emitPredicates(
-    std::vector<Record *> &Predicates, PredicateExpander &PE, raw_ostream &OS) {
+    std::vector<Record *> &Predicates, bool IsCommutable, PredicateExpander &PE,
+    raw_ostream &OS) {
   for (Record *Predicate : Predicates) {
     Record *Target = Predicate->getValueAsDef("Target");
     if (Target->getName() == "first_fusion_target")
-      emitFirstPredicate(Predicate, PE, OS);
+      emitFirstPredicate(Predicate, IsCommutable, PE, OS);
     else if (Target->getName() == "second_fusion_target")
-      emitSecondPredicate(Predicate, PE, OS);
+      emitSecondPredicate(Predicate, IsCommutable, PE, OS);
     else if (Target->getName() == "both_fusion_target")
-      emitBothPredicate(Predicate, PE, OS);
+      emitBothPredicate(Predicate, IsCommutable, PE, OS);
     else
       PrintFatalError(Target->getLoc(),
                       "Unsupported 'FusionTarget': " + Target->getName());
@@ -138,6 +138,7 @@ void MacroFusionPredicatorEmitter::emitPredicates(
 }
 
 void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate,
+                                                      bool IsCommutable,
                                                       PredicateExpander &PE,
                                                       raw_ostream &OS) {
   if (Predicate->isSubClassOf("WildcardPred")) {
@@ -170,6 +171,7 @@ void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate,
 }
 
 void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate,
+                                                       bool IsCommutable,
                                                        PredicateExpander &PE,
                                                        raw_ostream &OS) {
   if (Predicate->isSubClassOf("FusionPredicateWithMCInstPredicate")) {
@@ -182,6 +184,36 @@ void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate,
     OS << ")\n";
     OS.indent(4) << "  return false;\n";
     OS.indent(2) << "}\n";
+  } else if (Predicate->isSubClassOf("SameReg")) {
+    int FirstOpIdx = Predicate->getValueAsInt("FirstOpIdx");
+    int SecondOpIdx = Predicate->getValueAsInt("SecondOpIdx");
+
+    OS.indent(2) << "if (!SecondMI.getOperand(" << FirstOpIdx
+                 << ").getReg().isVirtual()) {\n";
+    OS.indent(4) << "if (SecondMI.getOperand(" << FirstOpIdx
+                 << ").getReg() != SecondMI.getOperand(" << SecondOpIdx
+                 << ").getReg())";
+
+    if (IsCommutable) {
+      OS << " {\n";
+      OS.indent(6) << "if (!SecondMI.getDesc().isCommutable())\n";
+      OS.indent(6) << "  return false;\n";
+
+      OS.indent(6)
+          << "unsigned SrcOpIdx1 = " << SecondOpIdx
+          << ", SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;\n";
+      OS.indent(6)
+          << "if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))\n";
+      OS.indent(6)
+          << "  if (SecondMI.getOperand(" << FirstOpIdx
+          << ").getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())\n";
+      OS.indent(6) << "    return false;\n";
+      OS.indent(4) << "}\n";
+    } else {
+      OS << "\n";
+      OS.indent(4) << "  return false;\n";
+    }
+    OS.indent(2) << "}\n";
   } else {
     PrintFatalError(Predicate->getLoc(),
                     "Unsupported predicate for second instruction: " +
@@ -190,13 +222,14 @@ void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate,
 }
 
 void MacroFusionPredicatorEmitter::emitBothPredicate(Record *Predicate,
+                                                     bool IsCommutable,
                                                      PredicateExpander &PE,
                                                      raw_ostream &OS) {
   if (Predicate->isSubClassOf("FusionPredicateWithCode"))
     OS << Predicate->getValueAsString("Predicate");
   else if (Predicate->isSubClassOf("BothFusionPredicateWithMCInstPredicate")) {
-    emitFirstPredicate(Predicate, PE, OS);
-    emitSecondPredicate(Predicate, PE, OS);
+    emitFirstPredicate(Predicate, IsCommutable, PE, OS);
+    emitSecondPredicate(Predicate, IsCommutable, PE, OS);
   } else if (Predicate->isSubClassOf("TieReg")) {
     int FirstOpIdx = Predicate->getValueAsInt("FirstOpIdx");
     int SecondOpIdx = Predicate->getValueAsInt("SecondOpIdx");
@@ -206,8 +239,28 @@ void MacroFusionPredicatorEmitter::emitBothPredicate(Record *Predicate,
                  << ").isReg() &&\n";
     OS.indent(2) << "      FirstMI->getOperand(" << FirstOpIdx
                  << ").getReg() == SecondMI.getOperand(" << SecondOpIdx
-                 << ").getReg()))\n";
-    OS.indent(2) << "  return false;\n";
+                 << ").getReg()))";
+
+    if (IsCommutable) {
+      OS << " {\n";
+      OS.indent(4) << "if (!SecondMI.getDesc().isCommutable())\n";
+      OS.indent(4) << "  return false;\n";
+
+      OS.indent(4)
+          << "unsigned SrcOpIdx1 = " << SecondOpIdx
+          << ", SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;\n";
+      OS.indent(4)
+          << "if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))\n";
+      OS.indent(4)
+          << "  if (FirstMI->getOperand(" << FirstOpIdx
+          << ").getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())\n";
+      OS.indent(4) << "    return false;\n";
+      OS.indent(2) << "}";
+    } else {
+      OS << "\n";
+      OS.indent(2) << "  return false;";
+    }
+    OS << "\n";
   } else
     PrintFatalError(Predicate->getLoc(),
                     "Unsupported predicate for both instruction: " +
diff --git a/llvm/utils/TableGen/X86ManualFoldTables.def b/llvm/utils/TableGen/X86ManualFoldTables.def
index 8e6cb4a7bd879..2ebd92883c071 100644
--- a/llvm/utils/TableGen/X86ManualFoldTables.def
+++ b/llvm/utils/TableGen/X86ManualFoldTables.def
@@ -227,6 +227,13 @@ NOFOLD(MMX_MOVQ64rr_REV)
 NOFOLD(INSERTPSrr)
 NOFOLD(VINSERTPSZrr)
 NOFOLD(VINSERTPSrr)
+// Memory faults are suppressed for CFCMOV with memory operand.
+NOFOLD(CFCMOV16rr_REV)
+NOFOLD(CFCMOV32rr_REV)
+NOFOLD(CFCMOV64rr_REV)
+NOFOLD(CFCMOV16rr_ND)
+NOFOLD(CFCMOV32rr_ND)
+NOFOLD(CFCMOV64rr_ND)
 #undef NOFOLD
 
 #ifndef ENTRY
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index 39006a40daeb6..a2bc037b690c6 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -540,6 +540,13 @@ void RecognizableInstr::emitInstructionSpecifier() {
     HANDLE_OPERAND(relocation)
     HANDLE_OPERAND(opcodeModifier)
     break;
+  case X86Local::MRMDestRegCC:
+    assert(numPhysicalOperands == 3 &&
+           "Unexpected number of operands for MRMDestRegCC");
+    HANDLE_OPERAND(rmRegister)
+    HANDLE_OPERAND(roRegister)
+    HANDLE_OPERAND(opcodeModifier)
+    break;
   case X86Local::MRMDestReg:
     // Operand 1 is a register operand in the R/M field.
     // - In AVX512 there may be a mask operand here -
@@ -566,6 +573,13 @@ void RecognizableInstr::emitInstructionSpecifier() {
     HANDLE_OPTIONAL(immediate)
     HANDLE_OPTIONAL(immediate)
     break;
+  case X86Local::MRMDestMemCC:
+    assert(numPhysicalOperands == 3 &&
+           "Unexpected number of operands for MRMDestMemCC");
+    HANDLE_OPERAND(memory)
+    HANDLE_OPERAND(roRegister)
+    HANDLE_OPERAND(opcodeModifier)
+    break;
   case X86Local::MRMDestMem4VOp3CC:
     // Operand 1 is a register operand in the Reg/Opcode field.
     // Operand 2 is a register operand in the R/M field.
@@ -650,8 +664,10 @@ void RecognizableInstr::emitInstructionSpecifier() {
     HANDLE_OPTIONAL(immediate)
     break;
   case X86Local::MRMSrcRegCC:
-    assert(numPhysicalOperands == 3 &&
+    assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 4 &&
            "Unexpected number of operands for MRMSrcRegCC");
+    if (IsND)
+      HANDLE_OPERAND(vvvvRegister)
     HANDLE_OPERAND(roRegister)
     HANDLE_OPERAND(rmRegister)
     HANDLE_OPERAND(opcodeModifier)
@@ -700,8 +716,10 @@ void RecognizableInstr::emitInstructionSpecifier() {
     HANDLE_OPTIONAL(immediate)
     break;
   case X86Local::MRMSrcMemCC:
-    assert(numPhysicalOperands == 3 &&
+    assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 4 &&
            "Unexpected number of operands for MRMSrcMemCC");
+    if (IsND)
+      HANDLE_OPERAND(vvvvRegister)
     HANDLE_OPERAND(roRegister)
     HANDLE_OPERAND(memory)
     HANDLE_OPERAND(opcodeModifier)
@@ -871,6 +889,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
     filter = std::make_unique<DumbFilter>();
     break;
   case X86Local::MRMDestReg:
+  case X86Local::MRMDestRegCC:
   case X86Local::MRMSrcReg:
   case X86Local::MRMSrcReg4VOp3:
   case X86Local::MRMSrcRegOp4:
@@ -880,6 +899,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
     filter = std::make_unique<ModFilter>(true);
     break;
   case X86Local::MRMDestMem:
+  case X86Local::MRMDestMemCC:
   case X86Local::MRMDestMem4VOp3CC:
   case X86Local::MRMDestMemFSIB:
   case X86Local::MRMSrcMem:
@@ -950,6 +970,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
   if (Form == X86Local::AddRegFrm || Form == X86Local::MRMSrcRegCC ||
       Form == X86Local::MRMSrcMemCC || Form == X86Local::MRMXrCC ||
       Form == X86Local::MRMXmCC || Form == X86Local::AddCCFrm ||
+      Form == X86Local::MRMDestRegCC || Form == X86Local::MRMDestMemCC ||
       Form == X86Local::MRMDestMem4VOp3CC) {
     uint8_t Count = Form == X86Local::AddRegFrm ? 8 : 16;
     assert(((opcodeToSet % Count) == 0) && "ADDREG_FRM opcode not aligned");
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h
index ad319b3f28b72..68af68fb5aa03 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.h
+++ b/llvm/utils/TableGen/X86RecognizableInstr.h
@@ -106,6 +106,8 @@ enum {
   RawFrmImm16 = 8,
   AddCCFrm = 9,
   PrefixByte = 10,
+  MRMDestRegCC = 18,
+  MRMDestMemCC = 19,
   MRMDestMem4VOp3CC = 20,
   MRMr0 = 21,
   MRMSrcMemFSIB = 22,
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index 3b41f9b4007f3..cd90c7752e164 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -81,6 +81,7 @@ static_library("bugprone") {
     "SuspiciousReallocUsageCheck.cpp",
     "SuspiciousSemicolonCheck.cpp",
     "SuspiciousStringCompareCheck.cpp",
+    "SuspiciousStringviewDataUsageCheck.cpp",
     "SwappedArgumentsCheck.cpp",
     "SwitchMissingDefaultCaseCheck.cpp",
     "TerminatingContinueCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
index d484ff9cddd91..b18b109dcff56 100644
--- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
@@ -25,6 +25,7 @@ diag_groups = [
   "CrossTU",
   "Driver",
   "Frontend",
+  "InstallAPI",
   "Lex",
   "Parse",
   "Refactoring",
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn
index e69567c2d9c65..04f20211b3c71 100644
--- a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn
@@ -23,8 +23,8 @@ static_library("FlowSensitive") {
     target_gen_dir,
   ]
   sources = [
+    "AdornedCFG.cpp",
     "Arena.cpp",
-    "ControlFlowContext.cpp",
     "DataflowAnalysisContext.cpp",
     "DataflowEnvironment.cpp",
     "DebugSupport.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 7c00aaffdc1b9..25fcdc4d00151 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -205,6 +205,7 @@ copy("Headers") {
     "ia32intrin.h",
     "immintrin.h",
     "intrin.h",
+    "intrin0.h",
     "inttypes.h",
     "invpcidintrin.h",
     "iso646.h",
@@ -256,6 +257,7 @@ copy("Headers") {
     "ppc_wrappers/xmmintrin.h",
     "prfchiintrin.h",
     "prfchwintrin.h",
+    "ptrauth.h",
     "ptwriteintrin.h",
     "raointintrin.h",
     "rdpruintrin.h",
@@ -309,6 +311,7 @@ copy("Headers") {
     "xsaveoptintrin.h",
     "xsavesintrin.h",
     "xtestintrin.h",
+    "yvals_core.h",
   ]
   outputs = [ "$clang_resource_dir/include/{{source_target_relative}}" ]
 }
diff --git a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
index 5e533bf23ec47..2f43a915b2a66 100644
--- a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
@@ -3,10 +3,12 @@ static_library("InstallAPI") {
   configs += [ "//llvm/utils/gn/build:clang_code" ]
   deps = [
     "//clang/lib/AST",
+    "//llvm/lib/Demangle",
     "//llvm/lib/Support",
     "//llvm/lib/TextAPI",
   ]
   sources = [
+    "DylibVerifier.cpp",
     "FileList.cpp",
     "Frontend.cpp",
     "HeaderFile.cpp",
diff --git a/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn
index 4f6895181f552..89f0107686f91 100644
--- a/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn
@@ -1,8 +1,15 @@
+import("//llvm/utils/TableGen/tablegen.gni")
 import("//llvm/utils/gn/build/driver_executable.gni")
 
+tablegen("InstallAPIOpts") {
+  visibility = [ ":clang-installapi" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
 driver_executable("clang-installapi") {
   configs += [ "//llvm/utils/gn/build:clang_code" ]
   deps = [
+    ":InstallAPIOpts",
     "//clang/lib/Driver",
     "//clang/lib/Frontend",
     "//clang/lib/InstallAPI",
@@ -10,6 +17,7 @@ driver_executable("clang-installapi") {
     "//llvm/lib/Support",
     "//llvm/lib/TargetParser",
     "//llvm/lib/TextAPI",
+    "//llvm/lib/TextAPI/BinaryReader",
   ]
   sources = [
     "ClangInstallAPI.cpp",
diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/BUILD.gn
index 02a9595e2654b..c8347f6336d73 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/BUILD.gn
@@ -12,6 +12,7 @@ source_set("sources") {
   sources = [
     "allocator_common.h",
     "allocator_config.h",
+    "allocator_config_wrapper.h",
     "atomic_helpers.h",
     "bytemap.h",
     "checksum.cpp",
diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/tests/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/tests/BUILD.gn
index f69f1c413b47d..2ddc3abe47f74 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/tests/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/tests/BUILD.gn
@@ -8,6 +8,7 @@ unittest("ScudoUnitTest") {
   cflags = test_cflags
   deps = [ "//compiler-rt/lib/scudo/standalone:sources" ]
   sources = [
+    "allocator_config_test.cpp",
     "atomic_test.cpp",
     "bytemap_test.cpp",
     "checksum_test.cpp",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index f3d7b1bceb4d5..3c8b80526773a 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -824,10 +824,7 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_constant_evaluated.h",
       "__type_traits/is_constructible.h",
       "__type_traits/is_convertible.h",
-      "__type_traits/is_copy_assignable.h",
-      "__type_traits/is_copy_constructible.h",
       "__type_traits/is_core_convertible.h",
-      "__type_traits/is_default_constructible.h",
       "__type_traits/is_destructible.h",
       "__type_traits/is_empty.h",
       "__type_traits/is_enum.h",
@@ -843,17 +840,10 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_member_function_pointer.h",
       "__type_traits/is_member_object_pointer.h",
       "__type_traits/is_member_pointer.h",
-      "__type_traits/is_move_assignable.h",
-      "__type_traits/is_move_constructible.h",
       "__type_traits/is_nothrow_assignable.h",
       "__type_traits/is_nothrow_constructible.h",
       "__type_traits/is_nothrow_convertible.h",
-      "__type_traits/is_nothrow_copy_assignable.h",
-      "__type_traits/is_nothrow_copy_constructible.h",
-      "__type_traits/is_nothrow_default_constructible.h",
       "__type_traits/is_nothrow_destructible.h",
-      "__type_traits/is_nothrow_move_assignable.h",
-      "__type_traits/is_nothrow_move_constructible.h",
       "__type_traits/is_null_pointer.h",
       "__type_traits/is_object.h",
       "__type_traits/is_pod.h",
@@ -874,14 +864,9 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_trivial.h",
       "__type_traits/is_trivially_assignable.h",
       "__type_traits/is_trivially_constructible.h",
-      "__type_traits/is_trivially_copy_assignable.h",
-      "__type_traits/is_trivially_copy_constructible.h",
       "__type_traits/is_trivially_copyable.h",
-      "__type_traits/is_trivially_default_constructible.h",
       "__type_traits/is_trivially_destructible.h",
       "__type_traits/is_trivially_lexicographically_comparable.h",
-      "__type_traits/is_trivially_move_assignable.h",
-      "__type_traits/is_trivially_move_constructible.h",
       "__type_traits/is_trivially_relocatable.h",
       "__type_traits/is_unbounded_array.h",
       "__type_traits/is_union.h",
diff --git a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
index 39c3efaa040a5..7c9edfb016d7a 100644
--- a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
@@ -16,6 +16,7 @@ static_library("Host") {
   ]
   public_deps = [ "//llvm/utils/gn/build/libs/xml" ]
   sources = [
+    "common/Alarm.cpp",
     "common/File.cpp",
     "common/FileAction.cpp",
     "common/FileCache.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
index 6c667b280aa76..9d57f2f0877e3 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
@@ -34,6 +34,7 @@ static_library("CPlusPlus") {
     "LibCxxMap.cpp",
     "LibCxxQueue.cpp",
     "LibCxxRangesRefView.cpp",
+    "LibCxxSliceArray.cpp",
     "LibCxxSpan.cpp",
     "LibCxxTuple.cpp",
     "LibCxxUnorderedMap.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/LogicalView/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/LogicalView/BUILD.gn
index 763fdf21a1c81..12b9540cc6d98 100644
--- a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/LogicalView/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/LogicalView/BUILD.gn
@@ -29,6 +29,6 @@ static_library("LogicalView") {
     "Readers/LVBinaryReader.cpp",
     "Readers/LVCodeViewReader.cpp",
     "Readers/LVCodeViewVisitor.cpp",
-    "Readers/LVELFReader.cpp",
+    "Readers/LVDWARFReader.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
index 6caad81edec53..ba0f6d8c0f8cf 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
@@ -3,6 +3,7 @@ static_library("Support") {
   deps = [
     "BLAKE3",
     "//llvm/include/llvm/Config:config",
+    "//llvm/include/llvm/Support:write_vcsrevision",
     "//llvm/lib/Demangle",
     "//llvm/utils/gn/build/libs/pthread",
     "//llvm/utils/gn/build/libs/terminfo",
@@ -89,6 +90,8 @@ static_library("Support") {
     "GlobPattern.cpp",
     "GraphWriter.cpp",
     "Hashing.cpp",
+    "HexagonAttributeParser.cpp",
+    "HexagonAttributes.cpp",
     "InitLLVM.cpp",
     "InstructionCost.cpp",
     "IntEqClasses.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
index 3d8e3e66d26db..12d875cf40c98 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
@@ -103,6 +103,7 @@ static_library("MCTargetDesc") {
     "AMDGPUInstPrinter.cpp",
     "AMDGPUMCAsmInfo.cpp",
     "AMDGPUMCCodeEmitter.cpp",
+    "AMDGPUMCExpr.cpp",
     "AMDGPUMCTargetDesc.cpp",
     "AMDGPUTargetStreamer.cpp",
     "R600InstPrinter.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn
index d22d0433656e5..5710e07b07cf7 100644
--- a/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn
@@ -31,7 +31,6 @@ driver_executable("dsymutil") {
     "MachOUtils.cpp",
     "RelocationMap.cpp",
     "Reproducer.cpp",
-    "SymbolMap.cpp",
     "dsymutil.cpp",
   ]
   if (host_os == "mac") {
diff --git a/llvm/utils/gn/secondary/llvm/unittests/DebugInfo/LogicalView/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/DebugInfo/LogicalView/BUILD.gn
index 9cbddd8af9a8f..d87bac64e7b34 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/DebugInfo/LogicalView/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/DebugInfo/LogicalView/BUILD.gn
@@ -14,7 +14,7 @@ unittest("DebugInfoLogicalViewTests") {
     "CodeViewReaderTest.cpp",
     "CommandLineOptionsTest.cpp",
     "CompareElementsTest.cpp",
-    "ELFReaderTest.cpp",
+    "DWARFReaderTest.cpp",
     "LocationRangesTest.cpp",
     "LogicalElementsTest.cpp",
     "SelectElementsTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
index be1efff15f5aa..3e05b9b1f8c3f 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
@@ -24,6 +24,7 @@ unittest("OrcJITTests") {
     "JITTargetMachineBuilderTest.cpp",
     "LazyCallThroughAndReexportsTest.cpp",
     "LookupAndRecordAddrsTest.cpp",
+    "MachOPlatformTest.cpp",
     "MapperJITLinkMemoryManagerTest.cpp",
     "MemoryMapperTest.cpp",
     "ObjectFormatsTest.cpp",
diff --git a/llvm/utils/release/export.sh b/llvm/utils/release/export.sh
index 9fd906c49ea6e..66bef82586a34 100755
--- a/llvm/utils/release/export.sh
+++ b/llvm/utils/release/export.sh
@@ -84,7 +84,7 @@ export_sources() {
     # Determine the release by fetching the version from LLVM's CMakeLists.txt
     # in the specified git ref.
     if [ -n "$snapshot" ]; then
-        release=$(git -C $llvm_src_dir show $snapshot:llvm/CMakeLists.txt | grep -ioP 'set\(\s*LLVM_VERSION_(MAJOR|MINOR|PATCH)\s\K[0-9]+' | paste -sd '.')
+        release=$(git -C $llvm_src_dir show $snapshot:cmake/Modules/LLVMVersion.cmake | grep -ioP 'set\(\s*LLVM_VERSION_(MAJOR|MINOR|PATCH)\s\K[0-9]+' | paste -sd '.')
     fi
     
     tag="llvmorg-$release"
diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h
index d823afb659c8d..b3d7a788ccbba 100644
--- a/mlir/include/mlir-c/Dialect/LLVM.h
+++ b/mlir/include/mlir-c/Dialect/LLVM.h
@@ -229,10 +229,10 @@ MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIBasicTypeAttrGet(
 
 /// Creates a LLVM DICompositeType attribute.
 MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDICompositeTypeAttrGet(
-    MlirContext ctx, unsigned int tag, MlirAttribute name, MlirAttribute file,
-    uint32_t line, MlirAttribute scope, MlirAttribute baseType, int64_t flags,
-    uint64_t sizeInBits, uint64_t alignInBits, intptr_t nElements,
-    MlirAttribute const *elements);
+    MlirContext ctx, unsigned int tag, MlirAttribute recId, MlirAttribute name,
+    MlirAttribute file, uint32_t line, MlirAttribute scope,
+    MlirAttribute baseType, int64_t flags, uint64_t sizeInBits,
+    uint64_t alignInBits, intptr_t nElements, MlirAttribute const *elements);
 
 /// Creates a LLVM DIDerivedType attribute.
 MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIDerivedTypeAttrGet(
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index 1f64b57cac578..7b92b930fb5f5 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -48,6 +48,11 @@ std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 /// this method is thus able to determine non-trivial divisors.
 uint64_t getLargestDivisorOfTripCount(AffineForOp forOp);
 
+/// Checks if an affine read or write operation depends on `forOp`'s IV, i.e.,
+/// if the memory access is invariant on `forOp`.
+template <typename LoadOrStoreOp>
+bool isInvariantAccess(LoadOrStoreOp memOp, AffineForOp forOp);
+
 /// Given an induction variable `iv` of type AffineForOp and `indices` of type
 /// IndexType, returns the set of `indices` that are independent of `iv`.
 ///
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
index 8439930a87467..7ad0e4a1e5ea0 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h
@@ -44,6 +44,11 @@ class AffineValueMap {
   // Resets this AffineValueMap with 'map', 'operands', and 'results'.
   void reset(AffineMap map, ValueRange operands, ValueRange results = {});
 
+  /// Composes all incoming affine.apply ops and then simplifies and
+  /// canonicalizes the map and operands. This can change the number of
+  /// operands, but the result count remains the same.
+  void composeSimplifyAndCanonicalize();
+
   /// Return the value map that is the difference of value maps 'a' and 'b',
   /// represented as an affine map and its operands. The output map + operands
   /// are canonicalized and simplified.
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index d8cfeee246636..3a61a4b34765e 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -142,11 +142,21 @@ class OpFilter {
   /// This function adds an ALLOW entry.
   void allowDialect(StringRef dialectNamespace) {
     Entry::FilterFn filterFn = [=](Operation *op) {
-      return op->getDialect()->getNamespace() == dialectNamespace;
+      return op->getName().getDialectNamespace() == dialectNamespace;
     };
     entries.push_back(Entry{filterFn, Entry::FilterType::ALLOW});
   }
 
+  /// Deny the given dialect.
+  ///
+  /// This function adds a DENY entry.
+  void denyDialect(StringRef dialectNamespace) {
+    Entry::FilterFn filterFn = [=](Operation *op) {
+      return op->getDialect()->getNamespace() == dialectNamespace;
+    };
+    entries.push_back(Entry{filterFn, Entry::FilterType::DENY});
+  }
+
   /// Allow the given ops.
   ///
   /// This function adds one or multiple ALLOW entries.
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
index 809f03407258a..a729bc99b987c 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
@@ -148,7 +148,7 @@ std::unique_ptr<Pass> createBufferLoopHoistingPass();
 
 // Options struct for BufferResultsToOutParams pass.
 // Note: defined only here, not in tablegen.
-struct BufferResultsToOutParamsOptions {
+struct BufferResultsToOutParamsOpts {
   /// Memcpy function: Generate a memcpy between two memrefs.
   using MemCpyFn =
       std::function<LogicalResult(OpBuilder &, Location, Value, Value)>;
@@ -162,17 +162,21 @@ struct BufferResultsToOutParamsOptions {
   /// Memcpy function; used to create a copy between two memrefs.
   /// If this is empty, memref.copy is used.
   std::optional<MemCpyFn> memCpyFn;
+
+  /// If true, the pass adds a "bufferize.result" attribute to each output
+  /// parameter.
+  bool addResultAttribute = false;
 };
 
 /// Creates a pass that converts memref function results to out-params.
 std::unique_ptr<Pass> createBufferResultsToOutParamsPass(
-    const BufferResultsToOutParamsOptions &options = {});
+    const BufferResultsToOutParamsOpts &options = {});
 
 /// Replace buffers that are returned from a function with an out parameter.
 /// Also update all call sites.
 LogicalResult
 promoteBufferResultsToOutParams(ModuleOp module,
-                                const BufferResultsToOutParamsOptions &options);
+                                const BufferResultsToOutParamsOpts &options);
 
 /// Creates a pass that drops memref function results that are equivalent to a
 /// function argument.
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index e01f36b8daa18..1c3cdec81a39e 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -316,6 +316,11 @@ def BufferResultsToOutParams : Pass<"buffer-results-to-out-params", "ModuleOp">
     buffers for results need to be allocated in the caller. This currently only
     works for static shaped memrefs.
   }];
+  let options = [
+    Option<"addResultAttribute", "add-result-attr", "bool",
+       /*default=*/"false",
+       "Add the attribute 'bufferize.result' to all output parameters.">,
+  ];
   let constructor = "mlir::bufferization::createBufferResultsToOutParamsPass()";
   let dependentDialects = ["memref::MemRefDialect"];
 }
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index ec842f76628c0..78bfd561171f5 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -1155,4 +1155,36 @@ def EmitC_IfOp : EmitC_Op<"if",
   let hasCustomAssemblyFormat = 1;
 }
 
+def EmitC_SubscriptOp : EmitC_Op<"subscript",
+  [TypesMatchWith<"result type matches element type of 'array'",
+                  "array", "result",
+                  "::llvm::cast<ArrayType>($_self).getElementType()">]> {
+  let summary = "Array subscript operation";
+  let description = [{
+    With the `subscript` operation the subscript operator `[]` can be applied
+    to variables or arguments of array type.
+
+    Example:
+
+    ```mlir
+    %i = index.constant 1
+    %j = index.constant 7
+    %0 = emitc.subscript %arg0[%i, %j] : <4x8xf32>, index, index
+    ```
+  }];
+  let arguments = (ins Arg<EmitC_ArrayType, "the reference to load from">:$array,
+                       Variadic<IntegerIndexOrOpaqueType>:$indices);
+  let results = (outs AnyType:$result);
+
+  let builders = [
+    OpBuilder<(ins "Value":$array, "ValueRange":$indices), [{
+      build($_builder, $_state, cast<ArrayType>(array.getType()).getElementType(), array, indices);
+    }]>
+  ];
+
+  let hasVerifier = 1;
+  let assemblyFormat = "$array `[` $indices `]` attr-dict `:` type($array) `,` type($indices)";
+}
+
+
 #endif // MLIR_DIALECT_EMITC_IR_EMITC
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td
index 1ff41022eba8c..a2ba45a1f6a12 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td
@@ -39,7 +39,7 @@ def EmitC_ArrayType : EmitC_Type<"Array", "array", [ShapedTypeInterface]> {
     // Array emitted as `int32_t[10]`
     !emitc.array<10xi32>
     // Array emitted as `float[10][20]`
-    !emitc.ptr<10x20xf32>
+    !emitc.array<10x20xf32>
     ```
   }];
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
index 862abf00d0345..759de745440c2 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
@@ -29,6 +29,8 @@ add_mlir_doc(LLVMIntrinsicOps LLVMIntrinsicOps Dialects/ -gen-op-doc)
 set(LLVM_TARGET_DEFINITIONS LLVMInterfaces.td)
 mlir_tablegen(LLVMInterfaces.h.inc -gen-op-interface-decls)
 mlir_tablegen(LLVMInterfaces.cpp.inc -gen-op-interface-defs)
+mlir_tablegen(LLVMAttrInterfaces.h.inc -gen-attr-interface-decls)
+mlir_tablegen(LLVMAttrInterfaces.cpp.inc -gen-attr-interface-defs)
 mlir_tablegen(LLVMTypeInterfaces.h.inc -gen-type-interface-decls)
 mlir_tablegen(LLVMTypeInterfaces.cpp.inc -gen-type-interface-defs)
 add_public_tablegen_target(MLIRLLVMInterfacesIncGen)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index a831b076fb864..d4d56ae0f762b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -10,6 +10,7 @@
 #define LLVMIR_ATTRDEFS
 
 include "mlir/Dialect/LLVMIR/LLVMDialect.td"
+include "mlir/Dialect/LLVMIR/LLVMInterfaces.td"
 include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/CommonAttrConstraints.td"
 
@@ -238,7 +239,7 @@ def LoopAnnotationAttr : LLVM_Attr<"LoopAnnotation", "loop_annotation"> {
 //===----------------------------------------------------------------------===//
 
 class LLVM_DIParameter<string summary, string default, string parseName,
-                       string printName = parseName>
+                       string errorCase, string printName = parseName>
     : AttrOrTypeParameter<"unsigned", "debug info " # summary> {
   let parser = [{ [&]() -> FailureOr<unsigned> {
     SMLoc tagLoc = $_parser.getCurrentLocation();
@@ -246,33 +247,35 @@ class LLVM_DIParameter<string summary, string default, string parseName,
     if ($_parser.parseKeyword(&name))
       return failure();
 
-    if (unsigned tag = llvm::dwarf::get}] # parseName # [{(name))
-      return tag;
-    return $_parser.emitError(tagLoc)
-      << "invalid debug info }] # summary # [{ name: " << name;
+    unsigned tag = llvm::dwarf::get}] # parseName # [{(name);
+    if (tag == }] # errorCase # [{)
+      return $_parser.emitError(tagLoc)
+        << "invalid debug info }] # summary # [{ name: " << name;
+    return tag;
   }() }];
   let printer = "$_printer << llvm::dwarf::" # printName # "String($_self)";
   let defaultValue = default;
 }
 
 def LLVM_DICallingConventionParameter : LLVM_DIParameter<
-  "calling convention", /*default=*/"0", "CallingConvention", "Convention"
+  "calling convention", /*default=*/"0", "CallingConvention", /*errorCase=*/"0",
+  "Convention"
 >;
 
 def LLVM_DIEncodingParameter : LLVM_DIParameter<
-  "encoding", /*default=*/"0", "AttributeEncoding"
+  "encoding", /*default=*/"0", "AttributeEncoding", /*errorCase=*/"0"
 >;
 
 def LLVM_DILanguageParameter : LLVM_DIParameter<
-  "language", /*default=*/"", "Language"
+  "language", /*default=*/"", "Language", /*errorCase=*/"0"
 >;
 
 def LLVM_DITagParameter : LLVM_DIParameter<
-  "tag", /*default=*/"", "Tag"
+  "tag", /*default=*/"", "Tag", /*errorCase=*/"llvm::dwarf::DW_TAG_invalid"
 >;
 
 def LLVM_DIOperationEncodingParameter : LLVM_DIParameter<
-  "operation encoding", /*default=*/"", "OperationEncoding"
+  "operation encoding", /*default=*/"", "OperationEncoding", /*errorCase=*/"0"
 >;
 
 //===----------------------------------------------------------------------===//
@@ -357,9 +360,11 @@ def LLVM_DICompileUnitAttr : LLVM_Attr<"DICompileUnit", "di_compile_unit",
 //===----------------------------------------------------------------------===//
 
 def LLVM_DICompositeTypeAttr : LLVM_Attr<"DICompositeType", "di_composite_type",
-                                         /*traits=*/[], "DITypeAttr"> {
+                                         [LLVM_DIRecursiveTypeAttrInterface],
+                                         "DITypeAttr"> {
   let parameters = (ins
     LLVM_DITagParameter:$tag,
+    OptionalParameter<"DistinctAttr">:$recId,
     OptionalParameter<"StringAttr">:$name,
     OptionalParameter<"DIFileAttr">:$file,
     OptionalParameter<"uint32_t">:$line,
@@ -371,6 +376,21 @@ def LLVM_DICompositeTypeAttr : LLVM_Attr<"DICompositeType", "di_composite_type",
     OptionalArrayRefParameter<"DINodeAttr">:$elements
   );
   let assemblyFormat = "`<` struct(params) `>`";
+  let extraClassDeclaration = [{
+    /// Requirements of DIRecursiveTypeAttrInterface.
+    /// @{
+
+    /// Get whether this attr describes a recursive self reference.
+    bool isRecSelf() { return getTag() == 0; }
+
+    /// Get a copy of this type attr but with the recursive ID set to `recId`.
+    DIRecursiveTypeAttrInterface withRecId(DistinctAttr recId);
+
+    /// Build a rec-self instance using the provided `recId`.
+    static DIRecursiveTypeAttrInterface getRecSelf(DistinctAttr recId);
+
+    /// @}
+  }];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h
index c370bfa2b733d..091a817caf3d6 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h
@@ -84,6 +84,8 @@ using linkage::Linkage;
 } // namespace LLVM
 } // namespace mlir
 
+#include "mlir/Dialect/LLVMIR/LLVMAttrInterfaces.h.inc"
+
 #define GET_ATTRDEF_CLASSES
 #include "mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.h.inc"
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
index 3b2a132a881e4..e7a1da8ee560e 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines op and type interfaces for the LLVM dialect in MLIR.
+// This file defines interfaces for the LLVM dialect in MLIR.
 //
 //===----------------------------------------------------------------------===//
 
@@ -319,4 +319,58 @@ def LLVM_PointerElementTypeInterface
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// LLVM dialect attr interfaces.
+//===----------------------------------------------------------------------===//
+
+def LLVM_DIRecursiveTypeAttrInterface
+  : AttrInterface<"DIRecursiveTypeAttrInterface"> {
+  let description = [{
+    This attribute represents a DITypeAttr that is recursive. Only DITypeAttrs
+    that translate to LLVM DITypes that support mutation should implement this
+    interface.
+
+    There are two modes for conforming attributes:
+
+    1. "rec-decl":
+      - This attr is a recursive declaration identified by a recId.
+
+    2. "rec-self":
+      - This attr is considered a recursive self reference.
+      - This attr itself is a placeholder type that should be conceptually
+        replaced with the closest parent attr of the same type with the same
+        recId.
+
+    For example, to represent a linked list struct:
+
+      #rec_self = di_composite_type<recId = 0>
+      #ptr = di_derived_type<baseType: #rec_self, ...>
+      #field = di_derived_type<name = "next", baseType: #ptr, ...>
+      #rec = di_composite_type<recId = 0, name = "Node", elements: #field, ...>
+      #var = di_local_variable<type = #rec, ...>
+
+    Note that a rec-self without an outer rec-decl with the same recId is
+    conceptually the same as an "unbound" variable. The context needs to provide
+    meaning to the rec-self.
+  }];
+  let cppNamespace = "::mlir::LLVM";
+  let methods = [
+    InterfaceMethod<[{
+      Get whether this attr describes a recursive self reference.
+    }], "bool", "isRecSelf", (ins)>,
+    InterfaceMethod<[{
+      Get the recursive ID used for matching "rec-decl" with "rec-self".
+      If this attr instance is not recursive, return a null attribute.
+    }], "DistinctAttr", "getRecId", (ins)>,
+    InterfaceMethod<[{
+      Get a copy of this type attr but with the recursive ID set to `recId`.
+    }], "DIRecursiveTypeAttrInterface", "withRecId",
+    (ins "DistinctAttr":$recId)>,
+    StaticInterfaceMethod<[{
+      Build a rec-self instance using the provided `recId`.
+    }], "DIRecursiveTypeAttrInterface", "getRecSelf",
+    (ins "DistinctAttr":$recId)>
+  ];
+}
+
 #endif // LLVMIR_INTERFACES
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 8ec8e16f75c94..728e92c9dc8dc 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -409,6 +409,33 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
   let assemblyFormat = "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? attr-dict";
 }
 
+def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive"> 
+{
+  let arguments = (ins Optional<I32>:$barrierId, I32:$numberOfThreads);
+
+  let description = [{
+    Thread that executes this op announces their arrival at the barrier with 
+    given id and continue their execution.
+
+    The default barrier id is 0 that is similar to `nvvm.barrier` Op. When 
+    `barrierId` is not present, the default barrier id is used. 
+
+    [For more information, see PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
+  }];
+  
+  let assemblyFormat = "(`id` `=` $barrierId^)? `number_of_threads` `=` $numberOfThreads attr-dict";
+
+  let extraClassDefinition = [{
+    std::string $cppClass::getPtx() {
+      std::string ptx = "bar.arrive ";
+      if (getBarrierId()) { ptx += "%0, %1'"; } 
+      else { ptx += "0, %0;"; }
+      return ptx;
+    }
+  }];
+}
+
 def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> {
   let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index abb38a3df8068..1dabf5d7979b7 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -162,10 +162,18 @@ def ROCDL_BallotOp :
   ROCDL_Op<"ballot">,
   Results<(outs LLVM_Type:$res)>,
   Arguments<(ins I1:$pred)> {
+  let summary = "Vote across thread group";
+
+  let description = [{
+      Ballot provides a bit mask containing the 1-bit predicate value from each lane. 
+      The nth bit of the result contains the 1 bit contributed by the nth warp lane.
+  }];
+
   string llvmBuilder = [{
       $res = createIntrinsicCall(builder,
-            llvm::Intrinsic::amdgcn_ballot, {$pred}, {llvm::Type::getInt32Ty(moduleTranslation.getLLVMContext())});
+            llvm::Intrinsic::amdgcn_ballot, {$pred}, {$_resultType});
   }];
+
   let assemblyFormat = "$pred attr-dict `:` type($res)";
 }
 
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index c64ecb79c5ca5..feb3b3f03cf53 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -460,7 +460,8 @@ LogicalResult promoteSubviewsPrecondition(Operation *op,
 LogicalResult vectorizeOpPrecondition(Operation *op,
                                       ArrayRef<int64_t> inputVectorSizes = {},
                                       ArrayRef<bool> inputScalableVecDims = {},
-                                      bool vectorizeNDExtract = false);
+                                      bool vectorizeNDExtract = false,
+                                      bool flatten1DDepthwiseConv = false);
 
 //===----------------------------------------------------------------------===//
 // Transformations exposed as functional-style API calls.
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 0bd402d626dc4..dcf04ab242257 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -575,7 +575,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
     The optional `order` attribute specifies which order the iterations of the
     associate loops are executed in. Currently the only option for this
     attribute is "concurrent".
-    
+
     The optional `byref` attribute indicates that reduction arguments should be
     passed by reference.
   }];
@@ -1347,10 +1347,10 @@ def MapInfoOp : OpenMP_Op<"map_info", [AttrSizedOperandSegments]> {
     - `var_type`: The type of the variable to copy.
     - `var_ptr_ptr`: Used when the variable copied is a member of a class, structure
       or derived type and refers to the originating struct.
-    - `members`:  Used to indicate mapped child members for the current MapInfoOp, 
-       represented as other MapInfoOp's, utilised in cases where a parent structure 
-       type and members of the structure type are being mapped at the same time. 
-       For example: map(to: parent, parent->member, parent->member2[:10])  
+    - `members`:  Used to indicate mapped child members for the current MapInfoOp,
+       represented as other MapInfoOp's, utilised in cases where a parent structure
+       type and members of the structure type are being mapped at the same time.
+       For example: map(to: parent, parent->member, parent->member2[:10])
     - `bounds`: Used when copying slices of array's, pointers or pointer members of
        objects (e.g. derived types or classes), indicates the bounds to be copied
        of the variable. When it's an array slice it is in rank order where rank 0
@@ -1391,7 +1391,7 @@ def MapInfoOp : OpenMP_Op<"map_info", [AttrSizedOperandSegments]> {
 // 2.14.2 target data Construct
 //===---------------------------------------------------------------------===//
 
-def Target_DataOp: OpenMP_Op<"target_data", [AttrSizedOperandSegments, 
+def Target_DataOp: OpenMP_Op<"target_data", [AttrSizedOperandSegments,
                                              MapClauseOwningOpInterface]>{
   let summary = "target data construct";
   let description = [{
@@ -1449,7 +1449,7 @@ def Target_DataOp: OpenMP_Op<"target_data", [AttrSizedOperandSegments,
 //===---------------------------------------------------------------------===//
 
 def Target_EnterDataOp: OpenMP_Op<"target_enter_data",
-                                                 [AttrSizedOperandSegments, 
+                                                 [AttrSizedOperandSegments,
                                                   MapClauseOwningOpInterface]>{
   let  summary = "target enter data construct";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
index ed086d36424c1..2e37384ce3eb7 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
@@ -34,8 +34,8 @@ def OutlineableOpenMPOpInterface : OpInterface<"OutlineableOpenMPOpInterface"> {
 def MapClauseOwningOpInterface : OpInterface<"MapClauseOwningOpInterface"> {
   let description = [{
     OpenMP operations which own a list of omp::MapInfoOp's implement this interface
-    to allow generic access to deal with map operands to more easily manipulate 
-    this class of operations. 
+    to allow generic access to deal with map operands to more easily manipulate
+    this class of operations.
   }];
 
   let cppNamespace = "::mlir::omp";
@@ -45,7 +45,7 @@ def MapClauseOwningOpInterface : OpInterface<"MapClauseOwningOpInterface"> {
       (ins), [{
         return $_op.getMapOperands();
       }]>,
-      InterfaceMethod<"Get mutable map operands", "::mlir::MutableOperandRange", 
+      InterfaceMethod<"Get mutable map operands", "::mlir::MutableOperandRange",
                       "getMapOperandsMutable",
       (ins), [{
         return $_op.getMapOperandsMutable();
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td
index e48a56f0625d3..3ee239d6e1e3e 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td
@@ -800,6 +800,8 @@ def SPIRV_SelectOp : SPIRV_Op<"Select",
   // These ops require dynamic availability specification based on operand and
   // result types.
   bit autogenAvailability = 0;
+
+  let hasFolder = 1;
 }
 
 // -----
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index 0498576fcffc5..29cf8c32447ec 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -1419,7 +1419,7 @@ def SparseTensor_ForeachOp : SparseTensor_Op<"foreach",
 }
 
 //===----------------------------------------------------------------------===//
-// Sparse Tensor Debugging Operations.
+// Sparse Tensor Debugging and Test-Only Operations.
 //===----------------------------------------------------------------------===//
 
 def SparseTensor_PrintOp : SparseTensor_Op<"print">,
@@ -1440,4 +1440,25 @@ def SparseTensor_PrintOp : SparseTensor_Op<"print">,
   let assemblyFormat = "$tensor attr-dict `:` type($tensor)";
 }
 
+def SparseTensor_HasRuntimeLibraryOp
+    : SparseTensor_Op<"has_runtime_library", []>, Results<(outs I1:$result)> {
+  string summary = "Indicates whether running in runtime/codegen mode";
+  string description = [{
+    Returns a boolean value that indicates whether the sparsifier runs in
+    runtime library mode or not. For testing only! This operation is useful
+    for writing test cases that require different code depending on
+    runtime/codegen mode.
+
+    Example:
+
+    ```mlir
+    %has_runtime = sparse_tensor.has_runtime_library
+    scf.if %has_runtime {
+      ...
+    }
+    ```
+  }];
+  let assemblyFormat = "attr-dict";
+}
+
 #endif // SPARSETENSOR_OPS
diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index f6b03a0f2c800..3ce16ef361f37 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -9,12 +9,15 @@
 #ifndef MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
 #define MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
 
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Support/LLVM.h"
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 namespace mlir {
 
@@ -98,6 +101,17 @@ bool isContiguousSlice(MemRefType memrefType, VectorType vectorType);
 std::optional<StaticTileOffsetRange>
 createUnrollIterator(VectorType vType, int64_t targetRank = 1);
 
+/// A wrapper for getMixedSizes for vector.transfer_read and
+/// vector.transfer_write Ops (for source and destination, respectively).
+///
+/// Tensor and MemRef types implement their own, very similar version of
+/// getMixedSizes. This method will call the appropriate version (depending on
+/// `hasTensorSemantics`). It will also automatically extract the operand for
+/// which to call it on (source for "read" and destination for "write" ops).
+SmallVector<OpFoldResult> getMixedSizesXfer(bool hasTensorSemantics,
+                                            Operation *xfer,
+                                            RewriterBase &rewriter);
+
 } // namespace vector
 
 /// Constructs a permutation map of invariant memref indices to vector
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
index f927b82628b1a..714e664dd0f4e 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
@@ -206,7 +206,7 @@ class SparseTensorReader final {
     auto *lvlCOO = readCOO<V>(map, lvlSizes);
     auto *tensor = SparseTensorStorage<P, I, V>::newFromCOO(
         dimRank, getDimSizes(), lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,
-        *lvlCOO);
+        lvlCOO);
     delete lvlCOO;
     return tensor;
   }
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
index 468782ebef4e3..773957a8b5116 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
@@ -197,37 +197,22 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
                       const uint64_t *lvl2dim)
       : SparseTensorStorageBase(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
                                 dim2lvl, lvl2dim),
-        positions(lvlRank), coordinates(lvlRank), lvlCursor(lvlRank), coo() {}
+        positions(lvlRank), coordinates(lvlRank), lvlCursor(lvlRank) {}
 
 public:
   /// Constructs a sparse tensor with the given encoding, and allocates
-  /// overhead storage according to some simple heuristics. When the
-  /// `bool` argument is true and `lvlTypes` are all dense, then this
-  /// ctor will also initialize the values array with zeros. That
-  /// argument should be true when an empty tensor is intended; whereas
-  /// it should usually be false when the ctor will be followed up by
-  /// some other form of initialization.
+  /// overhead storage according to some simple heuristics. When lvlCOO
+  /// is set, the sparse tensor initializes with the contents from that
+  /// data structure. Otherwise, an empty sparse tensor results.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const uint64_t *lvlSizes,
                       const LevelType *lvlTypes, const uint64_t *dim2lvl,
-                      const uint64_t *lvl2dim, SparseTensorCOO<V> *lvlCOO,
-                      bool initializeValuesIfAllDense);
+                      const uint64_t *lvl2dim, SparseTensorCOO<V> *lvlCOO);
 
   /// Constructs a sparse tensor with the given encoding, and initializes
-  /// the contents from the COO. This ctor performs the same heuristic
-  /// overhead-storage allocation as the ctor above.
-  SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
-                      uint64_t lvlRank, const uint64_t *lvlSizes,
-                      const LevelType *lvlTypes, const uint64_t *dim2lvl,
-                      const uint64_t *lvl2dim, SparseTensorCOO<V> &lvlCOO);
-
-  /// Constructs a sparse tensor with the given encoding, and initializes
-  /// the contents from the level buffers. This ctor allocates exactly
-  /// the required amount of overhead storage, not using any heuristics.
-  /// It assumes that the data provided by `lvlBufs` can be directly used to
-  /// interpret the result sparse tensor and performs *NO* integrity test on the
-  /// input data. It also assume that the trailing COO coordinate buffer is
-  /// passed in as a single AoS memory.
+  /// the contents from the level buffers. The constructor assumes that the
+  /// data provided by `lvlBufs` can be directly used to interpret the result
+  /// sparse tensor and performs no integrity test on the input data.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const uint64_t *lvlSizes,
                       const LevelType *lvlTypes, const uint64_t *dim2lvl,
@@ -244,16 +229,14 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   newFromCOO(uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
              const uint64_t *lvlSizes, const LevelType *lvlTypes,
              const uint64_t *dim2lvl, const uint64_t *lvl2dim,
-             SparseTensorCOO<V> &lvlCOO);
+             SparseTensorCOO<V> *lvlCOO);
 
-  /// Allocates a new sparse tensor and initialize it with the data stored level
-  /// buffers directly.
+  /// Allocates a new sparse tensor and initialize it from the given buffers.
   static SparseTensorStorage<P, C, V> *
-  packFromLvlBuffers(uint64_t dimRank, const uint64_t *dimSizes,
-                     uint64_t lvlRank, const uint64_t *lvlSizes,
-                     const LevelType *lvlTypes, const uint64_t *dim2lvl,
-                     const uint64_t *lvl2dim, uint64_t srcRank,
-                     const intptr_t *buffers);
+  newFromBuffers(uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
+                 const uint64_t *lvlSizes, const LevelType *lvlTypes,
+                 const uint64_t *dim2lvl, const uint64_t *lvl2dim,
+                 uint64_t srcRank, const intptr_t *buffers);
 
   ~SparseTensorStorage() final = default;
 
@@ -337,16 +320,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     }
   }
 
-  /// Allocates a new COO object and initializes it with the contents.
-  /// Callers must make sure to delete the COO when they're done with it.
-  SparseTensorCOO<V> *toCOO() {
-    std::vector<uint64_t> dimCoords(getDimRank());
-    coo = new SparseTensorCOO<V>(getDimSizes(), values.size());
-    toCOO(0, 0, dimCoords);
-    assert(coo->getElements().size() == values.size());
-    return coo;
-  }
-
   /// Sort the unordered tensor in place, the method assumes that it is
   /// an unordered COO tensor.
   void sortInPlace() {
@@ -556,58 +529,10 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     return -1u;
   }
 
-  // Performs forall on level entries and inserts into dim COO.
-  void toCOO(uint64_t parentPos, uint64_t l, std::vector<uint64_t> &dimCoords) {
-    if (l == getLvlRank()) {
-      map.pushbackward(lvlCursor.data(), dimCoords.data());
-      assert(coo);
-      assert(parentPos < values.size());
-      coo->add(dimCoords, values[parentPos]);
-      return;
-    }
-    if (isCompressedLvl(l)) {
-      const std::vector<P> &positionsL = positions[l];
-      assert(parentPos + 1 < positionsL.size());
-      const uint64_t pstart = static_cast<uint64_t>(positionsL[parentPos]);
-      const uint64_t pstop = static_cast<uint64_t>(positionsL[parentPos + 1]);
-      const std::vector<C> &coordinatesL = coordinates[l];
-      assert(pstop <= coordinatesL.size());
-      for (uint64_t pos = pstart; pos < pstop; pos++) {
-        lvlCursor[l] = static_cast<uint64_t>(coordinatesL[pos]);
-        toCOO(pos, l + 1, dimCoords);
-      }
-    } else if (isLooseCompressedLvl(l)) {
-      const std::vector<P> &positionsL = positions[l];
-      assert(2 * parentPos + 1 < positionsL.size());
-      const uint64_t pstart = static_cast<uint64_t>(positionsL[2 * parentPos]);
-      const uint64_t pstop =
-          static_cast<uint64_t>(positionsL[2 * parentPos + 1]);
-      const std::vector<C> &coordinatesL = coordinates[l];
-      assert(pstop <= coordinatesL.size());
-      for (uint64_t pos = pstart; pos < pstop; pos++) {
-        lvlCursor[l] = static_cast<uint64_t>(coordinatesL[pos]);
-        toCOO(pos, l + 1, dimCoords);
-      }
-    } else if (isSingletonLvl(l) || isNOutOfMLvl(l)) {
-      assert(parentPos < coordinates[l].size());
-      lvlCursor[l] = static_cast<uint64_t>(coordinates[l][parentPos]);
-      toCOO(parentPos, l + 1, dimCoords);
-    } else { // Dense level.
-      assert(isDenseLvl(l));
-      const uint64_t sz = getLvlSizes()[l];
-      const uint64_t pstart = parentPos * sz;
-      for (uint64_t c = 0; c < sz; c++) {
-        lvlCursor[l] = c;
-        toCOO(pstart + c, l + 1, dimCoords);
-      }
-    }
-  }
-
   std::vector<std::vector<P>> positions;
   std::vector<std::vector<C>> coordinates;
   std::vector<V> values;
   std::vector<uint64_t> lvlCursor;
-  SparseTensorCOO<V> *coo;
 };
 
 //===----------------------------------------------------------------------===//
@@ -621,9 +546,9 @@ SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newEmpty(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const LevelType *lvlTypes,
     const uint64_t *dim2lvl, const uint64_t *lvl2dim) {
+  SparseTensorCOO<V> *noLvlCOO = nullptr;
   return new SparseTensorStorage<P, C, V>(dimRank, dimSizes, lvlRank, lvlSizes,
-                                          lvlTypes, dim2lvl, lvl2dim, nullptr,
-                                          true);
+                                          lvlTypes, dim2lvl, lvl2dim, noLvlCOO);
 }
 
 template <typename P, typename C, typename V>
@@ -631,13 +556,14 @@ SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newFromCOO(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const LevelType *lvlTypes,
     const uint64_t *dim2lvl, const uint64_t *lvl2dim,
-    SparseTensorCOO<V> &lvlCOO) {
+    SparseTensorCOO<V> *lvlCOO) {
+  assert(lvlCOO);
   return new SparseTensorStorage<P, C, V>(dimRank, dimSizes, lvlRank, lvlSizes,
                                           lvlTypes, dim2lvl, lvl2dim, lvlCOO);
 }
 
 template <typename P, typename C, typename V>
-SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::packFromLvlBuffers(
+SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newFromBuffers(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const LevelType *lvlTypes,
     const uint64_t *dim2lvl, const uint64_t *lvl2dim, uint64_t srcRank,
@@ -657,11 +583,9 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const LevelType *lvlTypes,
     const uint64_t *dim2lvl, const uint64_t *lvl2dim,
-    SparseTensorCOO<V> *lvlCOO, bool initializeValuesIfAllDense)
+    SparseTensorCOO<V> *lvlCOO)
     : SparseTensorStorage(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
                           dim2lvl, lvl2dim) {
-  assert(!lvlCOO || lvlRank == lvlCOO->getRank());
-  coo = lvlCOO;
   // Provide hints on capacity of positions and coordinates.
   // TODO: needs much fine-tuning based on actual sparsity; currently
   // we reserve position/coordinate space based on all previous dense
@@ -692,27 +616,20 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
       sz = detail::checkedMul(sz, lvlSizes[l]);
     }
   }
-  if (allDense && initializeValuesIfAllDense)
+  if (lvlCOO) {
+    /* New from COO: ensure it is sorted. */
+    assert(lvlCOO->getRank() == lvlRank);
+    lvlCOO->sort();
+    // Now actually insert the `elements`.
+    const auto &elements = lvlCOO->getElements();
+    const uint64_t nse = elements.size();
+    assert(values.size() == 0);
+    values.reserve(nse);
+    fromCOO(elements, 0, nse, 0);
+  } else if (allDense) {
+    /* New empty (all dense) */
     values.resize(sz, 0);
-}
-
-template <typename P, typename C, typename V>
-SparseTensorStorage<P, C, V>::SparseTensorStorage( // NOLINT
-    uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
-    const uint64_t *lvlSizes, const LevelType *lvlTypes,
-    const uint64_t *dim2lvl, const uint64_t *lvl2dim,
-    SparseTensorCOO<V> &lvlCOO)
-    : SparseTensorStorage(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
-                          dim2lvl, lvl2dim, nullptr, false) {
-  // Ensure lvlCOO is sorted.
-  assert(lvlRank == lvlCOO.getRank());
-  lvlCOO.sort();
-  // Now actually insert the `elements`.
-  const auto &elements = lvlCOO.getElements();
-  const uint64_t nse = elements.size();
-  assert(values.size() == 0);
-  values.reserve(nse);
-  fromCOO(elements, 0, nse, 0);
+  }
 }
 
 template <typename P, typename C, typename V>
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index ef53a0b82866f..070e6ed702f86 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -736,6 +736,8 @@ class RewriterBase : public OpBuilder {
       : OpBuilder(ctx, listener) {}
   explicit RewriterBase(const OpBuilder &otherBuilder)
       : OpBuilder(otherBuilder) {}
+  explicit RewriterBase(Operation *op, OpBuilder::Listener *listener = nullptr)
+      : OpBuilder(op, listener) {}
   virtual ~RewriterBase();
 
 private:
@@ -756,6 +758,8 @@ class IRRewriter : public RewriterBase {
   explicit IRRewriter(MLIRContext *ctx, OpBuilder::Listener *listener = nullptr)
       : RewriterBase(ctx, listener) {}
   explicit IRRewriter(const OpBuilder &builder) : RewriterBase(builder) {}
+  explicit IRRewriter(Operation *op, OpBuilder::Listener *listener = nullptr)
+      : RewriterBase(op, listener) {}
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Support/ToolUtilities.h b/mlir/include/mlir/Support/ToolUtilities.h
index d2c89409c0653..511cb118bb246 100644
--- a/mlir/include/mlir/Support/ToolUtilities.h
+++ b/mlir/include/mlir/Support/ToolUtilities.h
@@ -15,6 +15,8 @@
 
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+
 #include <memory>
 
 namespace llvm {
@@ -27,20 +29,24 @@ struct LogicalResult;
 using ChunkBufferHandler = function_ref<LogicalResult(
     std::unique_ptr<llvm::MemoryBuffer> chunkBuffer, raw_ostream &os)>;
 
-/// Splits the specified buffer on a marker (`// -----`), processes each chunk
-/// independently according to the normal `processChunkBuffer` logic, and writes
-/// all results to `os`.
+extern inline const char *const kDefaultSplitMarker = "// -----";
+
+/// Splits the specified buffer on a marker (`// -----` by default), processes
+/// each chunk independently according to the normal `processChunkBuffer` logic,
+/// and writes all results to `os`.
 ///
 /// This is used to allow a large number of small independent tests to be put
-/// into a single file. `enableSplitting` can be used to toggle if splitting
-/// should be enabled, e.g. to allow for merging split and non-split code paths.
-/// When `insertMarkerInOutput` is true, split markers (`//-----`) are placed
-/// between each of the processed output chunks.
+/// into a single file. The input split marker is configurable. If it is empty,
+/// merging is disabled, which allows for merging split and non-split code
+/// paths. Output split markers (`//-----` by default) followed by a new line
+/// character, respectively, are placed between each of the processed output
+/// chunks. (The new line character is inserted even if the split marker is
+/// empty.)
 LogicalResult
 splitAndProcessBuffer(std::unique_ptr<llvm::MemoryBuffer> originalBuffer,
                       ChunkBufferHandler processChunkBuffer, raw_ostream &os,
-                      bool enableSplitting = true,
-                      bool insertMarkerInOutput = false);
+                      llvm::StringRef inputSplitMarker = kDefaultSplitMarker,
+                      llvm::StringRef outputSplitMarker = "");
 } // namespace mlir
 
 #endif // MLIR_SUPPORT_TOOLUTILITIES_H
diff --git a/mlir/include/mlir/Target/Cpp/CppEmitter.h b/mlir/include/mlir/Target/Cpp/CppEmitter.h
index 30d3fff9fca88..d3c081061faa0 100644
--- a/mlir/include/mlir/Target/Cpp/CppEmitter.h
+++ b/mlir/include/mlir/Target/Cpp/CppEmitter.h
@@ -13,13 +13,11 @@
 #ifndef MLIR_TARGET_CPP_CPPEMITTER_H
 #define MLIR_TARGET_CPP_CPPEMITTER_H
 
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/Value.h"
-#include "llvm/ADT/ScopedHashTable.h"
-#include "llvm/Support/raw_ostream.h"
-#include <stack>
+#include "mlir/Support/LLVM.h"
 
 namespace mlir {
+struct LogicalResult;
+class Operation;
 namespace emitc {
 
 /// Translates the given operation to C++ code. The operation or operations in
diff --git a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
index 6e90fad1618d2..8adc80908de11 100644
--- a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
+++ b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
@@ -15,6 +15,7 @@
 
 #include "mlir/Debug/CLOptionsSetup.h"
 #include "mlir/Support/LogicalResult.h"
+#include "mlir/Support/ToolUtilities.h"
 #include "llvm/ADT/StringRef.h"
 
 #include <cstdlib>
@@ -136,13 +137,24 @@ class MlirOptMainConfig {
   }
   bool shouldShowDialects() const { return showDialectsFlag; }
 
-  /// Set whether to split the input file based on the `// -----` marker into
-  /// pieces and process each chunk independently.
-  MlirOptMainConfig &splitInputFile(bool split = true) {
-    splitInputFileFlag = split;
+  /// Set the marker on which to split the input into chunks and process each
+  /// chunk independently. Input is not split if empty.
+  MlirOptMainConfig &
+  splitInputFile(std::string splitMarker = kDefaultSplitMarker) {
+    splitInputFileFlag = std::move(splitMarker);
+    return *this;
+  }
+  bool shouldSplitInputFile() const { return splitInputFileFlag.empty(); }
+  StringRef inputSplitMarker() const { return splitInputFileFlag; }
+
+  /// Set whether to merge the output chunks into one file using the given
+  /// marker.
+  MlirOptMainConfig &
+  outputSplitMarker(std::string splitMarker = kDefaultSplitMarker) {
+    outputSplitMarkerFlag = std::move(splitMarker);
     return *this;
   }
-  bool shouldSplitInputFile() const { return splitInputFileFlag; }
+  StringRef outputSplitMarker() const { return outputSplitMarkerFlag; }
 
   /// Disable implicit addition of a top-level module op during parsing.
   MlirOptMainConfig &useExplicitModule(bool useExplicitModule) {
@@ -215,9 +227,12 @@ class MlirOptMainConfig {
   /// Show the registered dialects before trying to load the input file.
   bool showDialectsFlag = false;
 
-  /// Split the input file based on the `// -----` marker into pieces and
-  /// process each chunk independently.
-  bool splitInputFileFlag = false;
+  /// Split the input file based on the given marker into chunks and process
+  /// each chunk independently. Input is not split if empty.
+  std::string splitInputFileFlag = "";
+
+  /// Merge output chunks into one file using the given marker.
+  std::string outputSplitMarkerFlag = "";
 
   /// Use an explicit top-level module op during parsing.
   bool useExplicitModuleFlag = false;
diff --git a/mlir/include/mlir/Transforms/Mem2Reg.h b/mlir/include/mlir/Transforms/Mem2Reg.h
index 89244feb21754..d145f7ed43758 100644
--- a/mlir/include/mlir/Transforms/Mem2Reg.h
+++ b/mlir/include/mlir/Transforms/Mem2Reg.h
@@ -9,8 +9,6 @@
 #ifndef MLIR_TRANSFORMS_MEM2REG_H
 #define MLIR_TRANSFORMS_MEM2REG_H
 
-#include "mlir/IR/Dominance.h"
-#include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
 #include "llvm/ADT/Statistic.h"
@@ -25,24 +23,6 @@ struct Mem2RegStatistics {
   llvm::Statistic *newBlockArgumentAmount = nullptr;
 };
 
-/// Pattern applying mem2reg to the regions of the operations on which it
-/// matches.
-class Mem2RegPattern
-    : public OpInterfaceRewritePattern<PromotableAllocationOpInterface> {
-public:
-  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
-
-  Mem2RegPattern(MLIRContext *context, Mem2RegStatistics statistics = {},
-                 PatternBenefit benefit = 1)
-      : OpInterfaceRewritePattern(context, benefit), statistics(statistics) {}
-
-  LogicalResult matchAndRewrite(PromotableAllocationOpInterface allocator,
-                                PatternRewriter &rewriter) const override;
-
-private:
-  Mem2RegStatistics statistics;
-};
-
 /// Attempts to promote the memory slots of the provided allocators. Succeeds if
 /// at least one memory slot was promoted.
 LogicalResult
diff --git a/mlir/include/mlir/Transforms/SROA.h b/mlir/include/mlir/Transforms/SROA.h
index 0b3e72400c287..1af1fe930723f 100644
--- a/mlir/include/mlir/Transforms/SROA.h
+++ b/mlir/include/mlir/Transforms/SROA.h
@@ -9,11 +9,9 @@
 #ifndef MLIR_TRANSFORMS_SROA_H
 #define MLIR_TRANSFORMS_SROA_H
 
-#include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
 #include "mlir/Support/LogicalResult.h"
 #include "llvm/ADT/Statistic.h"
-#include <variant>
 
 namespace mlir {
 
@@ -29,24 +27,6 @@ struct SROAStatistics {
   llvm::Statistic *maxSubelementAmount = nullptr;
 };
 
-/// Pattern applying SROA to the regions of the operations on which it
-/// matches.
-class SROAPattern
-    : public OpInterfaceRewritePattern<DestructurableAllocationOpInterface> {
-public:
-  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
-
-  SROAPattern(MLIRContext *context, SROAStatistics statistics = {},
-              PatternBenefit benefit = 1)
-      : OpInterfaceRewritePattern(context, benefit), statistics(statistics) {}
-
-  LogicalResult matchAndRewrite(DestructurableAllocationOpInterface allocator,
-                                PatternRewriter &rewriter) const override;
-
-private:
-  SROAStatistics statistics;
-};
-
 /// Attempts to destructure the slots of destructurable allocators. Returns
 /// failure if no slot was destructured.
 LogicalResult tryToDestructureMemorySlots(
diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp
index 2d938ce5f4834..d0fd5ceecfff5 100644
--- a/mlir/lib/CAPI/Dialect/LLVM.cpp
+++ b/mlir/lib/CAPI/Dialect/LLVM.cpp
@@ -152,18 +152,18 @@ MlirAttribute mlirLLVMDIBasicTypeAttrGet(MlirContext ctx, unsigned int tag,
 }
 
 MlirAttribute mlirLLVMDICompositeTypeAttrGet(
-    MlirContext ctx, unsigned int tag, MlirAttribute name, MlirAttribute file,
-    uint32_t line, MlirAttribute scope, MlirAttribute baseType, int64_t flags,
-    uint64_t sizeInBits, uint64_t alignInBits, intptr_t nElements,
-    MlirAttribute const *elements) {
+    MlirContext ctx, unsigned int tag, MlirAttribute recId, MlirAttribute name,
+    MlirAttribute file, uint32_t line, MlirAttribute scope,
+    MlirAttribute baseType, int64_t flags, uint64_t sizeInBits,
+    uint64_t alignInBits, intptr_t nElements, MlirAttribute const *elements) {
   SmallVector<Attribute> elementsStorage;
   elementsStorage.reserve(nElements);
 
   return wrap(DICompositeTypeAttr::get(
-      unwrap(ctx), tag, cast<StringAttr>(unwrap(name)),
-      cast<DIFileAttr>(unwrap(file)), line, cast<DIScopeAttr>(unwrap(scope)),
-      cast<DITypeAttr>(unwrap(baseType)), DIFlags(flags), sizeInBits,
-      alignInBits,
+      unwrap(ctx), tag, cast<DistinctAttr>(unwrap(recId)),
+      cast<StringAttr>(unwrap(name)), cast<DIFileAttr>(unwrap(file)), line,
+      cast<DIScopeAttr>(unwrap(scope)), cast<DITypeAttr>(unwrap(baseType)),
+      DIFlags(flags), sizeInBits, alignInBits,
       llvm::map_to_vector(unwrapList(nElements, elements, elementsStorage),
                           [](Attribute a) { return a.cast<DINodeAttr>(); })));
 }
diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
index 12a5e20268588..76729278ec1b4 100644
--- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
+++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
@@ -845,6 +845,7 @@ struct SqrtOpConversion : public OpConversionPattern<complex::SqrtOp> {
     auto type = cast<ComplexType>(op.getType());
     Type elementType = type.getElementType();
     Value arg = adaptor.getComplex();
+    arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr();
 
     Value zero =
         b.create<arith::ConstantOp>(elementType, b.getZeroAttr(elementType));
@@ -852,14 +853,14 @@ struct SqrtOpConversion : public OpConversionPattern<complex::SqrtOp> {
     Value real = b.create<complex::ReOp>(elementType, adaptor.getComplex());
     Value imag = b.create<complex::ImOp>(elementType, adaptor.getComplex());
 
-    Value absLhs = b.create<math::AbsFOp>(real);
-    Value absArg = b.create<complex::AbsOp>(elementType, arg);
-    Value addAbs = b.create<arith::AddFOp>(absLhs, absArg);
+    Value absLhs = b.create<math::AbsFOp>(real, fmf);
+    Value absArg = b.create<complex::AbsOp>(elementType, arg, fmf);
+    Value addAbs = b.create<arith::AddFOp>(absLhs, absArg, fmf);
 
     Value half = b.create<arith::ConstantOp>(elementType,
                                              b.getFloatAttr(elementType, 0.5));
-    Value halfAddAbs = b.create<arith::MulFOp>(addAbs, half);
-    Value sqrtAddAbs = b.create<math::SqrtOp>(halfAddAbs);
+    Value halfAddAbs = b.create<arith::MulFOp>(addAbs, half, fmf);
+    Value sqrtAddAbs = b.create<math::SqrtOp>(halfAddAbs, fmf);
 
     Value realIsNegative =
         b.create<arith::CmpFOp>(arith::CmpFPredicate::OLT, real, zero);
@@ -869,7 +870,7 @@ struct SqrtOpConversion : public OpConversionPattern<complex::SqrtOp> {
     Value resultReal = sqrtAddAbs;
 
     Value imagDivTwoResultReal = b.create<arith::DivFOp>(
-        imag, b.create<arith::AddFOp>(resultReal, resultReal));
+        imag, b.create<arith::AddFOp>(resultReal, resultReal, fmf), fmf);
 
     Value negativeResultReal = b.create<arith::NegFOp>(resultReal);
 
@@ -882,7 +883,7 @@ struct SqrtOpConversion : public OpConversionPattern<complex::SqrtOp> {
     resultReal = b.create<arith::SelectOp>(
         realIsNegative,
         b.create<arith::DivFOp>(
-            imag, b.create<arith::AddFOp>(resultImag, resultImag)),
+            imag, b.create<arith::AddFOp>(resultImag, resultImag, fmf), fmf),
         resultReal);
 
     Value realIsZero =
diff --git a/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt b/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
index 1400618c93e85..f0d95f5ada290 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
@@ -14,6 +14,7 @@ add_mlir_conversion_library(MLIRMemRefToLLVM
   LINK_LIBS PUBLIC
   MLIRAnalysis
   MLIRDataLayoutInterfaces
+  MLIRFuncDialect
   MLIRLLVMCommonConversion
   MLIRMemRefDialect
   MLIRMemRefUtils
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index fc0515ba95f4f..1c28d6b00b3c8 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -145,45 +145,36 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
   return *gcd;
 }
 
-/// Given an induction variable `iv` of type AffineForOp and an access `index`
-/// of type index, returns `true` if `index` is independent of `iv` and
-/// false otherwise. The determination supports composition with at most one
-/// AffineApplyOp. The 'at most one AffineApplyOp' comes from the fact that
-/// the composition of AffineApplyOp needs to be canonicalized by construction
-/// to avoid writing code that composes arbitrary numbers of AffineApplyOps
-/// everywhere. To achieve this, at the very least, the compose-affine-apply
-/// pass must have been run.
+/// Given an affine.for `iv` and an access `index` of type index, returns `true`
+/// if `index` is independent of `iv` and false otherwise.
 ///
-/// Prerequisites:
-///   1. `iv` and `index` of the proper type;
-///   2. at most one reachable AffineApplyOp from index;
-///
-/// Returns false in cases with more than one AffineApplyOp, this is
-/// conservative.
+/// Prerequisites: `iv` and `index` of the proper type;
 static bool isAccessIndexInvariant(Value iv, Value index) {
-  assert(isAffineForInductionVar(iv) && "iv must be a AffineForOp");
-  assert(isa<IndexType>(index.getType()) && "index must be of IndexType");
-  SmallVector<Operation *, 4> affineApplyOps;
-  getReachableAffineApplyOps({index}, affineApplyOps);
-
-  if (affineApplyOps.empty()) {
-    // Pointer equality test because of Value pointer semantics.
-    return index != iv;
-  }
-
-  if (affineApplyOps.size() > 1) {
-    affineApplyOps[0]->emitRemark(
-        "CompositionAffineMapsPass must have been run: there should be at most "
-        "one AffineApplyOp, returning false conservatively.");
-    return false;
-  }
+  assert(isAffineForInductionVar(iv) && "iv must be an affine.for iv");
+  assert(isa<IndexType>(index.getType()) && "index must be of 'index' type");
+  auto map = AffineMap::getMultiDimIdentityMap(/*numDims=*/1, iv.getContext());
+  SmallVector<Value> operands = {index};
+  AffineValueMap avm(map, operands);
+  avm.composeSimplifyAndCanonicalize();
+  return !avm.isFunctionOf(0, iv);
+}
 
-  auto composeOp = cast<AffineApplyOp>(affineApplyOps[0]);
-  // We need yet another level of indirection because the `dim` index of the
-  // access may not correspond to the `dim` index of composeOp.
-  return !composeOp.getAffineValueMap().isFunctionOf(0, iv);
+// Pre-requisite: Loop bounds should be in canonical form.
+template <typename LoadOrStoreOp>
+bool mlir::affine::isInvariantAccess(LoadOrStoreOp memOp, AffineForOp forOp) {
+  AffineValueMap avm(memOp.getAffineMap(), memOp.getMapOperands());
+  avm.composeSimplifyAndCanonicalize();
+  return !llvm::is_contained(avm.getOperands(), forOp.getInductionVar());
 }
 
+// Explicitly instantiate the template so that the compiler knows we need them.
+template bool mlir::affine::isInvariantAccess(AffineReadOpInterface,
+                                              AffineForOp);
+template bool mlir::affine::isInvariantAccess(AffineWriteOpInterface,
+                                              AffineForOp);
+template bool mlir::affine::isInvariantAccess(AffineLoadOp, AffineForOp);
+template bool mlir::affine::isInvariantAccess(AffineStoreOp, AffineForOp);
+
 DenseSet<Value> mlir::affine::getInvariantAccesses(Value iv,
                                                    ArrayRef<Value> indices) {
   DenseSet<Value> res;
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 8a070d2563987..c591e5056480c 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -17,6 +17,7 @@
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/ShapedOpInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/ScopeExit.h"
@@ -220,6 +221,8 @@ void AffineDialect::initialize() {
 #include "mlir/Dialect/Affine/IR/AffineOps.cpp.inc"
                 >();
   addInterfaces<AffineInlinerInterface>();
+  declarePromisedInterfaces<ValueBoundsOpInterface, AffineApplyOp, AffineMaxOp,
+                            AffineMinOp>();
 }
 
 /// Materialize a single constant operation from a given attribute value with
diff --git a/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp b/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
index 2800237fd05ac..6a52849186872 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp
@@ -24,6 +24,15 @@ void AffineValueMap::reset(AffineMap map, ValueRange operands,
   this->results.assign(results.begin(), results.end());
 }
 
+void AffineValueMap::composeSimplifyAndCanonicalize() {
+  AffineMap sMap = getAffineMap();
+  fullyComposeAffineMapAndOperands(&sMap, &operands);
+  // Full composition also canonicalizes and simplifies before returning. We
+  // need to canonicalize once more to drop unused operands.
+  canonicalizeMapAndOperands(&sMap, &operands);
+  this->map.reset(sMap);
+}
+
 void AffineValueMap::difference(const AffineValueMap &a,
                                 const AffineValueMap &b, AffineValueMap *res) {
   assert(a.getNumResults() == b.getNumResults() && "invalid inputs");
diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
index 11c4a29718e1d..caca2ff81964f 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
+++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
@@ -253,9 +253,6 @@ def CmpIExtUI :
 // SelectOp
 //===----------------------------------------------------------------------===//
 
-def GetScalarOrVectorTrueAttribute :
-  NativeCodeCall<"cast<TypedAttr>(getBoolAttribute($0.getType(), true))">;
-
 // select(not(pred), a, b) => select(pred, b, a)
 def SelectNotCond :
     Pat<(SelectOp (Arith_XOrIOp $pred, (ConstantLikeMatcher APIntAttr:$ones)), $a, $b),
@@ -272,31 +269,12 @@ def RedundantSelectFalse :
     Pat<(SelectOp $pred, $a, (SelectOp $pred, $b, $c)),
         (SelectOp $pred, $a, $c)>;
 
-// select(predA, select(predB, x, y), y) => select(and(predA, predB), x, y)
-def SelectAndCond :
-    Pat<(SelectOp $predA, (SelectOp $predB, $x, $y), $y),
-        (SelectOp (Arith_AndIOp $predA, $predB), $x, $y)>;
-
-// select(predA, select(predB, y, x), y) => select(and(predA, not(predB)), x, y)
-def SelectAndNotCond :
-    Pat<(SelectOp $predA, (SelectOp $predB, $y, $x), $y),
-        (SelectOp (Arith_AndIOp $predA,
-                                (Arith_XOrIOp $predB,
-                                (Arith_ConstantOp (GetScalarOrVectorTrueAttribute $predB)))),
-                  $x, $y)>;
-
-// select(predA, x, select(predB, x, y)) => select(or(predA, predB), x, y)
-def SelectOrCond :
-    Pat<(SelectOp $predA, $x, (SelectOp $predB, $x, $y)),
-        (SelectOp (Arith_OrIOp $predA, $predB), $x, $y)>;
-
-// select(predA, x, select(predB, y, x)) => select(or(predA, not(predB)), x, y)
-def SelectOrNotCond :
-    Pat<(SelectOp $predA, $x, (SelectOp $predB, $y, $x)),
-        (SelectOp (Arith_OrIOp $predA,
-                               (Arith_XOrIOp $predB,
-                               (Arith_ConstantOp (GetScalarOrVectorTrueAttribute $predB)))),
-                  $x, $y)>;
+// select(pred, false, true) => not(pred) 
+def SelectI1ToNot :
+    Pat<(SelectOp $pred,
+                  (ConstantLikeMatcher ConstantAttr<I1Attr, "0">),
+                  (ConstantLikeMatcher ConstantAttr<I1Attr, "1">)),
+        (Arith_XOrIOp $pred, (Arith_ConstantOp ConstantAttr<I1Attr, "1">))>;
 
 //===----------------------------------------------------------------------===//
 // IndexCastOp
diff --git a/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp b/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp
index 745c5706a838c..6a593185ccedc 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp
@@ -8,9 +8,12 @@
 
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
 
@@ -46,6 +49,12 @@ void arith::ArithDialect::initialize() {
       >();
   addInterfaces<ArithInlinerInterface>();
   declarePromisedInterface<ArithDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterface<SelectOp,
+                           bufferization::BufferDeallocationOpInterface>();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, ConstantOp,
+                            IndexCastOp, SelectOp>();
+  declarePromisedInterfaces<ValueBoundsOpInterface, AddIOp, ConstantOp, SubIOp,
+                            MulIOp>();
 }
 
 /// Materialize an integer or floating point constant.
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index 0f71c19c23b65..9f64a07f31e3a 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -969,7 +969,6 @@ OpFoldResult arith::MaxNumFOp::fold(FoldAdaptor adaptor) {
       [](const APFloat &a, const APFloat &b) { return llvm::maximum(a, b); });
 }
 
-
 //===----------------------------------------------------------------------===//
 // MaxSIOp
 //===----------------------------------------------------------------------===//
@@ -2173,35 +2172,6 @@ void arith::CmpFOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
 // SelectOp
 //===----------------------------------------------------------------------===//
 
-// Transforms a select of a boolean to arithmetic operations
-//
-//  arith.select %arg, %x, %y : i1
-//
-//  becomes
-//
-//  and(%arg, %x) or and(!%arg, %y)
-struct SelectI1Simplify : public OpRewritePattern<arith::SelectOp> {
-  using OpRewritePattern<arith::SelectOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(arith::SelectOp op,
-                                PatternRewriter &rewriter) const override {
-    if (!op.getType().isInteger(1))
-      return failure();
-
-    Value falseConstant =
-        rewriter.create<arith::ConstantIntOp>(op.getLoc(), true, 1);
-    Value notCondition = rewriter.create<arith::XOrIOp>(
-        op.getLoc(), op.getCondition(), falseConstant);
-
-    Value trueVal = rewriter.create<arith::AndIOp>(
-        op.getLoc(), op.getCondition(), op.getTrueValue());
-    Value falseVal = rewriter.create<arith::AndIOp>(op.getLoc(), notCondition,
-                                                    op.getFalseValue());
-    rewriter.replaceOpWithNewOp<arith::OrIOp>(op, trueVal, falseVal);
-    return success();
-  }
-};
-
 //  select %arg, %c1, %c0 => extui %arg
 struct SelectToExtUI : public OpRewritePattern<arith::SelectOp> {
   using OpRewritePattern<arith::SelectOp>::OpRewritePattern;
@@ -2238,9 +2208,8 @@ struct SelectToExtUI : public OpRewritePattern<arith::SelectOp> {
 
 void arith::SelectOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                   MLIRContext *context) {
-  results.add<RedundantSelectFalse, RedundantSelectTrue, SelectI1Simplify,
-              SelectAndCond, SelectAndNotCond, SelectOrCond, SelectOrNotCond,
-              SelectNotCond, SelectToExtUI>(context);
+  results.add<RedundantSelectFalse, RedundantSelectTrue, SelectNotCond,
+              SelectI1ToNot, SelectToExtUI>(context);
 }
 
 OpFoldResult arith::SelectOp::fold(FoldAdaptor adaptor) {
diff --git a/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp b/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp
index 47c84708f3c38..1f48d27aa27b1 100644
--- a/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp
+++ b/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp
@@ -16,7 +16,9 @@
 #include "mlir/Dialect/ArmNeon/Transforms.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/AffineMap.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -36,8 +38,10 @@ static Type matchContainerType(Type element, Type container) {
   return element;
 }
 
-/// Lowering from a single vector::contractOp directly to the arm neon smmla
-/// intrinsic. The shapes of the contract and intrinsic must match.
+/// Lowering from a vector::contractOp arm neon smmla intrinsic. This will tile
+/// any vector.contract into multiple smmla instructions with unrolling so long
+/// as [2,2,8] is a divisor of its shape. If no unrolling is necessary, a single
+/// smmla instruction is emitted.
 class LowerContractionToSMMLAPattern
     : public OpRewritePattern<vector::ContractionOp> {
 public:
@@ -45,10 +49,6 @@ class LowerContractionToSMMLAPattern
   LogicalResult matchAndRewrite(vector::ContractionOp op,
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
-    Value lhs = op.getLhs();
-    Value rhs = op.getRhs();
-    Value res = op.getAcc();
-
     // Check index maps that represent M N K in contract.
     auto indexingMaps = op.getIndexingMapsArray();
     if (llvm::any_of(indexingMaps, [](mlir::AffineMap affineMap) {
@@ -57,7 +57,6 @@ class LowerContractionToSMMLAPattern
         })) {
       return failure();
     }
-
     // Check iterator types for contract.
     auto iteratorTypes = op.getIteratorTypesArray();
     if (iteratorTypes.size() != 3 ||
@@ -66,22 +65,24 @@ class LowerContractionToSMMLAPattern
         iteratorTypes[2] != vector::IteratorType::reduction) {
       return failure();
     }
-
-    // Check the tile size by mapping the dimensions of the contract.
+    // Infer tile sizes from operands; Note: RHS is not transposed.
     mlir::VectorType lhsType = op.getLhsType();
     mlir::VectorType rhsType = op.getRhsType();
     auto dimM = lhsType.getDimSize(0);
     auto dimN = rhsType.getDimSize(0);
     auto dimK = lhsType.getDimSize(1);
-    if (rhsType.getDimSize(1) != dimK || dimM != 2 || dimN != 2 || dimK != 8) {
+
+    // Unrolling patterns can handle any [2, 2, 8] shaped multiple of inputs for
+    // tiling.
+    if (dimM % 2 != 0 || dimN % 2 != 0 || dimK % 8 != 0) {
       return failure();
     }
 
     // Check two extsi inputs Rhs Lhs for contract.
     arith::ExtSIOp origLhsExtOp =
-        dyn_cast_or_null<arith::ExtSIOp>(lhs.getDefiningOp());
+        dyn_cast_or_null<arith::ExtSIOp>(op.getLhs().getDefiningOp());
     arith::ExtSIOp origRhsExtOp =
-        dyn_cast_or_null<arith::ExtSIOp>(rhs.getDefiningOp());
+        dyn_cast_or_null<arith::ExtSIOp>(op.getRhs().getDefiningOp());
     if (!origLhsExtOp || !origRhsExtOp) {
       return failure();
     }
@@ -113,26 +114,73 @@ class LowerContractionToSMMLAPattern
       return failure();
     }
 
-    // Collapse to 1D vectors required by smmla intrinsic
-    auto collapsedInputType = VectorType::get(
-        {16}, extsiLhs.getType().cast<ShapedType>().getElementType());
-    auto collapsedOutputType =
-        VectorType::get({4}, res.getType().cast<ShapedType>().getElementType());
-    auto collapsedLhs = rewriter.createOrFold<vector::ShapeCastOp>(
-        extsiLhs.getLoc(), collapsedInputType, extsiLhs);
-    auto collapsedRhs = rewriter.createOrFold<vector::ShapeCastOp>(
-        extsiRhs.getLoc(), collapsedInputType, extsiRhs);
-    auto collapsedRes = rewriter.createOrFold<vector::ShapeCastOp>(
-        res.getLoc(), collapsedOutputType, res);
-
-    // Replace the contract with a neon op
-    auto smmlaOp = rewriter.createOrFold<arm_neon::SmmlaOp>(
-        op.getLoc(), collapsedRes.getType(), collapsedRes, collapsedLhs,
-        collapsedRhs);
-
-    // Reshape output back to 2D
-    rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(op, op.getResultType(),
-                                                     smmlaOp);
+    // Initial accumulator for the final result. This is the un-tiled result if
+    // tiling is done.
+    Value result = rewriter.create<arith::ConstantOp>(
+        loc, op.getResultType(), rewriter.getZeroAttr(op.getResultType()));
+
+    SmallVector<int64_t> unrolledSize = *op.getShapeForUnroll();
+    SmallVector<int64_t> smmlaShape{2, 2, 8};
+    SmallVector<int64_t> loopOrder{0, 1, 2};
+    for (SmallVector<int64_t> offsets :
+         StaticTileOffsetRange(unrolledSize, smmlaShape, loopOrder)) {
+
+      // Helper to compute the new shape of each operand and extract the slice.
+      auto extractOperand = [&](Value operand, AffineMap permutationMap,
+                                ArrayRef<int64_t> operandOffsets) {
+        SmallVector<int64_t> operandShape =
+            applyPermutationMap(permutationMap, ArrayRef<int64_t>(smmlaShape));
+        SmallVector<int64_t> operandStrides(operandOffsets.size(), 1);
+        return rewriter.createOrFold<vector::ExtractStridedSliceOp>(
+            loc, operand, operandOffsets, operandShape, operandStrides);
+      };
+
+      // Extract tiled lhs, rhs, and acc
+      AffineMap lhsPermutationMap = op.getIndexingMapsArray()[0];
+      SmallVector<int64_t> lhsOffsets =
+          applyPermutationMap(lhsPermutationMap, ArrayRef<int64_t>(offsets));
+      Value tiledLhs = extractOperand(extsiLhs, lhsPermutationMap, lhsOffsets);
+      AffineMap rhsPermutationMap = op.getIndexingMapsArray()[1];
+      SmallVector<int64_t> rhsOffsets =
+          applyPermutationMap(rhsPermutationMap, ArrayRef<int64_t>(offsets));
+      Value tiledRhs = extractOperand(extsiRhs, rhsPermutationMap, rhsOffsets);
+      AffineMap accPermutationMap = op.getIndexingMapsArray()[2];
+      SmallVector<int64_t> accOffsets =
+          applyPermutationMap(accPermutationMap, ArrayRef<int64_t>(offsets));
+      Value tiledAcc =
+          extractOperand(op.getAcc(), accPermutationMap, accOffsets);
+
+      // Collapse tiled operands to 1D vectors required by smmla intrinsic
+      auto collapsedInputType = VectorType::get(
+          tiledLhs.getType().cast<ShapedType>().getNumElements(),
+          tiledLhs.getType().cast<ShapedType>().getElementType());
+      auto collapsedOutputType = VectorType::get(
+          {4}, tiledAcc.getType().cast<ShapedType>().getElementType());
+      auto collapsedLhs = rewriter.createOrFold<vector::ShapeCastOp>(
+          tiledLhs.getLoc(), collapsedInputType, tiledLhs);
+      auto collapsedRhs = rewriter.createOrFold<vector::ShapeCastOp>(
+          tiledRhs.getLoc(), collapsedInputType, tiledRhs);
+      auto collapsedRes = rewriter.createOrFold<vector::ShapeCastOp>(
+          tiledAcc.getLoc(), collapsedOutputType, tiledAcc);
+
+      // Insert contract op
+      auto smmlaOp = rewriter.createOrFold<arm_neon::SmmlaOp>(
+          op.getLoc(), collapsedRes.getType(), collapsedRes, collapsedLhs,
+          collapsedRhs);
+
+      // Reshape output back to 2D
+      Value tiledRes = rewriter.createOrFold<vector::ShapeCastOp>(
+          smmlaOp.getLoc(), tiledAcc.getType(), smmlaOp);
+
+      // Insert the tiled result back into the non tiled result of the
+      // contract op.
+      SmallVector<int64_t> strides(
+          tiledRes.getType().cast<ShapedType>().getRank(), 1);
+      result = rewriter.createOrFold<vector::InsertStridedSliceOp>(
+          loc, tiledRes, result, accOffsets, strides);
+    }
+
+    rewriter.replaceOp(op, result);
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index 8f0f6d1fcc849..55c9299c58eff 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -344,11 +344,11 @@ bool BufferizationOptions::isOpAllowed(Operation *op) const {
 
 BufferizableOpInterface
 BufferizationOptions::dynCastBufferizableOp(Operation *op) const {
+  if (!isOpAllowed(op))
+    return nullptr;
   auto bufferizableOp = dyn_cast<BufferizableOpInterface>(op);
   if (!bufferizableOp)
     return nullptr;
-  if (!isOpAllowed(op))
-    return nullptr;
   return bufferizableOp;
 }
 
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
index 34a0c594a5a5a..2b226c7a1207c 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
@@ -252,16 +252,6 @@ LogicalResult AllocTensorOp::verify() {
            << getType().getNumDynamicDims() << " dynamic sizes";
   if (getCopy() && getCopy().getType() != getType())
     return emitError("expected that `copy` and return type match");
-
-  // For sparse tensor allocation, we require that none of its
-  // uses escapes the function boundary directly.
-  if (sparse_tensor::getSparseTensorEncoding(getType())) {
-    for (auto &use : getOperation()->getUses())
-      if (isa<func::ReturnOp, func::CallOp, func::CallIndirectOp>(
-              use.getOwner()))
-        return emitError("sparse tensor allocation should not escape function");
-  }
-
   return success();
 }
 
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
index 930f035339c1d..a2222e169c4d6 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
@@ -21,7 +21,7 @@ namespace bufferization {
 } // namespace mlir
 
 using namespace mlir;
-using MemCpyFn = bufferization::BufferResultsToOutParamsOptions::MemCpyFn;
+using MemCpyFn = bufferization::BufferResultsToOutParamsOpts::MemCpyFn;
 
 /// Return `true` if the given MemRef type has a fully dynamic layout.
 static bool hasFullyDynamicLayoutMap(MemRefType type) {
@@ -45,9 +45,12 @@ static bool hasStaticIdentityLayout(MemRefType type) {
 // Updates the func op and entry block.
 //
 // Any args appended to the entry block are added to `appendedEntryArgs`.
+// If `addResultAttribute` is true, adds the unit attribute `bufferize.result`
+// to each newly created function argument.
 static LogicalResult
 updateFuncOp(func::FuncOp func,
-             SmallVectorImpl<BlockArgument> &appendedEntryArgs) {
+             SmallVectorImpl<BlockArgument> &appendedEntryArgs,
+             bool addResultAttribute) {
   auto functionType = func.getFunctionType();
 
   // Collect information about the results will become appended arguments.
@@ -80,6 +83,10 @@ updateFuncOp(func::FuncOp func,
   for (int i = 0, e = erasedResultTypes.size(); i < e; ++i, ++erasedIndicesIt) {
     func.setArgAttrs(functionType.getNumInputs() + i,
                      func.getResultAttrs(*erasedIndicesIt));
+    if (addResultAttribute)
+      func.setArgAttr(functionType.getNumInputs() + i,
+                      StringAttr::get(func.getContext(), "bufferize.result"),
+                      UnitAttr::get(func.getContext()));
   }
 
   // Erase the results.
@@ -127,7 +134,7 @@ static LogicalResult updateReturnOps(func::FuncOp func,
 // temporary buffers for newly introduced out params.
 static LogicalResult
 updateCalls(ModuleOp module,
-            const bufferization::BufferResultsToOutParamsOptions &options) {
+            const bufferization::BufferResultsToOutParamsOpts &options) {
   bool didFail = false;
   SymbolTable symtab(module);
   module.walk([&](func::CallOp op) {
@@ -189,12 +196,13 @@ updateCalls(ModuleOp module,
 
 LogicalResult mlir::bufferization::promoteBufferResultsToOutParams(
     ModuleOp module,
-    const bufferization::BufferResultsToOutParamsOptions &options) {
+    const bufferization::BufferResultsToOutParamsOpts &options) {
   for (auto func : module.getOps<func::FuncOp>()) {
     if (!options.filterFn(&func))
       continue;
     SmallVector<BlockArgument, 6> appendedEntryArgs;
-    if (failed(updateFuncOp(func, appendedEntryArgs)))
+    if (failed(
+            updateFuncOp(func, appendedEntryArgs, options.addResultAttribute)))
       return failure();
     if (func.isExternal())
       continue;
@@ -218,21 +226,25 @@ struct BufferResultsToOutParamsPass
     : bufferization::impl::BufferResultsToOutParamsBase<
           BufferResultsToOutParamsPass> {
   explicit BufferResultsToOutParamsPass(
-      const bufferization::BufferResultsToOutParamsOptions &options)
+      const bufferization::BufferResultsToOutParamsOpts &options)
       : options(options) {}
 
   void runOnOperation() override {
+    // Convert from pass options in tablegen to BufferResultsToOutParamsOpts.
+    if (addResultAttribute)
+      options.addResultAttribute = true;
+
     if (failed(bufferization::promoteBufferResultsToOutParams(getOperation(),
                                                               options)))
       return signalPassFailure();
   }
 
 private:
-  bufferization::BufferResultsToOutParamsOptions options;
+  bufferization::BufferResultsToOutParamsOpts options;
 };
 } // namespace
 
 std::unique_ptr<Pass> mlir::bufferization::createBufferResultsToOutParamsPass(
-    const bufferization::BufferResultsToOutParamsOptions &options) {
+    const bufferization::BufferResultsToOutParamsOpts &options) {
   return std::make_unique<BufferResultsToOutParamsPass>(options);
 }
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 6a0ad66549965..8dbf701620121 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -453,7 +453,7 @@ LogicalResult bufferization::bufferizeOp(Operation *op,
   // canonicalize away (or canonicalize to more precise layouts).
   SmallVector<Operation *> worklist;
   op->walk<WalkOrder::PostOrder>([&](Operation *op) {
-    if (hasTensorSemantics(op))
+    if (options.isOpAllowed(op) && hasTensorSemantics(op))
       worklist.push_back(op);
   });
 
@@ -472,8 +472,6 @@ LogicalResult bufferization::bufferizeOp(Operation *op,
     auto bufferizableOp = options.dynCastBufferizableOp(nextOp);
     if (!bufferizableOp)
       continue;
-    if (!options.isOpAllowed(nextOp))
-      continue;
     // Skip ops that no longer have tensor semantics.
     if (!hasTensorSemantics(nextOp))
       continue;
diff --git a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
index d242d75bd51fa..c6b02b9703e75 100644
--- a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
+++ b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
@@ -10,6 +10,8 @@
 
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
@@ -69,6 +71,10 @@ void ControlFlowDialect::initialize() {
       >();
   addInterfaces<ControlFlowInlinerInterface>();
   declarePromisedInterface<ControlFlowDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, BranchOp,
+                            CondBranchOp>();
+  declarePromisedInterface<CondBranchOp,
+                           bufferization::BufferDeallocationOpInterface>();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
index 9426bbbe2370f..e401a83bcb42e 100644
--- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
@@ -132,9 +132,10 @@ LogicalResult ApplyOp::verify() {
 LogicalResult emitc::AssignOp::verify() {
   Value variable = getVar();
   Operation *variableDef = variable.getDefiningOp();
-  if (!variableDef || !llvm::isa<emitc::VariableOp>(variableDef))
+  if (!variableDef ||
+      !llvm::isa<emitc::VariableOp, emitc::SubscriptOp>(variableDef))
     return emitOpError() << "requires first operand (" << variable
-                         << ") to be a Variable";
+                         << ") to be a Variable or subscript";
 
   Value value = getValue();
   if (variable.getType() != value.getType())
@@ -746,6 +747,20 @@ LogicalResult emitc::YieldOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// SubscriptOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult emitc::SubscriptOp::verify() {
+  if (getIndices().size() != (size_t)getArray().getType().getRank()) {
+    return emitOpError() << "requires number of indices ("
+                         << getIndices().size()
+                         << ") to match the rank of the array type ("
+                         << getArray().getType().getRank() << ")";
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Func/IR/FuncOps.cpp b/mlir/lib/Dialect/Func/IR/FuncOps.cpp
index d18ec279e85c0..ed2ecfe9d0fb5 100644
--- a/mlir/lib/Dialect/Func/IR/FuncOps.cpp
+++ b/mlir/lib/Dialect/Func/IR/FuncOps.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/IRMapping.h"
@@ -43,6 +44,8 @@ void FuncDialect::initialize() {
       >();
   declarePromisedInterface<FuncDialect, DialectInlinerInterface>();
   declarePromisedInterface<FuncDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, CallOp,
+                            FuncOp, ReturnOp>();
 }
 
 /// Materialize a single constant operation from a given attribute value with
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 33ce5c159db4f..a02eca8b11790 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@@ -215,6 +216,8 @@ void GPUDialect::initialize() {
 #include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc"
       >();
   addInterfaces<GPUInlinerInterface>();
+  declarePromisedInterface<TerminatorOp,
+                           bufferization::BufferDeallocationOpInterface>();
 }
 
 static std::string getSparseHandleKeyword(SparseHandleKind kind) {
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
index 645a45dd96bef..a44e83313c9c1 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
@@ -35,6 +35,7 @@ static LogicalResult parseExpressionArg(AsmParser &parser, uint64_t opcode,
 static void printExpressionArg(AsmPrinter &printer, uint64_t opcode,
                                ArrayRef<uint64_t> args);
 
+#include "mlir/Dialect/LLVMIR/LLVMAttrInterfaces.cpp.inc"
 #include "mlir/Dialect/LLVMIR/LLVMOpsEnums.cpp.inc"
 #define GET_ATTRDEF_CLASSES
 #include "mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.cpp.inc"
@@ -185,6 +186,24 @@ void printExpressionArg(AsmPrinter &printer, uint64_t opcode,
   });
 }
 
+//===----------------------------------------------------------------------===//
+// DICompositeTypeAttr
+//===----------------------------------------------------------------------===//
+
+DIRecursiveTypeAttrInterface
+DICompositeTypeAttr::withRecId(DistinctAttr recId) {
+  return DICompositeTypeAttr::get(getContext(), getTag(), recId, getName(),
+                                  getFile(), getLine(), getScope(),
+                                  getBaseType(), getFlags(), getSizeInBits(),
+                                  getAlignInBits(), getElements());
+}
+
+DIRecursiveTypeAttrInterface
+DICompositeTypeAttr::getRecSelf(DistinctAttr recId) {
+  return DICompositeTypeAttr::get(recId.getContext(), 0, recId, {}, {}, 0, {},
+                                  {}, DIFlags(), 0, 0, {});
+}
+
 //===----------------------------------------------------------------------===//
 // TargetFeaturesAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
index bf24194d03ddb..81766b307d3d7 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
@@ -344,6 +344,10 @@ LogicalResult LLVM::GEPOp::ensureOnlySafeAccesses(
     return failure();
   if (!isFirstIndexZero(*this))
     return failure();
+  // Dynamic indices can be out-of-bounds (even negative), so an access with
+  // dynamic indices can never be considered safe.
+  if (!getDynamicIndices().empty())
+    return failure();
   Type reachedType = getResultPtrElementType();
   if (!reachedType)
     return failure();
@@ -362,6 +366,10 @@ bool LLVM::GEPOp::canRewire(const DestructurableMemorySlot &slot,
     return false;
   if (!isFirstIndexZero(*this))
     return false;
+  // Dynamic indices can be out-of-bounds (even negative), so an access with
+  // dynamic indices can never be properly rewired.
+  if (!getDynamicIndices().empty())
+    return false;
   Type reachedType = getResultPtrElementType();
   if (!reachedType || getIndices().size() < 2)
     return false;
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 4780ec09b81b9..9e8407451a085 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -214,8 +214,7 @@ void MmaOp::print(OpAsmPrinter &p) {
   p.printOptionalAttrDict(this->getOperation()->getAttrs(), ignoreAttrNames);
 
   // Print the types of the operands and result.
-  p << " : "
-    << "(";
+  p << " : " << "(";
   llvm::interleaveComma(SmallVector<Type, 3>{frags[0].regs[0].getType(),
                                              frags[1].regs[0].getType(),
                                              frags[2].regs[0].getType()},
@@ -956,9 +955,7 @@ std::string NVVM::WgmmaMmaAsyncOp::getPtx() {
   ss << "},";
   // Need to map read/write registers correctly.
   regCnt = (regCnt * 2);
-  ss << " $" << (regCnt) << ","
-     << " $" << (regCnt + 1) << ","
-     << " p";
+  ss << " $" << (regCnt) << "," << " $" << (regCnt + 1) << "," << " p";
   if (getTypeD() != WGMMATypes::s32) {
     ss << ", $" << (regCnt + 3) << ",  $" << (regCnt + 4);
   }
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
index 027058d4de632..a6936fde43709 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
@@ -21,7 +21,10 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "mlir/Interfaces/DestinationStyleOpInterface.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/SubsetOpInterface.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Parser/Parser.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/InliningUtils.h"
@@ -123,6 +126,23 @@ void mlir::linalg::LinalgDialect::initialize() {
   declarePromisedInterface<GenericOp, mesh::ShardingInterface>();
   declarePromisedInterfaces<mesh::ShardingInterface,
 #define GET_OP_LIST
+#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
+                            >();
+  declarePromisedInterface<CopyOp, SubsetOpInterface>();
+  declarePromisedInterface<CopyOp, SubsetInsertionOpInterface>();
+  declarePromisedInterface<IndexOp, ValueBoundsOpInterface>();
+  declarePromisedInterface<linalg::GenericOp, TilingInterface>();
+  declarePromisedInterface<linalg::GenericOp, PartialReductionOpInterface>();
+  declarePromisedInterfaces<TilingInterface,
+#define GET_OP_LIST
+#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
+                            >();
+  declarePromisedInterfaces<PartialReductionOpInterface,
+#define GET_OP_LIST
+#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
+                            >();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface,
+#define GET_OP_LIST
 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
                             >();
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 1e703dacfd0c7..c74ab1e6448be 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -24,6 +24,7 @@
 #include "mlir/Dialect/Utils/StructuredOpsUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Interfaces/MaskableOpInterface.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
@@ -54,6 +55,8 @@ using namespace mlir::linalg;
 /// Try to vectorize `convOp` as a convolution.
 static FailureOr<Operation *>
 vectorizeConvolution(RewriterBase &rewriter, LinalgOp convOp,
+                     ArrayRef<int64_t> inputVecSizes = {},
+                     ArrayRef<bool> inputVecScalableFlags = {},
                      bool flatten1DDepthwiseConv = false);
 
 /// Return the unique instance of OpType in `block` if it is indeed unique.
@@ -1712,7 +1715,40 @@ static LogicalResult reductionPreconditions(LinalgOp op) {
   return success();
 }
 
-static LogicalResult vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op) {
+static LogicalResult
+vectorizeDynamicConvOpPrecondition(linalg::LinalgOp conv,
+                                   bool flatten1DDepthwiseConv) {
+  if (flatten1DDepthwiseConv) {
+    LDBG("Vectorization of flattened convs with dynamic shapes is not "
+         "supported\n");
+    return failure();
+  }
+
+  if (!isa<linalg::DepthwiseConv1DNwcWcOp>(conv)) {
+    LDBG("Not a 1D depth-wise WC conv, dynamic shapes are not supported\n");
+    return failure();
+  }
+
+  // Support dynamic shapes in 1D depthwise convolution, but only in the
+  // _channel_ dimension.
+  Value lhs = conv.getDpsInputOperand(0)->get();
+  ArrayRef<int64_t> lhsShape = cast<ShapedType>(lhs.getType()).getShape();
+  auto shapeWithoutCh = lhsShape.drop_back(1);
+  if (ShapedType::isDynamicShape(shapeWithoutCh)) {
+    LDBG("Dynamically-shaped op vectorization precondition failed: only "
+         "channel dim can be dynamic\n");
+    return failure();
+  }
+
+  return success();
+}
+
+static LogicalResult
+vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
+                                     bool flatten1DDepthwiseConv) {
+  if (isa<ConvolutionOpInterface>(op.getOperation()))
+    return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);
+
   // TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
   // linalg.copy ops and ops that implement ContractionOpInterface for now.
   if (!isElementwise(op) &&
@@ -1778,10 +1814,9 @@ vectorizeUnPackOpPrecondition(tensor::UnPackOp unpackOp,
   return success();
 }
 
-static LogicalResult
-vectorizeLinalgOpPrecondition(LinalgOp linalgOp,
-                              ArrayRef<int64_t> inputVectorSizes,
-                              bool vectorizeNDExtract) {
+static LogicalResult vectorizeLinalgOpPrecondition(
+    LinalgOp linalgOp, ArrayRef<int64_t> inputVectorSizes,
+    bool vectorizeNDExtract, bool flatten1DDepthwiseConv) {
   // tensor with dimension of 0 cannot be vectorized.
   if (llvm::is_contained(linalgOp.getStaticShape(), 0))
     return failure();
@@ -1791,8 +1826,8 @@ vectorizeLinalgOpPrecondition(LinalgOp linalgOp,
                                       inputVectorSizes)))
     return failure();
 
-  if (linalgOp.hasDynamicShape() &&
-      failed(vectorizeDynamicLinalgOpPrecondition(linalgOp))) {
+  if (linalgOp.hasDynamicShape() && failed(vectorizeDynamicLinalgOpPrecondition(
+                                        linalgOp, flatten1DDepthwiseConv))) {
     LDBG("Dynamically-shaped op failed vectorization pre-conditions\n");
     return failure();
   }
@@ -1911,14 +1946,17 @@ vectorizeScalableVectorPrecondition(Operation *op,
   if (!isScalable)
     return success();
 
-  // Only element-wise ops supported in the presence of scalable dims.
+  // Only element-wise and 1d depthwise conv ops supported in the presence of
+  // scalable dims.
   auto linalgOp = dyn_cast<LinalgOp>(op);
-  return success(linalgOp && isElementwise(linalgOp));
+  return success(linalgOp && (isElementwise(linalgOp) ||
+                              isa<linalg::DepthwiseConv1DNwcWcOp>(op)));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
     Operation *op, ArrayRef<int64_t> inputVectorSizes,
-    ArrayRef<bool> inputScalableVecDims, bool vectorizeNDExtract) {
+    ArrayRef<bool> inputScalableVecDims, bool vectorizeNDExtract,
+    bool flatten1DDepthwiseConv) {
   if (failed(vectorizeScalableVectorPrecondition(op, inputVectorSizes,
                                                  inputScalableVecDims)))
     return failure();
@@ -1926,7 +1964,8 @@ LogicalResult mlir::linalg::vectorizeOpPrecondition(
   return TypeSwitch<Operation *, LogicalResult>(op)
       .Case<linalg::LinalgOp>([&](auto linalgOp) {
         return vectorizeLinalgOpPrecondition(linalgOp, inputVectorSizes,
-                                             vectorizeNDExtract);
+                                             vectorizeNDExtract,
+                                             flatten1DDepthwiseConv);
       })
       .Case<tensor::PadOp>([&](auto padOp) {
         return vectorizePadOpPrecondition(padOp, inputVectorSizes);
@@ -1975,7 +2014,8 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
   LLVM_DEBUG(llvm::dbgs() << "\n");
 
   if (failed(vectorizeOpPrecondition(op, inputVectorSizes, inputScalableVecDims,
-                                     vectorizeNDExtract))) {
+                                     vectorizeNDExtract,
+                                     flatten1DDepthwiseConv))) {
     LDBG("Vectorization pre-conditions failed\n");
     return failure();
   }
@@ -1999,7 +2039,8 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
             // inference.
             if (isa<ConvolutionOpInterface>(linalgOp.getOperation())) {
               FailureOr<Operation *> convOr = vectorizeConvolution(
-                  rewriter, linalgOp, flatten1DDepthwiseConv);
+                  rewriter, linalgOp, inputVectorSizes, inputScalableVecDims,
+                  flatten1DDepthwiseConv);
               if (succeeded(convOr)) {
                 llvm::append_range(results, (*convOr)->getResults());
                 return success();
@@ -3131,13 +3172,30 @@ struct Conv1DGenerator
   /// kw is always unrolled.
   /// TODO: w (resp. kw) is unrolled when the strideW ( resp. dilationW) is
   /// > 1.
-  FailureOr<Operation *> depthwiseConv(bool flatten) {
+  FailureOr<Operation *> depthwiseConv(uint64_t channelDimVecSize,
+                                       bool channelDimScalableFlag,
+                                       bool flatten) {
     if (!valid)
       return rewriter.notifyMatchFailure(op, "unvectorizable depthwise conv");
 
+    bool scalableChDim = false;
+    bool useMasking = false;
     int64_t nSize, wSize, cSize, kwSize;
     // kernel{kw, c}
     bindShapeDims(rhsShapedType, kwSize, cSize);
+    if (ShapedType::isDynamic(cSize)) {
+      assert(channelDimVecSize != 0 && "Channel dim vec size must be > 0");
+      cSize = channelDimVecSize;
+      // Scalable vectors are only used when both conditions are met:
+      //  1. channel dim is dynamic
+      //  2. channelDimScalableFlag is set
+      scalableChDim = channelDimScalableFlag;
+      useMasking = true;
+    }
+
+    assert(!(useMasking && flatten) &&
+           "Unsupported flattened conv with dynamic shapes");
+
     // out{n, w, c}
     bindShapeDims(resShapedType, nSize, wSize);
 
@@ -3158,20 +3216,51 @@ struct Conv1DGenerator
          //   (i.e. 16 convolved with 3 (@stride 1 dilation 1) -> 14)
          ((wSize - 1) * strideW + 1) + ((kwSize - 1) * dilationW + 1) - 1,
          cSize},
-        lhsEltType);
-    VectorType rhsType = VectorType::get({kwSize, cSize}, rhsEltType);
-    VectorType resType = VectorType::get({nSize, wSize, cSize}, resEltType);
+        lhsEltType, /*scalableDims=*/{false, false, scalableChDim});
+    VectorType rhsType =
+        VectorType::get({kwSize, cSize}, rhsEltType,
+                        /*scalableDims=*/{false, scalableChDim});
+    VectorType resType =
+        VectorType::get({nSize, wSize, cSize}, resEltType,
+                        /*scalableDims=*/{false, false, scalableChDim});
+
+    // Masks the input xfer Op along the channel dim, iff the corresponding
+    // scalable flag is set.
+    auto maybeMaskXferOp = [&](ArrayRef<int64_t> maskShape,
+                               ArrayRef<bool> scalableDims,
+                               Operation *opToMask) {
+      if (!useMasking)
+        return opToMask;
+      auto maskType =
+          VectorType::get(maskShape, rewriter.getI1Type(), scalableDims);
+
+      SmallVector<OpFoldResult> mixedDims = vector::getMixedSizesXfer(
+          cast<LinalgOp>(op).hasPureTensorSemantics(), opToMask, rewriter);
+
+      Value maskOp =
+          rewriter.create<vector::CreateMaskOp>(loc, maskType, mixedDims);
+
+      return mlir::vector::maskOperation(rewriter, opToMask, maskOp);
+    };
 
     // Read lhs slice of size {n, w * strideW + kw * dilationW, c} @ [0, 0,
     // 0].
     Value lhs = rewriter.create<vector::TransferReadOp>(
         loc, lhsType, lhsShaped, ValueRange{zero, zero, zero});
+    auto maybeMaskedLhs = maybeMaskXferOp(
+        lhsType.getShape(), lhsType.getScalableDims(), lhs.getDefiningOp());
+
     // Read rhs slice of size {kw, c} @ [0, 0].
     Value rhs = rewriter.create<vector::TransferReadOp>(loc, rhsType, rhsShaped,
                                                         ValueRange{zero, zero});
+    auto maybeMaskedRhs = maybeMaskXferOp(
+        rhsType.getShape(), rhsType.getScalableDims(), rhs.getDefiningOp());
+
     // Read res slice of size {n, w, c} @ [0, 0, 0].
     Value res = rewriter.create<vector::TransferReadOp>(
         loc, resType, resShaped, ValueRange{zero, zero, zero});
+    auto maybeMaskedRes = maybeMaskXferOp(
+        resType.getShape(), resType.getScalableDims(), res.getDefiningOp());
 
     //===------------------------------------------------------------------===//
     // Begin vector-only rewrite part
@@ -3186,7 +3275,7 @@ struct Conv1DGenerator
     for (int64_t kw = 0; kw < kwSize; ++kw) {
       for (int64_t w = 0; w < wSize; w += wSizeStep) {
         lhsVals.push_back(rewriter.create<vector::ExtractStridedSliceOp>(
-            loc, lhs,
+            loc, maybeMaskedLhs->getResult(0),
             /*offsets=*/ArrayRef<int64_t>{0, w * strideW + kw * dilationW, 0},
             inOutSliceSizes, inOutStrides));
       }
@@ -3194,12 +3283,13 @@ struct Conv1DGenerator
     // Extract rhs slice of size {c} @ [kw].
     for (int64_t kw = 0; kw < kwSize; ++kw) {
       rhsVals.push_back(rewriter.create<vector::ExtractOp>(
-          loc, rhs, /*offsets=*/ArrayRef<int64_t>{kw}));
+          loc, maybeMaskedRhs->getResult(0),
+          /*offsets=*/ArrayRef<int64_t>{kw}));
     }
     // Extract res slice: {n, wSizeStep, c} @ [0, w, 0].
     for (int64_t w = 0; w < wSize; w += wSizeStep) {
       resVals.push_back(rewriter.create<vector::ExtractStridedSliceOp>(
-          loc, res,
+          loc, maybeMaskedRes->getResult(0),
           /*offsets=*/ArrayRef<int64_t>{0, w, 0}, inOutSliceSizes,
           inOutStrides));
     }
@@ -3208,10 +3298,15 @@ struct Conv1DGenerator
       return kw * (wSize / wSizeStep) + w;
     };
 
+    // Note - the scalable flags are ignored as flattening combined with
+    // scalable vectorization is not supported.
     auto inOutFlattenSliceSizes =
         SmallVector<int64_t>{nSize, wSizeStep * cSize};
-    auto lhsCastType = VectorType::get(inOutFlattenSliceSizes, lhsEltType);
-    auto resCastType = VectorType::get(inOutFlattenSliceSizes, resEltType);
+    auto lhsTypeAfterFlattening =
+        VectorType::get(inOutFlattenSliceSizes, lhsEltType);
+    auto resTypeAfterFlattening =
+        VectorType::get(inOutFlattenSliceSizes, resEltType);
+
     // Compute contraction: O{n, w, c} += I{n, sw * w + dw * kw, c} * F{c}
     for (int64_t kw = 0; kw < kwSize; ++kw) {
       for (int64_t w = 0; w < wSize; w += wSizeStep) {
@@ -3221,9 +3316,9 @@ struct Conv1DGenerator
           // Flatten the input and output vectors (collapse the channel
           // dimension)
           lhsVal = rewriter.create<vector::ShapeCastOp>(
-              loc, lhsCastType, lhsVals[linearIndex(kw, w)]);
-          resVal = rewriter.create<vector::ShapeCastOp>(loc, resCastType,
-                                                        resVals[w]);
+              loc, lhsTypeAfterFlattening, lhsVals[linearIndex(kw, w)]);
+          resVal = rewriter.create<vector::ShapeCastOp>(
+              loc, resTypeAfterFlattening, resVals[w]);
         }
         resVals[w] = depthwiseConv1dSliceAsMulAcc(rewriter, loc, lhsVal,
                                                   rhsVals[kw], resVal, flatten);
@@ -3248,8 +3343,8 @@ struct Conv1DGenerator
     // Write back res slice: {n, wSizeStep, c} @ [0, w, 0].
     // This does not depend on kw.
     for (int64_t w = 0; w < wSize; w += wSizeStep) {
-      res = rewriter.create<vector::InsertStridedSliceOp>(
-          loc, resVals[w], res,
+      maybeMaskedRes = rewriter.create<vector::InsertStridedSliceOp>(
+          loc, resVals[w], maybeMaskedRes->getResult(0),
           /*offsets=*/ArrayRef<int64_t>{0, w, 0},
           /*strides=*/ArrayRef<int64_t>{1, 1, 1});
     }
@@ -3258,10 +3353,11 @@ struct Conv1DGenerator
     //===------------------------------------------------------------------===//
 
     // Write back res slice of size {n, w, c} @ [0, 0, 0].
-    return rewriter
-        .create<vector::TransferWriteOp>(loc, res, resShaped,
-                                         ValueRange{zero, zero, zero})
-        .getOperation();
+    Operation *resOut = rewriter.create<vector::TransferWriteOp>(
+        loc, maybeMaskedRes->getResult(0), resShaped,
+        ValueRange{zero, zero, zero});
+    return maybeMaskXferOp(resType.getShape(), resType.getScalableDims(),
+                           resOut);
   }
 
   /// Lower:
@@ -3278,6 +3374,10 @@ struct Conv1DGenerator
     lhs = promote(rewriter, loc, lhs, resTy);
 
     if (flatten) {
+      // NOTE: This following logic won't work for scalable vectors. For this
+      // reason, "flattening" is not supported when shapes are dynamic (this
+      // should be captured by one of the pre-conditions).
+
       // There are two options for handling the filter:
       //  * shape_cast(broadcast(filter))
       //  * broadcast(shuffle(filter))
@@ -3399,7 +3499,9 @@ struct Conv1DGenerator
 
   /// Entry point that transposes into the common form:
   ///   {{n, strideW * w + dilationW * kw, c}, {kw, c}, {n, w, c}}
-  FailureOr<Operation *> generateDilatedConv(bool flatten = false) {
+  FailureOr<Operation *> generateDilatedConv(uint64_t vecChDimSize = 0,
+                                             bool vecChDimScalableFlag = false,
+                                             bool flatten = false) {
     AffineExpr n, w, c, kw;
     bindDims(ctx, n, w, c, kw);
     if (!iters({Par(), Par(), Par(), Red()}))
@@ -3410,7 +3512,7 @@ struct Conv1DGenerator
     if (layout({/*lhsIndex*/ {n, strideW * w + dilationW * kw, c},
                 /*rhsIndex*/ {kw, c},
                 /*resIndex*/ {n, w, c}}))
-      return depthwiseConv(flatten);
+      return depthwiseConv(vecChDimSize, vecChDimScalableFlag, flatten);
 
     return rewriter.notifyMatchFailure(op, "not a depthwise::Nwc layout");
   }
@@ -3475,9 +3577,9 @@ struct Conv1DGenerator
 
 /// Helper function to vectorize a LinalgOp with convolution semantics.
 // TODO: extend the generic vectorization to support windows and drop this.
-static FailureOr<Operation *>
-vectorizeConvolution(RewriterBase &rewriter, LinalgOp op,
-                     bool flatten1DDepthwiseConv) {
+static FailureOr<Operation *> vectorizeConvolution(
+    RewriterBase &rewriter, LinalgOp op, ArrayRef<int64_t> inputVecSizes,
+    ArrayRef<bool> inputScalableVecDims, bool flatten1DDepthwiseConv) {
   // The ConvolutionOpInterface gives us guarantees of existence for
   // strides/dilations. However, we do not need to rely on those, we can
   // simply use them if present, otherwise use the default and let the generic
@@ -3502,7 +3604,28 @@ vectorizeConvolution(RewriterBase &rewriter, LinalgOp op,
   res = e.generateNcwPooling();
   if (succeeded(res))
     return res;
-  return e.generateDilatedConv(flatten1DDepthwiseConv);
+
+  // Only depthwise 1D NWC convs are left - these can be vectorized using masks
+  // and scalable vectors. Note that ATM the only dim that can be dynamic (i.e.
+  // masked/scalable) is the channel dim (i.e. the trailing dim).
+  uint64_t vecChDimSize = ShapedType::kDynamic;
+  bool vecChDimScalableFlag = false;
+  if (!inputVecSizes.empty()) {
+    // Only use the input vector size corresponding to the channel dim. Other
+    // vector dims will be inferred from the Ops.
+    assert((isa<linalg::DepthwiseConv1DNwcWcOp>(*op) ||
+            isa<linalg::DepthwiseConv1DNcwCwOp>(*op)) &&
+           "Not a 1D depthwise conv!");
+    size_t chDimIdx =
+        TypeSwitch<Operation *, size_t>(op)
+            .Case<linalg::DepthwiseConv1DNwcWcOp>([](auto conv) { return 2; })
+            .Case<linalg::DepthwiseConv1DNcwCwOp>([](auto conv) { return 1; });
+
+    vecChDimSize = inputVecSizes[chDimIdx];
+    vecChDimScalableFlag = inputScalableVecDims[chDimIdx];
+  }
+  return e.generateDilatedConv(vecChDimSize, vecChDimScalableFlag,
+                               flatten1DDepthwiseConv);
 }
 
 struct VectorizeConvolution : public OpInterfaceRewritePattern<LinalgOp> {
diff --git a/mlir/lib/Dialect/MLProgram/IR/CMakeLists.txt b/mlir/lib/Dialect/MLProgram/IR/CMakeLists.txt
index 90c9c9aa78750..725bb5fd9da9e 100644
--- a/mlir/lib/Dialect/MLProgram/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/MLProgram/IR/CMakeLists.txt
@@ -15,5 +15,6 @@ add_mlir_dialect_library(MLIRMLProgramDialect
   MLIRControlFlowInterfaces
   MLIRFunctionInterfaces
   MLIRInferTypeOpInterface
+  MLIRTransforms
   MLIRIR
   )
diff --git a/mlir/lib/Dialect/MLProgram/IR/MLProgramDialect.cpp b/mlir/lib/Dialect/MLProgram/IR/MLProgramDialect.cpp
index 1a8fe208d4099..bda1032ed9884 100644
--- a/mlir/lib/Dialect/MLProgram/IR/MLProgramDialect.cpp
+++ b/mlir/lib/Dialect/MLProgram/IR/MLProgramDialect.cpp
@@ -8,6 +8,7 @@
 
 #include "mlir/Dialect/MLProgram/IR/MLProgram.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
@@ -24,6 +25,18 @@ using namespace mlir::ml_program;
 #include "mlir/Dialect/MLProgram/IR/MLProgramTypes.cpp.inc"
 
 namespace {
+
+struct MLProgramInlinerInterface : public DialectInlinerInterface {
+  using DialectInlinerInterface::DialectInlinerInterface;
+
+  bool isLegalToInline(Operation *, Region *, bool,
+                       IRMapping &) const override {
+    // We have no specific opinion on whether ops defined in this dialect should
+    // be inlined.
+    return true;
+  }
+};
+
 struct MLProgramOpAsmDialectInterface : public OpAsmDialectInterface {
   using OpAsmDialectInterface::OpAsmDialectInterface;
 
@@ -53,5 +66,5 @@ void ml_program::MLProgramDialect::initialize() {
 #include "mlir/Dialect/MLProgram/IR/MLProgramOps.cpp.inc"
       >();
 
-  addInterfaces<MLProgramOpAsmDialectInterface>();
+  addInterfaces<MLProgramInlinerInterface, MLProgramOpAsmDialectInterface>();
 }
diff --git a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
index 989a3e5536ec6..e1ab9c905447b 100644
--- a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
@@ -91,34 +91,42 @@ static LogicalResult convertCoshOp(math::CoshOp op, PatternRewriter &rewriter) {
 }
 
 /// Expands tanh op into
-///   1) 1-exp^{-2x} / 1+exp^{-2x}, if x => 0
-///   2) exp^{2x}-1 / exp^{2x}+1  , if x < 0
+/// 1-exp^{-2x} / 1+exp^{-2x}
+/// To avoid overflow we exploit the reflection symmetry `tanh(-x) = -tanh(x)`.
+/// We compute a "signs" value which is -1 if input is negative and +1 if input
+/// is positive.  Then multiply the input by this value, guaranteeing that the
+/// result is positive, which also guarantees `exp^{-2x * sign(x)}` is in (0,
+/// 1]. Expand the computation on the input `x * sign(x)`, then multiply the
+/// result by `sign(x)` to retain sign of the real result.
 static LogicalResult convertTanhOp(math::TanhOp op, PatternRewriter &rewriter) {
   auto floatType = op.getOperand().getType();
   Location loc = op.getLoc();
+  Value zero = createFloatConst(loc, floatType, 0.0, rewriter);
   Value one = createFloatConst(loc, floatType, 1.0, rewriter);
-  Value two = createFloatConst(loc, floatType, 2.0, rewriter);
-  Value doubledX = rewriter.create<arith::MulFOp>(loc, op.getOperand(), two);
-
-  // Case 1: tanh(x) = 1-exp^{-2x} / 1+exp^{-2x}
-  Value negDoubledX = rewriter.create<arith::NegFOp>(loc, doubledX);
+  Value negTwo = createFloatConst(loc, floatType, -2.0, rewriter);
+
+  // Compute sign(x) = cast<float_type>(x < 0) * (-2) + 1
+  Value isNegative = rewriter.create<arith::CmpFOp>(
+      loc, arith::CmpFPredicate::OLT, op.getOperand(), zero);
+  Value isNegativeFloat =
+      rewriter.create<arith::UIToFPOp>(loc, floatType, isNegative);
+  Value isNegativeTimesNegTwo =
+      rewriter.create<arith::MulFOp>(loc, isNegativeFloat, negTwo);
+  Value sign = rewriter.create<arith::AddFOp>(loc, isNegativeTimesNegTwo, one);
+
+  // Normalize input to positive value: y = sign(x) * x
+  Value positiveX = rewriter.create<arith::MulFOp>(loc, sign, op.getOperand());
+
+  // Decompose on normalized input
+  Value negDoubledX = rewriter.create<arith::MulFOp>(loc, negTwo, positiveX);
   Value exp2x = rewriter.create<math::ExpOp>(loc, negDoubledX);
   Value dividend = rewriter.create<arith::SubFOp>(loc, one, exp2x);
   Value divisor = rewriter.create<arith::AddFOp>(loc, one, exp2x);
   Value positiveRes = rewriter.create<arith::DivFOp>(loc, dividend, divisor);
 
-  // Case 2: tanh(x) = exp^{2x}-1 / exp^{2x}+1
-  exp2x = rewriter.create<math::ExpOp>(loc, doubledX);
-  dividend = rewriter.create<arith::SubFOp>(loc, exp2x, one);
-  divisor = rewriter.create<arith::AddFOp>(loc, exp2x, one);
-  Value negativeRes = rewriter.create<arith::DivFOp>(loc, dividend, divisor);
+  // Multiply result by sign(x) to retain signs from negative inputs
+  rewriter.replaceOpWithNewOp<arith::MulFOp>(op, sign, positiveRes);
 
-  // tanh(x) = x >= 0 ? positiveRes : negativeRes
-  Value zero = createFloatConst(loc, floatType, 0.0, rewriter);
-  Value cmpRes = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::OGE,
-                                                op.getOperand(), zero);
-  rewriter.replaceOpWithNewOp<arith::SelectOp>(op, cmpRes, positiveRes,
-                                               negativeRes);
   return success();
 }
 
diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
index 962cb28b7c2ab..428c1c37c4e8b 100644
--- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
@@ -39,14 +39,24 @@ using namespace mlir;
 using namespace mlir::math;
 using namespace mlir::vector;
 
+// Helper to encapsulate a vector's shape (including scalable dims).
+struct VectorShape {
+  ArrayRef<int64_t> sizes;
+  ArrayRef<bool> scalableFlags;
+
+  bool empty() const { return sizes.empty(); }
+};
+
 // Returns vector shape if the type is a vector. Returns an empty shape if it is
 // not a vector.
-static ArrayRef<int64_t> vectorShape(Type type) {
+static VectorShape vectorShape(Type type) {
   auto vectorType = dyn_cast<VectorType>(type);
-  return vectorType ? vectorType.getShape() : ArrayRef<int64_t>();
+  return vectorType
+             ? VectorShape{vectorType.getShape(), vectorType.getScalableDims()}
+             : VectorShape{};
 }
 
-static ArrayRef<int64_t> vectorShape(Value value) {
+static VectorShape vectorShape(Value value) {
   return vectorShape(value.getType());
 }
 
@@ -55,14 +65,16 @@ static ArrayRef<int64_t> vectorShape(Value value) {
 //----------------------------------------------------------------------------//
 
 // Broadcasts scalar type into vector type (iff shape is non-scalar).
-static Type broadcast(Type type, ArrayRef<int64_t> shape) {
+static Type broadcast(Type type, VectorShape shape) {
   assert(!isa<VectorType>(type) && "must be scalar type");
-  return !shape.empty() ? VectorType::get(shape, type) : type;
+  return !shape.empty()
+             ? VectorType::get(shape.sizes, type, shape.scalableFlags)
+             : type;
 }
 
 // Broadcasts scalar value into vector (iff shape is non-scalar).
 static Value broadcast(ImplicitLocOpBuilder &builder, Value value,
-                       ArrayRef<int64_t> shape) {
+                       VectorShape shape) {
   assert(!isa<VectorType>(value.getType()) && "must be scalar value");
   auto type = broadcast(value.getType(), shape);
   return !shape.empty() ? builder.create<BroadcastOp>(type, value) : value;
@@ -215,7 +227,7 @@ static Value clamp(ImplicitLocOpBuilder &builder, Value value, Value lowerBound,
 static std::pair<Value, Value> frexp(ImplicitLocOpBuilder &builder, Value arg,
                                      bool isPositive = false) {
   assert(getElementTypeOrSelf(arg).isF32() && "arg must be f32 type");
-  ArrayRef<int64_t> shape = vectorShape(arg);
+  VectorShape shape = vectorShape(arg);
 
   auto bcast = [&](Value value) -> Value {
     return broadcast(builder, value, shape);
@@ -255,7 +267,7 @@ static std::pair<Value, Value> frexp(ImplicitLocOpBuilder &builder, Value arg,
 // Computes exp2 for an i32 argument.
 static Value exp2I32(ImplicitLocOpBuilder &builder, Value arg) {
   assert(getElementTypeOrSelf(arg).isInteger(32) && "arg must be i32 type");
-  ArrayRef<int64_t> shape = vectorShape(arg);
+  VectorShape shape = vectorShape(arg);
 
   auto bcast = [&](Value value) -> Value {
     return broadcast(builder, value, shape);
@@ -281,7 +293,7 @@ Value makePolynomialCalculation(ImplicitLocOpBuilder &builder,
   Type elementType = getElementTypeOrSelf(x);
   assert((elementType.isF32() || elementType.isF16()) &&
          "x must be f32 or f16 type");
-  ArrayRef<int64_t> shape = vectorShape(x);
+  VectorShape shape = vectorShape(x);
 
   if (coeffs.empty())
     return broadcast(builder, floatCst(builder, 0.0f, elementType), shape);
@@ -379,7 +391,7 @@ AtanApproximation::matchAndRewrite(math::AtanOp op,
   if (!getElementTypeOrSelf(operand).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   Value abs = builder.create<math::AbsFOp>(operand);
@@ -478,7 +490,7 @@ Atan2Approximation::matchAndRewrite(math::Atan2Op op,
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
-  ArrayRef<int64_t> shape = vectorShape(op.getResult());
+  VectorShape shape = vectorShape(op.getResult());
 
   // Compute atan in the valid range.
   auto div = builder.create<arith::DivFOp>(y, x);
@@ -544,7 +556,7 @@ TanhApproximation::matchAndRewrite(math::TanhOp op,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -632,7 +644,7 @@ LogApproximationBase<Op>::logMatchAndRewrite(Op op, PatternRewriter &rewriter,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -779,7 +791,7 @@ Log1pApproximation::matchAndRewrite(math::Log1pOp op,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -829,7 +841,7 @@ ErfPolynomialApproximation::matchAndRewrite(math::ErfOp op,
   if (!(elementType.isF32() || elementType.isF16()))
     return rewriter.notifyMatchFailure(op,
                                        "only f32 and f16 type is supported.");
-  ArrayRef<int64_t> shape = vectorShape(operand);
+  VectorShape shape = vectorShape(operand);
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -938,9 +950,8 @@ ErfPolynomialApproximation::matchAndRewrite(math::ErfOp op,
 
 namespace {
 
-Value clampWithNormals(ImplicitLocOpBuilder &builder,
-                       const llvm::ArrayRef<int64_t> shape, Value value,
-                       float lowerBound, float upperBound) {
+Value clampWithNormals(ImplicitLocOpBuilder &builder, const VectorShape shape,
+                       Value value, float lowerBound, float upperBound) {
   assert(!std::isnan(lowerBound));
   assert(!std::isnan(upperBound));
 
@@ -1131,7 +1142,7 @@ ExpM1Approximation::matchAndRewrite(math::ExpM1Op op,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -1201,7 +1212,7 @@ LogicalResult SinAndCosApproximation<isSine, OpTy>::matchAndRewrite(
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -1328,7 +1339,7 @@ CbrtApproximation::matchAndRewrite(math::CbrtOp op,
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
   ImplicitLocOpBuilder b(op->getLoc(), rewriter);
-  ArrayRef<int64_t> shape = vectorShape(operand);
+  VectorShape shape = vectorShape(operand);
 
   Type floatTy = getElementTypeOrSelf(operand.getType());
   Type intTy = b.getIntegerType(floatTy.getIntOrFloatBitWidth());
@@ -1417,10 +1428,10 @@ RsqrtApproximation::matchAndRewrite(math::RsqrtOp op,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   // Only support already-vectorized rsqrt's.
-  if (shape.empty() || shape.back() % 8 != 0)
+  if (shape.empty() || shape.sizes.back() % 8 != 0)
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
index d71669a274b8f..41082a85a485f 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
@@ -8,8 +8,13 @@
 
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/AllocationOpInterface.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Interfaces/MemorySlotInterfaces.h"
+#include "mlir/Interfaces/RuntimeVerifiableOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include <optional>
 
@@ -43,6 +48,13 @@ void mlir::memref::MemRefDialect::initialize() {
       >();
   addInterfaces<MemRefInlinerInterface>();
   declarePromisedInterface<MemRefDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterfaces<bufferization::AllocationOpInterface, AllocOp,
+                            AllocaOp, ReallocOp>();
+  declarePromisedInterfaces<RuntimeVerifiableOpInterface, CastOp, ExpandShapeOp,
+                            LoadOp, ReinterpretCastOp, StoreOp, SubViewOp>();
+  declarePromisedInterfaces<ValueBoundsOpInterface, AllocOp, AllocaOp, CastOp,
+                            DimOp, GetGlobalOp, RankOp, SubViewOp>();
+  declarePromisedInterface<MemRefType, DestructurableTypeInterface>();
 }
 
 /// Finds the unique dealloc operation (if one exists) for `allocValue`.
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 233e702dbb229..ddb9676eb4f62 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -9,6 +9,8 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
@@ -18,6 +20,7 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/MapVector.h"
@@ -71,6 +74,12 @@ void SCFDialect::initialize() {
 #include "mlir/Dialect/SCF/IR/SCFOps.cpp.inc"
       >();
   addInterfaces<SCFInlinerInterface>();
+  declarePromisedInterfaces<bufferization::BufferDeallocationOpInterface,
+                            InParallelOp, ReduceReturnOp>();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, ConditionOp,
+                            ExecuteRegionOp, ForOp, IfOp, IndexSwitchOp,
+                            ForallOp, InParallelOp, WhileOp, YieldOp>();
+  declarePromisedInterface<ForOp, ValueBoundsOpInterface>();
 }
 
 /// Default callback for IfOp builders. Inserts a yield without arguments.
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
index 4c62289a1e945..ff4bace9a4d88 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
@@ -797,6 +797,49 @@ OpFoldResult spirv::LogicalOrOp::fold(FoldAdaptor adaptor) {
   return Attribute();
 }
 
+//===----------------------------------------------------------------------===//
+// spirv.SelectOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult spirv::SelectOp::fold(FoldAdaptor adaptor) {
+  // spirv.Select _ x x -> x
+  Value trueVals = getTrueValue();
+  Value falseVals = getFalseValue();
+  if (trueVals == falseVals)
+    return trueVals;
+
+  ArrayRef<Attribute> operands = adaptor.getOperands();
+
+  // spirv.Select true  x y -> x
+  // spirv.Select false x y -> y
+  if (auto boolAttr = getScalarOrSplatBoolAttr(operands[0]))
+    return *boolAttr ? trueVals : falseVals;
+
+  // Check that all the operands are constant
+  if (!operands[0] || !operands[1] || !operands[2])
+    return Attribute();
+
+  // Note: getScalarOrSplatBoolAttr will always return a boolAttr if we are in
+  // the scalar case. Hence, we are only required to consider the case of
+  // DenseElementsAttr in foldSelectOp.
+  auto condAttrs = dyn_cast<DenseElementsAttr>(operands[0]);
+  auto trueAttrs = dyn_cast<DenseElementsAttr>(operands[1]);
+  auto falseAttrs = dyn_cast<DenseElementsAttr>(operands[2]);
+  if (!condAttrs || !trueAttrs || !falseAttrs)
+    return Attribute();
+
+  auto elementResults = llvm::to_vector<4>(trueAttrs.getValues<Attribute>());
+  auto iters = llvm::zip_equal(elementResults, condAttrs.getValues<BoolAttr>(),
+                               falseAttrs.getValues<Attribute>());
+  for (auto [result, cond, falseRes] : iters) {
+    if (!cond.getValue())
+      result = falseRes;
+  }
+
+  auto resultType = trueAttrs.getType();
+  return DenseElementsAttr::get(cast<ShapedType>(resultType), elementResults);
+}
+
 //===----------------------------------------------------------------------===//
 // spirv.IEqualOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index d9ee39a4e8dd3..f5a3717f815de 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Dialect/Shape/IR/Shape.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/CommonFolders.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Traits.h"
@@ -143,6 +144,8 @@ void ShapeDialect::initialize() {
   // still evolving it makes it simple to start with an unregistered ops and
   // try different variants before actually defining the op.
   allowUnknownOperations();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, AssumingOp,
+                            AssumingYieldOp>();
 }
 
 Operation *ShapeDialect::materializeConstant(OpBuilder &builder,
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 7750efdd9add0..6da51bb6b9cac 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
@@ -1956,6 +1957,10 @@ void SparseTensorDialect::initialize() {
 #define GET_OP_LIST
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorOps.cpp.inc"
       >();
+  declarePromisedInterfaces<
+      bufferization::BufferizableOpInterface, ConcatenateOp, ConvertOp, LoadOp,
+      NewOp, NumberOfEntriesOp, AssembleOp, DisassembleOp,
+      ToCoordinatesBufferOp, ToCoordinatesOp, ToPositionsOp, ToValuesOp>();
 }
 
 #define GET_OP_CLASSES
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/BufferizableOpInterfaceImpl.cpp
index a942a721e218f..7734d1d258453 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -187,6 +187,35 @@ struct DisassembleOpInterface
   }
 };
 
+struct ForeachOpInterface : public SparseBufferizableOpInterfaceExternalModel<
+                                ForeachOpInterface, sparse_tensor::ForeachOp> {
+  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                              const AnalysisState &state) const {
+    return true;
+  }
+
+  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                               const AnalysisState &state) const {
+    return false;
+  }
+
+  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
+                                      const AnalysisState &state) const {
+    return {};
+  }
+
+  LogicalResult verifyAnalysis(Operation *op,
+                               const AnalysisState &state) const {
+    // A more complex analysis (similar to scf.for) is needed if the op returns
+    // a tensor. That tensor would have to be bufferized (not implemented yet).
+    for (OpResult result : op->getResults()) {
+      if (isa<TensorType>(result.getType()))
+        return op->emitOpError("tensor results are not supported yet");
+    }
+    return success();
+  }
+};
+
 struct NumberOfEntriesOpInterface
     : public SparseBufferizableOpInterfaceExternalModel<
           NumberOfEntriesOpInterface, sparse_tensor::NumberOfEntriesOp> {
@@ -307,6 +336,7 @@ void mlir::sparse_tensor::registerBufferizableOpInterfaceExternalModels(
         NumberOfEntriesOpInterface>(*ctx);
     sparse_tensor::AssembleOp::attachInterface<AssembleOpInterface>(*ctx);
     sparse_tensor::DisassembleOp::attachInterface<DisassembleOpInterface>(*ctx);
+    sparse_tensor::ForeachOp::attachInterface<ForeachOpInterface>(*ctx);
     sparse_tensor::ToCoordinatesBufferOp::attachInterface<
         ToCoordinatesBufferOpInterface>(*ctx);
     sparse_tensor::ToCoordinatesOp::attachInterface<ToCoordinatesOpInterface>(
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp
index f93b59de29e57..14ea07f0b54b8 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp
@@ -573,6 +573,12 @@ struct GenericOpScheduler : public OpRewritePattern<linalg::GenericOp> {
       rewriter.modifyOpInPlace(linalgOp, [&]() {
         linalgOp->setOperand(t->getOperandNumber(), dst);
       });
+
+      // Release the transposed form afterwards.
+      // TODO: CSE when used in more than one following op?
+      rewriter.setInsertionPointAfter(linalgOp);
+      rewriter.create<bufferization::DeallocTensorOp>(dst.getLoc(), dst);
+
       return success();
     }
     // Cannot be resolved with a single conversion.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index 7ff2fc25328a6..5679f277e1486 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -1561,6 +1561,19 @@ struct SparseNewConverter : public OpConversionPattern<NewOp> {
   }
 };
 
+struct SparseHasRuntimeLibraryConverter
+    : public OpConversionPattern<HasRuntimeLibraryOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(HasRuntimeLibraryOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto i1Type = rewriter.getI1Type();
+    rewriter.replaceOpWithNewOp<arith::ConstantOp>(
+        op, i1Type, rewriter.getIntegerAttr(i1Type, 0));
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -1572,21 +1585,21 @@ struct SparseNewConverter : public OpConversionPattern<NewOp> {
 void mlir::populateSparseTensorCodegenPatterns(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     bool createSparseDeallocs, bool enableBufferInitialization) {
-  patterns.add<SparseAssembleOpConverter, SparseDisassembleOpConverter,
-               SparseReturnConverter, SparseCallConverter, SparseLvlOpConverter,
-               SparseCastConverter, SparseExtractSliceConverter,
-               SparseTensorLoadConverter, SparseExpandConverter,
-               SparseCompressConverter, SparseInsertConverter,
-               SparseReorderCOOConverter, SparseReMapConverter,
-               SparseSliceGetterOpConverter<ToSliceOffsetOp,
-                                            StorageSpecifierKind::DimOffset>,
-               SparseSliceGetterOpConverter<ToSliceStrideOp,
-                                            StorageSpecifierKind::DimStride>,
-               SparseToPositionsConverter, SparseToCoordinatesConverter,
-               SparseToCoordinatesBufferConverter, SparseToValuesConverter,
-               SparseConvertConverter, SparseNewConverter,
-               SparseNumberOfEntriesConverter>(typeConverter,
-                                               patterns.getContext());
+  patterns.add<
+      SparseAssembleOpConverter, SparseDisassembleOpConverter,
+      SparseReturnConverter, SparseCallConverter, SparseLvlOpConverter,
+      SparseCastConverter, SparseExtractSliceConverter,
+      SparseTensorLoadConverter, SparseExpandConverter, SparseCompressConverter,
+      SparseInsertConverter, SparseReorderCOOConverter, SparseReMapConverter,
+      SparseSliceGetterOpConverter<ToSliceOffsetOp,
+                                   StorageSpecifierKind::DimOffset>,
+      SparseSliceGetterOpConverter<ToSliceStrideOp,
+                                   StorageSpecifierKind::DimStride>,
+      SparseToPositionsConverter, SparseToCoordinatesConverter,
+      SparseToCoordinatesBufferConverter, SparseToValuesConverter,
+      SparseConvertConverter, SparseNewConverter,
+      SparseNumberOfEntriesConverter, SparseHasRuntimeLibraryConverter>(
+      typeConverter, patterns.getContext());
   patterns.add<SparseTensorDeallocConverter>(
       typeConverter, patterns.getContext(), createSparseDeallocs);
   patterns.add<SparseTensorAllocConverter, SparseTensorEmptyConverter>(
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
index 0937c10f25728..92c98b34af602 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -840,6 +840,19 @@ class SparseTensorDisassembleConverter
   }
 };
 
+struct SparseHasRuntimeLibraryConverter
+    : public OpConversionPattern<HasRuntimeLibraryOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(HasRuntimeLibraryOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto i1Type = rewriter.getI1Type();
+    rewriter.replaceOpWithNewOp<arith::ConstantOp>(
+        op, i1Type, rewriter.getIntegerAttr(i1Type, 1));
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -868,6 +881,7 @@ void mlir::populateSparseTensorConversionPatterns(TypeConverter &typeConverter,
            SparseTensorToValuesConverter, SparseNumberOfEntriesConverter,
            SparseTensorLoadConverter, SparseTensorInsertConverter,
            SparseTensorExpandConverter, SparseTensorCompressConverter,
-           SparseTensorAssembleConverter, SparseTensorDisassembleConverter>(
-          typeConverter, patterns.getContext());
+           SparseTensorAssembleConverter, SparseTensorDisassembleConverter,
+           SparseHasRuntimeLibraryConverter>(typeConverter,
+                                             patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
index 5b4395cc31a46..c370d104e0985 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
 #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
@@ -16,6 +17,37 @@ using namespace mlir::sparse_tensor;
 
 namespace {
 
+struct GuardSparseAlloc
+    : public OpRewritePattern<bufferization::AllocTensorOp> {
+  using OpRewritePattern<bufferization::AllocTensorOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(bufferization::AllocTensorOp op,
+                                PatternRewriter &rewriter) const override {
+    // Only rewrite sparse allocations.
+    if (!getSparseTensorEncoding(op.getResult().getType()))
+      return failure();
+
+    // Only rewrite sparse allocations that escape the method
+    // without any chance of a finalizing operation in between.
+    // Here we assume that sparse tensor setup never crosses
+    // method boundaries. The current rewriting only repairs
+    // the most obvious allocate-call/return cases.
+    if (!llvm::all_of(op->getUses(), [](OpOperand &use) {
+          return isa<func::ReturnOp, func::CallOp, func::CallIndirectOp>(
+              use.getOwner());
+        }))
+      return failure();
+
+    // Guard escaping empty sparse tensor allocations with a finalizing
+    // operation that leaves the underlying storage in a proper state
+    // before the tensor escapes across the method boundary.
+    rewriter.setInsertionPointAfter(op);
+    auto load = rewriter.create<LoadOp>(op.getLoc(), op.getResult(), true);
+    rewriter.replaceAllUsesExcept(op, load, load);
+    return success();
+  }
+};
+
 template <typename StageWithSortOp>
 struct StageUnorderedSparseOps : public OpRewritePattern<StageWithSortOp> {
   using OpRewritePattern<StageWithSortOp>::OpRewritePattern;
@@ -37,6 +69,6 @@ struct StageUnorderedSparseOps : public OpRewritePattern<StageWithSortOp> {
 } // namespace
 
 void mlir::populateStageSparseOperationsPatterns(RewritePatternSet &patterns) {
-  patterns.add<StageUnorderedSparseOps<ConvertOp>,
+  patterns.add<GuardSparseAlloc, StageUnorderedSparseOps<ConvertOp>,
                StageUnorderedSparseOps<ConcatenateOp>>(patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
index 62032ff301bec..5ca9510408c30 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
@@ -8,8 +8,11 @@
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Interfaces/SubsetOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
 
 using namespace mlir;
@@ -45,4 +48,22 @@ void TensorDialect::initialize() {
 #include "mlir/Dialect/Tensor/IR/TensorOps.cpp.inc"
       >();
   addInterfaces<TensorInlinerInterface>();
+  declarePromisedInterfaces<
+      bufferization::BufferizableOpInterface, CastOp, CollapseShapeOp, DimOp,
+      EmptyOp, ExpandShapeOp, ExtractSliceOp, ExtractOp, FromElementsOp,
+      GenerateOp, InsertOp, InsertSliceOp, PadOp, ParallelInsertSliceOp, RankOp,
+      ReshapeOp, SplatOp>();
+  declarePromisedInterfaces<transform::FindPayloadReplacementOpInterface,
+                            CollapseShapeOp, ExpandShapeOp, ExtractSliceOp,
+                            InsertSliceOp, ReshapeOp>();
+  declarePromisedInterfaces<ReifyRankedShapedTypeOpInterface, ExpandShapeOp,
+                            CollapseShapeOp, PadOp>();
+  declarePromisedInterfaces<SubsetOpInterface, ExtractSliceOp, InsertSliceOp,
+                            ParallelInsertSliceOp>();
+  declarePromisedInterfaces<SubsetInsertionOpInterface, InsertSliceOp,
+                            ParallelInsertSliceOp>();
+  declarePromisedInterface<ExtractSliceOp, SubsetExtractionOpInterface>();
+  declarePromisedInterfaces<TilingInterface, PadOp, PackOp, UnPackOp>();
+  declarePromisedInterfaces<ValueBoundsOpInterface, CastOp, DimOp, EmptyOp,
+                            ExtractSliceOp, PadOp, RankOp>();
 }
diff --git a/mlir/lib/Dialect/Tosa/CMakeLists.txt b/mlir/lib/Dialect/Tosa/CMakeLists.txt
index ba5343dcd7ac6..1911405c63cd5 100644
--- a/mlir/lib/Dialect/Tosa/CMakeLists.txt
+++ b/mlir/lib/Dialect/Tosa/CMakeLists.txt
@@ -12,6 +12,7 @@ add_mlir_dialect_library(MLIRTosaDialect
   MLIRTosaDialectBytecodeIncGen
   MLIRTosaOpsIncGen
   MLIRTosaInterfacesIncGen
+  MLIRShardingInterfaceIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index 62d07859e32f6..f461e7e1a555b 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Tosa/IR/TosaOps.h"
+#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"
 #include "mlir/Dialect/Quant/QuantOps.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Tosa/Utils/QuantUtils.h"
@@ -136,6 +137,14 @@ void TosaDialect::initialize() {
 #include "mlir/Dialect/Tosa/IR/TosaAttributes.cpp.inc"
       >();
   addInterfaces<TosaDialectBytecodeInterface, TosaInlinerInterface>();
+  declarePromisedInterfaces<
+      mesh::ShardingInterface, ClampOp, SigmoidOp, TanhOp, AddOp,
+      ArithmeticRightShiftOp, BitwiseAndOp, BitwiseOrOp, BitwiseXorOp, DivOp,
+      LogicalAndOp, LogicalLeftShiftOp, LogicalRightShiftOp, LogicalOrOp,
+      LogicalXorOp, MaximumOp, MinimumOp, MulOp, PowOp, SubOp, AbsOp,
+      BitwiseNotOp, CeilOp, ClzOp, ExpOp, FloorOp, LogOp, LogicalNotOp,
+      NegateOp, ReciprocalOp, RsqrtOp, SelectOp, EqualOp, GreaterOp,
+      GreaterEqualOp, MatMulOp>();
 }
 
 Operation *TosaDialect::materializeConstant(OpBuilder &builder, Attribute value,
diff --git a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
index 71a9d61198e3f..fe2eea535ffdc 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
@@ -1278,14 +1278,11 @@ void transform::TrackingListener::notifyMatchFailure(
 }
 
 void transform::TrackingListener::notifyOperationErased(Operation *op) {
-  // TODO: Walk can be removed when D144193 has landed.
-  op->walk([&](Operation *op) {
-    // Remove mappings for result values.
-    for (OpResult value : op->getResults())
-      (void)replacePayloadValue(value, nullptr);
-    // Remove mapping for op.
-    (void)replacePayloadOp(op, nullptr);
-  });
+  // Remove mappings for result values.
+  for (OpResult value : op->getResults())
+    (void)replacePayloadValue(value, nullptr);
+  // Remove mapping for op.
+  (void)replacePayloadOp(op, nullptr);
 }
 
 void transform::TrackingListener::notifyOperationReplaced(
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 75f6220ad8f3f..35296824246eb 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -31,6 +32,7 @@
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "mlir/Interfaces/SubsetOpInterface.h"
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/InliningUtils.h"
@@ -374,6 +376,14 @@ void VectorDialect::initialize() {
       >();
 
   addInterfaces<VectorInlinerInterface>();
+
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface,
+                            TransferReadOp, TransferWriteOp, GatherOp, MaskOp,
+                            YieldOp>();
+  declarePromisedInterfaces<SubsetOpInterface, TransferReadOp,
+                            TransferWriteOp>();
+  declarePromisedInterface<TransferReadOp, SubsetExtractionOpInterface>();
+  declarePromisedInterface<TransferWriteOp, SubsetInsertionOpInterface>();
 }
 
 /// Materialize a single constant operation from a given attribute value with
@@ -6054,7 +6064,7 @@ LogicalResult MaskOp::fold(FoldAdaptor adaptor,
   maskableOp->dropAllUses();
   maskableOp->moveBefore(getOperation());
 
-  results.push_back(maskableOp->getResult(0));
+  llvm::append_range(results, maskableOp->getResults());
   return success();
 }
 
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index d613672608c3a..63ed0947cf6ce 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -300,3 +300,20 @@ vector::createUnrollIterator(VectorType vType, int64_t targetRank) {
   shapeToUnroll = shapeToUnroll.slice(0, firstScalableDim);
   return StaticTileOffsetRange(shapeToUnroll, /*unrollStep=*/1);
 }
+
+SmallVector<OpFoldResult> vector::getMixedSizesXfer(bool hasTensorSemantics,
+                                                    Operation *xfer,
+                                                    RewriterBase &rewriter) {
+  auto loc = xfer->getLoc();
+
+  Value base = TypeSwitch<Operation *, Value>(xfer)
+                   .Case<vector::TransferReadOp>(
+                       [&](auto readOp) { return readOp.getSource(); })
+                   .Case<vector::TransferWriteOp>(
+                       [&](auto writeOp) { return writeOp.getOperand(1); });
+
+  SmallVector<OpFoldResult> mixedSourceDims =
+      hasTensorSemantics ? tensor::getMixedSizes(rewriter, loc, base)
+                         : memref::getMixedSizes(rewriter, loc, base);
+  return mixedSourceDims;
+}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0e89ac4df6ef2..b356c397fb836 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -12,6 +12,7 @@
 
 namespace mlir {
 namespace xegpu {
+
 // this file is for position occupation,
 // we will add functions in following PRs.
 
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index b9a3429e37b88..09dc30365e37c 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -28,6 +28,7 @@
 #endif // MLIR_ENABLE_CUDA_CUSPARSE
 
 #ifdef _WIN32
+#include <malloc.h>
 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
 #else
 #define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default")))
@@ -287,7 +288,11 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
                           int64_t elementSizeBytes) {
   // Only densely packed tensors are currently supported.
+#ifdef _WIN32
+  int64_t *denseStrides = (int64_t *)_alloca(rank * sizeof(int64_t));
+#else
   int64_t *denseStrides = (int64_t *)alloca(rank * sizeof(int64_t));
+#endif // _WIN32
   int64_t *sizes = descriptor->sizes;
   for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) {
     denseStrides[i] = runningStride;
@@ -423,24 +428,27 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
               elementStrides[4], interleave, swizzle, l2Promotion, oobFill);
 }
 
-namespace {
-
-template <int rank>
-void mgpuGetMemRefDataAndShape(void *raw_descriptor, char **addr,
-                               uint64_t *globalDim) {
+template <int Rank>
+void mgpuGetMemRefDataAndShape(void *rawDescriptor, char **addr,
+                               uint64_t *globalDim, uint64_t *globalStrides,
+                               const CUtensorMapDataType tensorDataType) {
   auto descriptor =
-      reinterpret_cast<StridedMemRefType<char, rank> *>(raw_descriptor);
+      reinterpret_cast<StridedMemRefType<char, Rank> *>(rawDescriptor);
   *addr = descriptor->data;
-  for (int i = 0; i < rank; ++i) {
-    globalDim[i] = static_cast<uint64_t>(descriptor->sizes[rank - i - 1]);
+  for (int i = 0; i < Rank; ++i) {
+    globalDim[i] = static_cast<uint64_t>(descriptor->sizes[Rank - i - 1]);
+  }
+  static constexpr int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
+                                               4, 8, 2, 4, 4, 4};
+  for (int i = 0; i < Rank - 1; ++i) {
+    globalStrides[i] = static_cast<uint64_t>(
+        descriptor->strides[Rank - i - 2] * elementSizeInBytes[tensorDataType]);
   }
 }
 
-} // namespace
-
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
     int64_t tensorRank,                       // Dimensionality of tensor
-    void *ranked_descriptor,                  // Ranked MemRef descriptor
+    void *rankedDescriptor,                   // Ranked MemRef descriptor
     const CUtensorMapDataType tensorDataType, // Stride size (in bytes)
     CUtensorMapInterleave interleave,         // Type of interleaved layout
     CUtensorMapSwizzle swizzle,               // Bank swizzling pattern
@@ -457,38 +465,36 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
   char *globalAddress = nullptr;
   switch (tensorRank) {
   case 1:
-    mgpuGetMemRefDataAndShape<1>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<1>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   case 2:
-    mgpuGetMemRefDataAndShape<2>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<2>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   case 3:
-    mgpuGetMemRefDataAndShape<3>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<3>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   case 4:
-    mgpuGetMemRefDataAndShape<4>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<4>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   case 5:
-    mgpuGetMemRefDataAndShape<5>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<5>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   default:
     fprintf(
         stderr,
         "'mgpuTensorMapEncodeTiledMemref' failed with 'rank is too high'\n");
-    return NULL;
+    return nullptr;
   }
 
-  static const int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
-                                           4, 8, 2, 4, 4, 4};
   for (int64_t r = 0; r < tensorRank; ++r) {
-    elementStrides[r] = uint32_t(1);
     boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
   }
 
-  globalStrides[0] = globalDim[0] * elementSizeInBytes[tensorDataType];
-  for (int r = 1; r < tensorRank - 1; r++)
-    globalStrides[r] = globalStrides[r - 1] * globalDim[r];
-
   ScopedContext scopedContext;
   mgpuTensorMapEncodeTiled(&tensorMap, tensorDataType, tensorRank32,
                            globalAddress, globalDim, globalStrides, boxDim,
diff --git a/mlir/lib/ExecutionEngine/Float16bits.cpp b/mlir/lib/ExecutionEngine/Float16bits.cpp
index 841610e3c161d..e5b4f18dd644b 100644
--- a/mlir/lib/ExecutionEngine/Float16bits.cpp
+++ b/mlir/lib/ExecutionEngine/Float16bits.cpp
@@ -165,7 +165,7 @@ bool operator==(const bf16 &f1, const bf16 &f2) { return f1.bits == f2.bits; }
 #endif
 #endif
 
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(_M_X64)
 // On x86 bfloat16 is passed in SSE registers. Since both float and __bf16
 // are passed in the same register we can use the wider type and careful casting
 // to conform to x86_64 psABI. This only works with the assumption that we're
diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
index 731abcbbf1f39..8835056099d23 100644
--- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
@@ -127,7 +127,7 @@ extern "C" {
     case Action::kPack: {                                                      \
       assert(ptr && "Received nullptr for SparseTensorStorage object");        \
       intptr_t *buffers = static_cast<intptr_t *>(ptr);                        \
-      return SparseTensorStorage<P, C, V>::packFromLvlBuffers(                 \
+      return SparseTensorStorage<P, C, V>::newFromBuffers(                     \
           dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,    \
           dimRank, buffers);                                                   \
     }                                                                          \
diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp
index 56a2d850ad3b6..94562d0f15a24 100644
--- a/mlir/lib/IR/AffineExpr.cpp
+++ b/mlir/lib/IR/AffineExpr.cpp
@@ -774,7 +774,8 @@ static AffineExpr simplifyMul(AffineExpr lhs, AffineExpr rhs) {
     return getAffineConstantExpr(lhsConst.getValue() * rhsConst.getValue(),
                                  lhs.getContext());
 
-  assert(lhs.isSymbolicOrConstant() || rhs.isSymbolicOrConstant());
+  if (!lhs.isSymbolicOrConstant() && !rhs.isSymbolicOrConstant())
+    return nullptr;
 
   // Canonicalize the mul expression so that the constant/symbolic term is the
   // RHS. If both the lhs and rhs are symbolic, swap them if the lhs is a
diff --git a/mlir/lib/Interfaces/ViewLikeInterface.cpp b/mlir/lib/Interfaces/ViewLikeInterface.cpp
index 7c5905010eb41..6d1ff03756ace 100644
--- a/mlir/lib/Interfaces/ViewLikeInterface.cpp
+++ b/mlir/lib/Interfaces/ViewLikeInterface.cpp
@@ -27,7 +27,7 @@ LogicalResult mlir::verifyListOfOperandsOrIntegers(Operation *op,
     return op->emitError("expected ") << numElements << " " << name
                                       << " values, got " << staticVals.size();
   unsigned expectedNumDynamicEntries =
-      llvm::count_if(staticVals, [&](int64_t staticVal) {
+      llvm::count_if(staticVals, [](int64_t staticVal) {
         return ShapedType::isDynamic(staticVal);
       });
   if (values.size() != expectedNumDynamicEntries)
diff --git a/mlir/lib/Support/ToolUtilities.cpp b/mlir/lib/Support/ToolUtilities.cpp
index ee0214f3d8ac0..f05b4f5c643d5 100644
--- a/mlir/lib/Support/ToolUtilities.cpp
+++ b/mlir/lib/Support/ToolUtilities.cpp
@@ -21,22 +21,20 @@ using namespace mlir;
 LogicalResult
 mlir::splitAndProcessBuffer(std::unique_ptr<llvm::MemoryBuffer> originalBuffer,
                             ChunkBufferHandler processChunkBuffer,
-                            raw_ostream &os, bool enableSplitting,
-                            bool insertMarkerInOutput) {
+                            raw_ostream &os, llvm::StringRef inputSplitMarker,
+                            llvm::StringRef outputSplitMarker) {
   // If splitting is disabled, we process the full input buffer.
-  if (!enableSplitting)
+  if (inputSplitMarker.empty())
     return processChunkBuffer(std::move(originalBuffer), os);
 
-  const char splitMarkerConst[] = "// -----";
-  StringRef splitMarker(splitMarkerConst);
-  const int splitMarkerLen = splitMarker.size();
+  const int inputSplitMarkerLen = inputSplitMarker.size();
 
   auto *origMemBuffer = originalBuffer.get();
   SmallVector<StringRef, 8> rawSourceBuffers;
   const int checkLen = 2;
   // Split dropping the last checkLen chars to enable flagging near misses.
   origMemBuffer->getBuffer().split(rawSourceBuffers,
-                                   splitMarker.drop_back(checkLen));
+                                   inputSplitMarker.drop_back(checkLen));
   if (rawSourceBuffers.empty())
     return success();
 
@@ -58,8 +56,9 @@ mlir::splitAndProcessBuffer(std::unique_ptr<llvm::MemoryBuffer> originalBuffer,
     }
 
     // Check that suffix is as expected and doesn't have any dash post.
-    bool expectedSuffix = buffer.starts_with(splitMarker.take_back(checkLen)) &&
-                          buffer.size() > checkLen && buffer[checkLen] != '0';
+    bool expectedSuffix =
+        buffer.starts_with(inputSplitMarker.take_back(checkLen)) &&
+        buffer.size() > checkLen && buffer[checkLen] != '0';
     if (expectedSuffix) {
       sourceBuffers.push_back(prev);
       prev = buffer.drop_front(checkLen);
@@ -69,8 +68,8 @@ mlir::splitAndProcessBuffer(std::unique_ptr<llvm::MemoryBuffer> originalBuffer,
       fileSourceMgr.PrintMessage(llvm::errs(), splitLoc,
                                  llvm::SourceMgr::DK_Warning,
                                  "near miss with file split marker");
-      prev = StringRef(prev.data(),
-                       prev.size() + splitMarkerLen - checkLen + buffer.size());
+      prev = StringRef(prev.data(), prev.size() + inputSplitMarkerLen -
+                                        checkLen + buffer.size());
     }
   }
   if (!prev.empty())
@@ -89,7 +88,7 @@ mlir::splitAndProcessBuffer(std::unique_ptr<llvm::MemoryBuffer> originalBuffer,
       hadFailure = true;
   };
   llvm::interleave(sourceBuffers, os, interleaveFn,
-                   insertMarkerInOutput ? "\n// -----\n" : "");
+                   (llvm::Twine(outputSplitMarker) + "\n").str());
 
   // If any fails, then return a failure of the tool.
   return failure(hadFailure);
diff --git a/mlir/lib/Target/Cpp/CMakeLists.txt b/mlir/lib/Target/Cpp/CMakeLists.txt
index d8f372cf16245..578cb2f22767a 100644
--- a/mlir/lib/Target/Cpp/CMakeLists.txt
+++ b/mlir/lib/Target/Cpp/CMakeLists.txt
@@ -10,8 +10,6 @@ add_mlir_translation_library(MLIRTargetCpp
   MLIREmitCDialect
   MLIRFuncDialect
   MLIRIR
-  MLIRMathDialect
-  MLIRSCFDialect
   MLIRSupport
   MLIRTranslateLib
   )
diff --git a/mlir/lib/Target/Cpp/TranslateRegistration.cpp b/mlir/lib/Target/Cpp/TranslateRegistration.cpp
index 4104b177d7d9a..1aa98834a73f4 100644
--- a/mlir/lib/Target/Cpp/TranslateRegistration.cpp
+++ b/mlir/lib/Target/Cpp/TranslateRegistration.cpp
@@ -9,8 +9,6 @@
 #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
 #include "mlir/Dialect/EmitC/IR/EmitC.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/Target/Cpp/CppEmitter.h"
@@ -42,9 +40,7 @@ void registerToCppTranslation() {
         // clang-format off
         registry.insert<cf::ControlFlowDialect,
                         emitc::EmitCDialect,
-                        func::FuncDialect,
-                        math::MathDialect,
-                        scf::SCFDialect>();
+                        func::FuncDialect>();
         // clang-format on
       });
 }
diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index 7cbb1e9265e17..95c7af2f07be4 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -18,11 +18,13 @@
 #include "mlir/Support/LLVM.h"
 #include "mlir/Target/Cpp/CppEmitter.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopedHashTable.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FormatVariadic.h"
+#include <stack>
 #include <utility>
 
 #define DEBUG_TYPE "translate-to-cpp"
@@ -172,6 +174,9 @@ struct CppEmitter {
   /// Return the existing or a new name for a Value.
   StringRef getOrCreateName(Value val);
 
+  // Returns the textual representation of a subscript operation.
+  std::string getSubscriptName(emitc::SubscriptOp op);
+
   /// Return the existing or a new label of a Block.
   StringRef getOrCreateName(Block &block);
 
@@ -341,8 +346,7 @@ static LogicalResult printOperation(CppEmitter &emitter,
 
 static LogicalResult printOperation(CppEmitter &emitter,
                                     emitc::AssignOp assignOp) {
-  auto variableOp = cast<emitc::VariableOp>(assignOp.getVar().getDefiningOp());
-  OpResult result = variableOp->getResult(0);
+  OpResult result = assignOp.getVar().getDefiningOp()->getResult(0);
 
   if (failed(emitter.emitVariableAssignment(result)))
     return failure();
@@ -350,6 +354,13 @@ static LogicalResult printOperation(CppEmitter &emitter,
   return emitter.emitOperand(assignOp.getValue());
 }
 
+static LogicalResult printOperation(CppEmitter &emitter,
+                                    emitc::SubscriptOp subscriptOp) {
+  // Add name to cache so that `hasValueInScope` works.
+  emitter.getOrCreateName(subscriptOp.getResult());
+  return success();
+}
+
 static LogicalResult printBinaryOperation(CppEmitter &emitter,
                                           Operation *operation,
                                           StringRef binaryOperator) {
@@ -1091,12 +1102,28 @@ CppEmitter::CppEmitter(raw_ostream &os, bool declareVariablesAtTop)
   labelInScopeCount.push(0);
 }
 
+std::string CppEmitter::getSubscriptName(emitc::SubscriptOp op) {
+  std::string out;
+  llvm::raw_string_ostream ss(out);
+  ss << getOrCreateName(op.getArray());
+  for (auto index : op.getIndices()) {
+    ss << "[" << getOrCreateName(index) << "]";
+  }
+  return out;
+}
+
 /// Return the existing or a new name for a Value.
 StringRef CppEmitter::getOrCreateName(Value val) {
   if (auto literal = dyn_cast_if_present<emitc::LiteralOp>(val.getDefiningOp()))
     return literal.getValue();
-  if (!valueMapper.count(val))
-    valueMapper.insert(val, formatv("v{0}", ++valueInScopeCount.top()));
+  if (!valueMapper.count(val)) {
+    if (auto subscript =
+            dyn_cast_if_present<emitc::SubscriptOp>(val.getDefiningOp())) {
+      valueMapper.insert(val, getSubscriptName(subscript));
+    } else {
+      valueMapper.insert(val, formatv("v{0}", ++valueInScopeCount.top()));
+    }
+  }
   return *valueMapper.begin(val);
 }
 
@@ -1144,17 +1171,16 @@ LogicalResult CppEmitter::emitAttribute(Location loc, Attribute attr) {
       SmallString<128> strValue;
       // Use default values of toString except don't truncate zeros.
       val.toString(strValue, 0, 0, false);
+      os << strValue;
       switch (llvm::APFloatBase::SemanticsToEnum(val.getSemantics())) {
       case llvm::APFloatBase::S_IEEEsingle:
-        os << "(float)";
+        os << "f";
         break;
       case llvm::APFloatBase::S_IEEEdouble:
-        os << "(double)";
         break;
       default:
-        break;
+        llvm_unreachable("unsupported floating point type");
       };
-      os << strValue;
     } else if (val.isNaN()) {
       os << "NAN";
     } else if (val.isInfinity()) {
@@ -1166,10 +1192,18 @@ LogicalResult CppEmitter::emitAttribute(Location loc, Attribute attr) {
 
   // Print floating point attributes.
   if (auto fAttr = dyn_cast<FloatAttr>(attr)) {
+    if (!isa<Float32Type, Float64Type>(fAttr.getType())) {
+      return emitError(loc,
+                       "expected floating point attribute to be f32 or f64");
+    }
     printFloat(fAttr.getValue());
     return success();
   }
   if (auto dense = dyn_cast<DenseFPElementsAttr>(attr)) {
+    if (!isa<Float32Type, Float64Type>(dense.getElementType())) {
+      return emitError(loc,
+                       "expected floating point attribute to be f32 or f64");
+    }
     os << '{';
     interleaveComma(dense, os, [&](const APFloat &val) { printFloat(val); });
     os << '}';
@@ -1336,6 +1370,8 @@ LogicalResult CppEmitter::emitVariableAssignment(OpResult result) {
 
 LogicalResult CppEmitter::emitVariableDeclaration(OpResult result,
                                                   bool trailingSemicolon) {
+  if (isa<emitc::SubscriptOp>(result.getDefiningOp()))
+    return success();
   if (hasValueInScope(result)) {
     return result.getDefiningOp()->emitError(
         "result variable for the operation already declared");
@@ -1411,7 +1447,7 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
                 emitc::DivOp, emitc::ExpressionOp, emitc::ForOp, emitc::FuncOp,
                 emitc::IfOp, emitc::IncludeOp, emitc::LogicalAndOp,
                 emitc::LogicalNotOp, emitc::LogicalOrOp, emitc::MulOp,
-                emitc::RemOp, emitc::ReturnOp, emitc::SubOp,
+                emitc::RemOp, emitc::ReturnOp, emitc::SubOp, emitc::SubscriptOp,
                 emitc::UnaryMinusOp, emitc::UnaryPlusOp, emitc::VariableOp,
                 emitc::VerbatimOp>(
               [&](auto op) { return printOperation(*this, op); })
@@ -1426,7 +1462,7 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
   if (failed(status))
     return failure();
 
-  if (isa<emitc::LiteralOp>(op))
+  if (isa<emitc::LiteralOp, emitc::SubscriptOp>(op))
     return success();
 
   if (getEmittedExpression() ||
diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp
index c631617f97354..5ba90bba18b19 100644
--- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp
@@ -13,6 +13,7 @@
 #include "mlir/IR/Location.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -51,10 +52,9 @@ DICompileUnitAttr DebugImporter::translateImpl(llvm::DICompileUnit *node) {
   std::optional<DIEmissionKind> emissionKind =
       symbolizeDIEmissionKind(node->getEmissionKind());
   return DICompileUnitAttr::get(
-      context, DistinctAttr::create(UnitAttr::get(context)),
-      node->getSourceLanguage(), translate(node->getFile()),
-      getStringAttrOrNull(node->getRawProducer()), node->isOptimized(),
-      emissionKind.value());
+      context, getOrCreateDistinctID(node), node->getSourceLanguage(),
+      translate(node->getFile()), getStringAttrOrNull(node->getRawProducer()),
+      node->isOptimized(), emissionKind.value());
 }
 
 DICompositeTypeAttr DebugImporter::translateImpl(llvm::DICompositeType *node) {
@@ -64,11 +64,7 @@ DICompositeTypeAttr DebugImporter::translateImpl(llvm::DICompositeType *node) {
     assert(element && "expected a non-null element type");
     elements.push_back(translate(element));
   }
-  // Drop the elements parameter if a cyclic dependency is detected. We
-  // currently cannot model these cycles and thus drop the parameter if
-  // required. A cyclic dependency is detected if one of the element nodes
-  // translates to a nullptr since the node is already on the translation stack.
-  // TODO: Support debug metadata with cyclic dependencies.
+  // Drop the elements parameter if any of the elements are invalid.
   if (llvm::is_contained(elements, nullptr))
     elements.clear();
   DITypeAttr baseType = translate(node->getBaseType());
@@ -77,14 +73,15 @@ DICompositeTypeAttr DebugImporter::translateImpl(llvm::DICompositeType *node) {
   if (node->getTag() == llvm::dwarf::DW_TAG_array_type && !baseType)
     return nullptr;
   return DICompositeTypeAttr::get(
-      context, node->getTag(), getStringAttrOrNull(node->getRawName()),
-      translate(node->getFile()), node->getLine(), translate(node->getScope()),
-      baseType, flags.value_or(DIFlags::Zero), node->getSizeInBits(),
+      context, node->getTag(), /*recId=*/{},
+      getStringAttrOrNull(node->getRawName()), translate(node->getFile()),
+      node->getLine(), translate(node->getScope()), baseType,
+      flags.value_or(DIFlags::Zero), node->getSizeInBits(),
       node->getAlignInBits(), elements);
 }
 
 DIDerivedTypeAttr DebugImporter::translateImpl(llvm::DIDerivedType *node) {
-  // Return nullptr if the base type is a cyclic dependency.
+  // Return nullptr if the base type is invalid.
   DITypeAttr baseType = translate(node->getBaseType());
   if (node->getBaseType() && !baseType)
     return nullptr;
@@ -179,10 +176,10 @@ DISubprogramAttr DebugImporter::translateImpl(llvm::DISubprogram *node) {
   // Only definitions require a distinct identifier.
   mlir::DistinctAttr id;
   if (node->isDistinct())
-    id = DistinctAttr::create(UnitAttr::get(context));
+    id = getOrCreateDistinctID(node);
   std::optional<DISubprogramFlags> subprogramFlags =
       symbolizeDISubprogramFlags(node->getSubprogram()->getSPFlags());
-  // Return nullptr if the scope or type is a cyclic dependency.
+  // Return nullptr if the scope or type is invalid.
   DIScopeAttr scope = translate(node->getScope());
   if (node->getScope() && !scope)
     return nullptr;
@@ -229,7 +226,7 @@ DebugImporter::translateImpl(llvm::DISubroutineType *node) {
     }
     types.push_back(translate(type));
   }
-  // Return nullptr if any of the types is a cyclic dependency.
+  // Return nullptr if any of the types is invalid.
   if (llvm::is_contained(types, nullptr))
     return nullptr;
   return DISubroutineTypeAttr::get(context, node->getCC(), types);
@@ -247,12 +244,42 @@ DINodeAttr DebugImporter::translate(llvm::DINode *node) {
   if (DINodeAttr attr = nodeToAttr.lookup(node))
     return attr;
 
-  // Return nullptr if a cyclic dependency is detected since the same node is
-  // being traversed twice. This check avoids infinite recursion if the debug
-  // metadata contains cycles.
-  if (!translationStack.insert(node))
-    return nullptr;
-  auto guard = llvm::make_scope_exit([&]() { translationStack.pop_back(); });
+  // If the node type is capable of being recursive, check if it's seen before.
+  auto recSelfCtor = getRecSelfConstructor(node);
+  if (recSelfCtor) {
+    // If a cyclic dependency is detected since the same node is being traversed
+    // twice, emit a recursive self type, and mark the duplicate node on the
+    // translationStack so it can emit a recursive decl type.
+    auto [iter, inserted] = translationStack.try_emplace(node, nullptr);
+    if (!inserted) {
+      // The original node may have already been assigned a recursive ID from
+      // a different self-reference. Use that if possible.
+      DistinctAttr recId = iter->second;
+      if (!recId) {
+        recId = DistinctAttr::create(UnitAttr::get(context));
+        iter->second = recId;
+      }
+      unboundRecursiveSelfRefs.back().insert(recId);
+      return cast<DINodeAttr>(recSelfCtor(recId));
+    }
+  }
+
+  unboundRecursiveSelfRefs.emplace_back();
+
+  auto guard = llvm::make_scope_exit([&]() {
+    if (recSelfCtor)
+      translationStack.pop_back();
+
+    // Copy unboundRecursiveSelfRefs down to the previous level.
+    if (unboundRecursiveSelfRefs.size() == 1)
+      assert(unboundRecursiveSelfRefs.back().empty() &&
+             "internal error: unbound recursive self reference at top level.");
+    else
+      unboundRecursiveSelfRefs[unboundRecursiveSelfRefs.size() - 2].insert(
+          unboundRecursiveSelfRefs.back().begin(),
+          unboundRecursiveSelfRefs.back().end());
+    unboundRecursiveSelfRefs.pop_back();
+  });
 
   // Convert the debug metadata if possible.
   auto translateNode = [this](llvm::DINode *node) -> DINodeAttr {
@@ -289,7 +316,19 @@ DINodeAttr DebugImporter::translate(llvm::DINode *node) {
     return nullptr;
   };
   if (DINodeAttr attr = translateNode(node)) {
-    nodeToAttr.insert({node, attr});
+    // If this node was marked as recursive, set its recId.
+    if (auto recType = dyn_cast<DIRecursiveTypeAttrInterface>(attr)) {
+      if (DistinctAttr recId = translationStack.lookup(node)) {
+        attr = cast<DINodeAttr>(recType.withRecId(recId));
+        // Remove the unbound recursive ID from the set of unbound self
+        // references in the translation stack.
+        unboundRecursiveSelfRefs.back().erase(recId);
+      }
+    }
+
+    // Only cache fully self-contained nodes.
+    if (unboundRecursiveSelfRefs.back().empty())
+      nodeToAttr.try_emplace(node, attr);
     return attr;
   }
   return nullptr;
@@ -346,3 +385,20 @@ StringAttr DebugImporter::getStringAttrOrNull(llvm::MDString *stringNode) {
     return StringAttr();
   return StringAttr::get(context, stringNode->getString());
 }
+
+DistinctAttr DebugImporter::getOrCreateDistinctID(llvm::DINode *node) {
+  DistinctAttr &id = nodeToDistinctAttr[node];
+  if (!id)
+    id = DistinctAttr::create(UnitAttr::get(context));
+  return id;
+}
+
+function_ref<DIRecursiveTypeAttrInterface(DistinctAttr)>
+DebugImporter::getRecSelfConstructor(llvm::DINode *node) {
+  using CtorType = function_ref<DIRecursiveTypeAttrInterface(DistinctAttr)>;
+  return TypeSwitch<llvm::DINode *, CtorType>(node)
+      .Case([&](llvm::DICompositeType *concreteNode) {
+        return CtorType(DICompositeTypeAttr::getRecSelf);
+      })
+      .Default(CtorType());
+}
diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.h b/mlir/lib/Target/LLVMIR/DebugImporter.h
index 7d4a371284b68..bcf628fc4234f 100644
--- a/mlir/lib/Target/LLVMIR/DebugImporter.h
+++ b/mlir/lib/Target/LLVMIR/DebugImporter.h
@@ -82,12 +82,28 @@ class DebugImporter {
   /// null attribute otherwise.
   StringAttr getStringAttrOrNull(llvm::MDString *stringNode);
 
+  /// Get the DistinctAttr used to represent `node` if one was already created
+  /// for it, or create a new one if not.
+  DistinctAttr getOrCreateDistinctID(llvm::DINode *node);
+
+  /// Get the `getRecSelf` constructor for the translated type of `node` if its
+  /// translated DITypeAttr supports recursion. Otherwise, returns nullptr.
+  function_ref<DIRecursiveTypeAttrInterface(DistinctAttr)>
+  getRecSelfConstructor(llvm::DINode *node);
+
   /// A mapping between LLVM debug metadata and the corresponding attribute.
   DenseMap<llvm::DINode *, DINodeAttr> nodeToAttr;
+  /// A mapping between distinct LLVM debug metadata nodes and the corresponding
+  /// distinct id attribute.
+  DenseMap<llvm::DINode *, DistinctAttr> nodeToDistinctAttr;
 
   /// A stack that stores the metadata nodes that are being traversed. The stack
   /// is used to detect cyclic dependencies during the metadata translation.
-  SetVector<llvm::DINode *> translationStack;
+  /// A node is pushed with a null value. If it is ever seen twice, it is given
+  /// a recursive id attribute, indicating that it is a recursive node.
+  llvm::MapVector<llvm::DINode *, DistinctAttr> translationStack;
+  /// All the unbound recursive self references in the translation stack.
+  SmallVector<DenseSet<DistinctAttr>> unboundRecursiveSelfRefs;
 
   MLIRContext *context;
   ModuleOp mlirModule;
diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
index 16918aab54978..126a671a58e80 100644
--- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
@@ -116,12 +116,20 @@ static DINodeT *getDistinctOrUnique(bool isDistinct, Ts &&...args) {
   return DINodeT::get(std::forward<Ts>(args)...);
 }
 
+llvm::TempDICompositeType
+DebugTranslation::translateTemporaryImpl(DICompositeTypeAttr attr) {
+  return llvm::DICompositeType::getTemporary(
+      llvmCtx, attr.getTag(), getMDStringOrNull(attr.getName()), nullptr,
+      attr.getLine(), nullptr, nullptr, attr.getSizeInBits(),
+      attr.getAlignInBits(),
+      /*OffsetInBits=*/0,
+      /*Flags=*/static_cast<llvm::DINode::DIFlags>(attr.getFlags()),
+      /*Elements=*/nullptr, /*RuntimeLang=*/0,
+      /*VTableHolder=*/nullptr);
+}
+
 llvm::DICompositeType *
 DebugTranslation::translateImpl(DICompositeTypeAttr attr) {
-  SmallVector<llvm::Metadata *> elements;
-  for (auto member : attr.getElements())
-    elements.push_back(translate(member));
-
   // TODO: Use distinct attributes to model this, once they have landed.
   // Depending on the tag, composite types must be distinct.
   bool isDistinct = false;
@@ -133,6 +141,10 @@ DebugTranslation::translateImpl(DICompositeTypeAttr attr) {
     isDistinct = true;
   }
 
+  SmallVector<llvm::Metadata *> elements;
+  for (DINodeAttr member : attr.getElements())
+    elements.push_back(translate(member));
+
   return getDistinctOrUnique<llvm::DICompositeType>(
       isDistinct, llvmCtx, attr.getTag(), getMDStringOrNull(attr.getName()),
       translate(attr.getFile()), attr.getLine(), translate(attr.getScope()),
@@ -150,7 +162,8 @@ llvm::DIDerivedType *DebugTranslation::translateImpl(DIDerivedTypeAttr attr) {
       /*File=*/nullptr, /*Line=*/0,
       /*Scope=*/nullptr, translate(attr.getBaseType()), attr.getSizeInBits(),
       attr.getAlignInBits(), attr.getOffsetInBits(),
-      /*DWARFAddressSpace=*/std::nullopt, /*Flags=*/llvm::DINode::FlagZero);
+      /*DWARFAddressSpace=*/std::nullopt, /*PtrAuthData=*/std::nullopt,
+      /*Flags=*/llvm::DINode::FlagZero);
 }
 
 llvm::DIFile *DebugTranslation::translateImpl(DIFileAttr attr) {
@@ -200,22 +213,76 @@ DebugTranslation::translateImpl(DIGlobalVariableAttr attr) {
       attr.getIsDefined(), nullptr, nullptr, attr.getAlignInBits(), nullptr);
 }
 
+llvm::DIType *
+DebugTranslation::translateRecursive(DIRecursiveTypeAttrInterface attr) {
+  DistinctAttr recursiveId = attr.getRecId();
+  if (attr.isRecSelf()) {
+    auto *iter = recursiveTypeMap.find(recursiveId);
+    assert(iter != recursiveTypeMap.end() && "unbound DI recursive self type");
+    return iter->second;
+  }
+
+  auto setRecursivePlaceholder = [&](llvm::DIType *placeholder) {
+    [[maybe_unused]] auto [iter, inserted] =
+        recursiveTypeMap.try_emplace(recursiveId, placeholder);
+    (void)iter;
+    (void)inserted;
+    assert(inserted && "illegal reuse of recursive id");
+  };
+
+  llvm::DIType *result =
+      TypeSwitch<DIRecursiveTypeAttrInterface, llvm::DIType *>(attr)
+          .Case<DICompositeTypeAttr>([&](auto attr) {
+            auto temporary = translateTemporaryImpl(attr);
+            setRecursivePlaceholder(temporary.get());
+            // Must call `translateImpl` directly instead of `translate` to
+            // avoid handling the recursive interface again.
+            auto *concrete = translateImpl(attr);
+            temporary->replaceAllUsesWith(concrete);
+            return concrete;
+          });
+
+  assert(recursiveTypeMap.back().first == recursiveId &&
+         "internal inconsistency: unexpected recursive translation stack");
+  recursiveTypeMap.pop_back();
+
+  return result;
+}
+
 llvm::DIScope *DebugTranslation::translateImpl(DIScopeAttr attr) {
   return cast<llvm::DIScope>(translate(DINodeAttr(attr)));
 }
 
 llvm::DISubprogram *DebugTranslation::translateImpl(DISubprogramAttr attr) {
+  if (auto iter = distinctAttrToNode.find(attr.getId());
+      iter != distinctAttrToNode.end())
+    return cast<llvm::DISubprogram>(iter->second);
+
+  llvm::DIScope *scope = translate(attr.getScope());
+  llvm::DIFile *file = translate(attr.getFile());
+  llvm::DIType *type = translate(attr.getType());
+  llvm::DICompileUnit *compileUnit = translate(attr.getCompileUnit());
+
+  // Check again after recursive calls in case this distinct node recurses back
+  // to itself.
+  if (auto iter = distinctAttrToNode.find(attr.getId());
+      iter != distinctAttrToNode.end())
+    return cast<llvm::DISubprogram>(iter->second);
+
   bool isDefinition = static_cast<bool>(attr.getSubprogramFlags() &
                                         LLVM::DISubprogramFlags::Definition);
-  return getDistinctOrUnique<llvm::DISubprogram>(
-      isDefinition, llvmCtx, translate(attr.getScope()),
-      getMDStringOrNull(attr.getName()),
-      getMDStringOrNull(attr.getLinkageName()), translate(attr.getFile()),
-      attr.getLine(), translate(attr.getType()), attr.getScopeLine(),
+  llvm::DISubprogram *node = getDistinctOrUnique<llvm::DISubprogram>(
+      isDefinition, llvmCtx, scope, getMDStringOrNull(attr.getName()),
+      getMDStringOrNull(attr.getLinkageName()), file, attr.getLine(), type,
+      attr.getScopeLine(),
       /*ContainingType=*/nullptr, /*VirtualIndex=*/0,
       /*ThisAdjustment=*/0, llvm::DINode::FlagZero,
       static_cast<llvm::DISubprogram::DISPFlags>(attr.getSubprogramFlags()),
-      translate(attr.getCompileUnit()));
+      compileUnit);
+
+  if (attr.getId())
+    distinctAttrToNode.try_emplace(attr.getId(), node);
+  return node;
 }
 
 llvm::DIModule *DebugTranslation::translateImpl(DIModuleAttr attr) {
@@ -268,15 +335,23 @@ llvm::DINode *DebugTranslation::translate(DINodeAttr attr) {
   if (llvm::DINode *node = attrToNode.lookup(attr))
     return node;
 
-  llvm::DINode *node =
-      TypeSwitch<DINodeAttr, llvm::DINode *>(attr)
-          .Case<DIBasicTypeAttr, DICompileUnitAttr, DICompositeTypeAttr,
-                DIDerivedTypeAttr, DIFileAttr, DIGlobalVariableAttr,
-                DILabelAttr, DILexicalBlockAttr, DILexicalBlockFileAttr,
-                DILocalVariableAttr, DIModuleAttr, DINamespaceAttr,
-                DINullTypeAttr, DISubprogramAttr, DISubrangeAttr,
-                DISubroutineTypeAttr>(
-              [&](auto attr) { return translateImpl(attr); });
+  llvm::DINode *node = nullptr;
+  // Recursive types go through a dedicated handler. All other types are
+  // dispatched directly to their specific handlers.
+  if (auto recTypeAttr = dyn_cast<DIRecursiveTypeAttrInterface>(attr))
+    if (recTypeAttr.getRecId())
+      node = translateRecursive(recTypeAttr);
+
+  if (!node)
+    node = TypeSwitch<DINodeAttr, llvm::DINode *>(attr)
+               .Case<DIBasicTypeAttr, DICompileUnitAttr, DICompositeTypeAttr,
+                     DIDerivedTypeAttr, DIFileAttr, DIGlobalVariableAttr,
+                     DILabelAttr, DILexicalBlockAttr, DILexicalBlockFileAttr,
+                     DILocalVariableAttr, DIModuleAttr, DINamespaceAttr,
+                     DINullTypeAttr, DISubprogramAttr, DISubrangeAttr,
+                     DISubroutineTypeAttr>(
+                   [&](auto attr) { return translateImpl(attr); });
+
   attrToNode.insert({attr, node});
   return node;
 }
diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.h b/mlir/lib/Target/LLVMIR/DebugTranslation.h
index 627c684684498..c7a5228cbf5e9 100644
--- a/mlir/lib/Target/LLVMIR/DebugTranslation.h
+++ b/mlir/lib/Target/LLVMIR/DebugTranslation.h
@@ -88,6 +88,17 @@ class DebugTranslation {
   llvm::DISubroutineType *translateImpl(DISubroutineTypeAttr attr);
   llvm::DIType *translateImpl(DITypeAttr attr);
 
+  /// Attributes that support self recursion need to implement an additional
+  /// method to hook into `translateRecursive`.
+  /// - `<temp llvm type> translateTemporaryImpl(<mlir type>)`:
+  ///   Create a temporary translation of the DI attr without recursively
+  ///   translating any nested DI attrs.
+  llvm::DIType *translateRecursive(DIRecursiveTypeAttrInterface attr);
+
+  /// Translate the given attribute to a temporary llvm debug metadata of the
+  /// corresponding type.
+  llvm::TempDICompositeType translateTemporaryImpl(DICompositeTypeAttr attr);
+
   /// Constructs a string metadata node from the string attribute. Returns
   /// nullptr if `stringAttr` is null or contains and empty string.
   llvm::MDString *getMDStringOrNull(StringAttr stringAttr);
@@ -102,6 +113,14 @@ class DebugTranslation {
   /// metadata.
   DenseMap<Attribute, llvm::DINode *> attrToNode;
 
+  /// A mapping between recursive ID and the translated DIType.
+  llvm::MapVector<DistinctAttr, llvm::DIType *> recursiveTypeMap;
+
+  /// A mapping between a distinct ID and the translated LLVM metadata node.
+  /// This helps identify attrs that should translate into the same LLVM debug
+  /// node.
+  DenseMap<DistinctAttr, llvm::DINode *> distinctAttrToNode;
+
   /// A mapping between filename and llvm debug file.
   /// TODO: Change this to DenseMap<Identifier, ...> when we can
   /// access the Identifier filename in FileLineColLoc.
diff --git a/mlir/lib/Tools/lsp-server-support/Transport.cpp b/mlir/lib/Tools/lsp-server-support/Transport.cpp
index df675cf78210b..64dea35614c07 100644
--- a/mlir/lib/Tools/lsp-server-support/Transport.cpp
+++ b/mlir/lib/Tools/lsp-server-support/Transport.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Tools/lsp-server-support/Transport.h"
+#include "mlir/Support/ToolUtilities.h"
 #include "mlir/Tools/lsp-server-support/Logging.h"
 #include "mlir/Tools/lsp-server-support/Protocol.h"
 #include "llvm/ADT/SmallString.h"
@@ -347,7 +348,7 @@ LogicalResult JSONTransport::readDelimitedMessage(std::string &json) {
     StringRef lineRef = line.str().trim();
     if (lineRef.starts_with("//")) {
       // Found a delimiter for the message.
-      if (lineRef == "// -----")
+      if (lineRef == kDefaultSplitMarker)
         break;
       continue;
     }
diff --git a/mlir/lib/Tools/mlir-lsp-server/MLIRServer.cpp b/mlir/lib/Tools/mlir-lsp-server/MLIRServer.cpp
index de657a3df9ef7..ed75b4a90536e 100644
--- a/mlir/lib/Tools/mlir-lsp-server/MLIRServer.cpp
+++ b/mlir/lib/Tools/mlir-lsp-server/MLIRServer.cpp
@@ -15,6 +15,7 @@
 #include "mlir/IR/Operation.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Parser/Parser.h"
+#include "mlir/Support/ToolUtilities.h"
 #include "mlir/Tools/lsp-server-support/Logging.h"
 #include "mlir/Tools/lsp-server-support/SourceMgrUtils.h"
 #include "llvm/ADT/StringExtras.h"
@@ -1052,11 +1053,8 @@ MLIRTextFile::MLIRTextFile(const lsp::URIForFile &uri, StringRef fileContents,
   context.allowUnregisteredDialects();
 
   // Split the file into separate MLIR documents.
-  // TODO: Find a way to share the split file marker with other tools. We don't
-  // want to use `splitAndProcessBuffer` here, but we do want to make sure this
-  // marker doesn't go out of sync.
   SmallVector<StringRef, 8> subContents;
-  StringRef(contents).split(subContents, "// -----");
+  StringRef(contents).split(subContents, kDefaultSplitMarker);
   chunks.emplace_back(std::make_unique<MLIRTextFileChunk>(
       context, /*lineOffset=*/0, uri, subContents.front(), diagnostics));
 
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index b62557153b416..44c5e9826f3b7 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -31,6 +31,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/FileUtilities.h"
+#include "mlir/Support/LogicalResult.h"
 #include "mlir/Support/Timing.h"
 #include "mlir/Support/ToolUtilities.h"
 #include "mlir/Tools/ParseUtilities.h"
@@ -127,11 +128,21 @@ struct MlirOptMainConfigCLOptions : public MlirOptMainConfig {
         cl::desc("Print the list of registered dialects and exit"),
         cl::location(showDialectsFlag), cl::init(false));
 
-    static cl::opt<bool, /*ExternalStorage=*/true> splitInputFile(
-        "split-input-file",
-        cl::desc("Split the input file into pieces and process each "
-                 "chunk independently"),
-        cl::location(splitInputFileFlag), cl::init(false));
+    static cl::opt<std::string, /*ExternalStorage=*/true> splitInputFile{
+        "split-input-file", llvm::cl::ValueOptional,
+        cl::callback([&](const std::string &str) {
+          // Implicit value: use default marker if flag was used without value.
+          if (str.empty())
+            splitInputFile.setValue(kDefaultSplitMarker);
+        }),
+        cl::desc("Split the input file into chunks using the given or "
+                 "default marker and process each chunk independently"),
+        cl::location(splitInputFileFlag), cl::init("")};
+
+    static cl::opt<std::string, /*ExternalStorage=*/true> outputSplitMarker(
+        "output-split-marker",
+        cl::desc("Split marker to use for merging the ouput"),
+        cl::location(outputSplitMarkerFlag), cl::init(kDefaultSplitMarker));
 
     static cl::opt<bool, /*ExternalStorage=*/true> verifyDiagnostics(
         "verify-diagnostics",
@@ -503,15 +514,19 @@ mlir::registerAndParseCLIOptions(int argc, char **argv,
   return std::make_pair(inputFilename.getValue(), outputFilename.getValue());
 }
 
+static LogicalResult printRegisteredDialects(DialectRegistry &registry) {
+  llvm::outs() << "Available Dialects: ";
+  interleave(registry.getDialectNames(), llvm::outs(), ",");
+  llvm::outs() << "\n";
+  return success();
+}
+
 LogicalResult mlir::MlirOptMain(llvm::raw_ostream &outputStream,
                                 std::unique_ptr<llvm::MemoryBuffer> buffer,
                                 DialectRegistry &registry,
                                 const MlirOptMainConfig &config) {
-  if (config.shouldShowDialects()) {
-    llvm::outs() << "Available Dialects: ";
-    interleave(registry.getDialectNames(), llvm::outs(), ",");
-    llvm::outs() << "\n";
-  }
+  if (config.shouldShowDialects())
+    return printRegisteredDialects(registry);
 
   // The split-input-file mode is a very specific mode that slices the file
   // up into small pieces and checks each independently.
@@ -533,8 +548,8 @@ LogicalResult mlir::MlirOptMain(llvm::raw_ostream &outputStream,
                          threadPool);
   };
   return splitAndProcessBuffer(std::move(buffer), chunkFn, outputStream,
-                               config.shouldSplitInputFile(),
-                               /*insertMarkerInOutput=*/true);
+                               config.inputSplitMarker(),
+                               config.outputSplitMarker());
 }
 
 LogicalResult mlir::MlirOptMain(int argc, char **argv,
@@ -546,6 +561,9 @@ LogicalResult mlir::MlirOptMain(int argc, char **argv,
 
   MlirOptMainConfig config = MlirOptMainConfig::createFromCLOptions();
 
+  if (config.shouldShowDialects())
+    return printRegisteredDialects(registry);
+
   // When reading from stdin and the input is a tty, it is often a user mistake
   // and the process "appears to be stuck". Print a message to let the user know
   // about it!
diff --git a/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp b/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
index a5c6c2bb2c6a0..d282ee8f61d8f 100644
--- a/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
+++ b/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
@@ -10,6 +10,7 @@
 
 #include "Protocol.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/Support/ToolUtilities.h"
 #include "mlir/Tools/PDLL/AST/Context.h"
 #include "mlir/Tools/PDLL/AST/Nodes.h"
 #include "mlir/Tools/PDLL/AST/Types.h"
@@ -1621,7 +1622,8 @@ PDLTextFile::getPDLLViewOutput(lsp::PDLLViewOutputKind kind) {
         [&](PDLTextFileChunk &chunk) {
           chunk.document.getPDLLViewOutput(outputOS, kind);
         },
-        [&] { outputOS << "\n// -----\n\n"; });
+        [&] { outputOS << "\n"
+                       << kDefaultSplitMarker << "\n\n"; });
   }
   return result;
 }
@@ -1632,11 +1634,8 @@ void PDLTextFile::initialize(const lsp::URIForFile &uri, int64_t newVersion,
   chunks.clear();
 
   // Split the file into separate PDL documents.
-  // TODO: Find a way to share the split file marker with other tools. We don't
-  // want to use `splitAndProcessBuffer` here, but we do want to make sure this
-  // marker doesn't go out of sync.
   SmallVector<StringRef, 8> subContents;
-  StringRef(contents).split(subContents, "// -----");
+  StringRef(contents).split(subContents, kDefaultSplitMarker);
   chunks.emplace_back(std::make_unique<PDLTextFileChunk>(
       /*lineOffset=*/0, uri, subContents.front(), extraIncludeDirs,
       diagnostics));
diff --git a/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp b/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp
index 92adb8d6ac97c..bd9928950ecc7 100644
--- a/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp
+++ b/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp
@@ -62,11 +62,16 @@ LogicalResult mlir::mlirTranslateMain(int argc, char **argv,
       llvm::cl::desc("Allow operation with no registered dialects (discouraged: testing only!)"),
       llvm::cl::init(false));
 
-  static llvm::cl::opt<bool> splitInputFile(
-      "split-input-file",
-      llvm::cl::desc("Split the input file into pieces and "
-                     "process each chunk independently"),
-      llvm::cl::init(false));
+  static llvm::cl::opt<std::string> inputSplitMarker{
+      "split-input-file", llvm::cl::ValueOptional,
+      llvm::cl::callback([&](const std::string &str) {
+        // Implicit value: use default marker if flag was used without value.
+        if (str.empty())
+          inputSplitMarker.setValue(kDefaultSplitMarker);
+      }),
+      llvm::cl::desc("Split the input file into chunks using the given or "
+                     "default marker and process each chunk independently"),
+      llvm::cl::init("")};
 
   static llvm::cl::opt<bool> verifyDiagnostics(
       "verify-diagnostics",
@@ -80,6 +85,11 @@ LogicalResult mlir::mlirTranslateMain(int argc, char **argv,
                      "(discouraged: testing only!)"),
       llvm::cl::init(false));
 
+  static llvm::cl::opt<std::string> outputSplitMarker(
+      "output-split-marker",
+      llvm::cl::desc("Split marker to use for merging the ouput"),
+      llvm::cl::init(""));
+
   llvm::InitLLVM y(argc, argv);
 
   // Add flags for all the registered translations.
@@ -176,7 +186,8 @@ LogicalResult mlir::mlirTranslateMain(int argc, char **argv,
   };
 
   if (failed(splitAndProcessBuffer(std::move(input), processBuffer,
-                                   output->os(), splitInputFile)))
+                                   output->os(), inputSplitMarker,
+                                   outputSplitMarker)))
     return failure();
 
   output->keep();
diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp
index f3a973d999408..84ac69b4514b4 100644
--- a/mlir/lib/Transforms/Mem2Reg.cpp
+++ b/mlir/lib/Transforms/Mem2Reg.cpp
@@ -14,10 +14,8 @@
 #include "mlir/IR/Value.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
-#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/GenericIteratedDominanceFrontier.h"
@@ -635,13 +633,6 @@ LogicalResult mlir::tryToPromoteMemorySlots(
   return success(promotedAny);
 }
 
-LogicalResult
-Mem2RegPattern::matchAndRewrite(PromotableAllocationOpInterface allocator,
-                                PatternRewriter &rewriter) const {
-  hasBoundedRewriteRecursion();
-  return tryToPromoteMemorySlots({allocator}, rewriter, statistics);
-}
-
 namespace {
 
 struct Mem2Reg : impl::Mem2RegBase<Mem2Reg> {
@@ -650,17 +641,36 @@ struct Mem2Reg : impl::Mem2RegBase<Mem2Reg> {
   void runOnOperation() override {
     Operation *scopeOp = getOperation();
 
-    Mem2RegStatistics statictics{&promotedAmount, &newBlockArgumentAmount};
+    Mem2RegStatistics statistics{&promotedAmount, &newBlockArgumentAmount};
+
+    bool changed = false;
+
+    for (Region &region : scopeOp->getRegions()) {
+      if (region.getBlocks().empty())
+        continue;
 
-    GreedyRewriteConfig config;
-    config.enableRegionSimplification = enableRegionSimplification;
+      OpBuilder builder(&region.front(), region.front().begin());
+      IRRewriter rewriter(builder);
 
-    RewritePatternSet rewritePatterns(&getContext());
-    rewritePatterns.add<Mem2RegPattern>(&getContext(), statictics);
-    FrozenRewritePatternSet frozen(std::move(rewritePatterns));
+      // Promoting a slot can allow for further promotion of other slots,
+      // promotion is tried until no promotion succeeds.
+      while (true) {
+        SmallVector<PromotableAllocationOpInterface> allocators;
+        // Build a list of allocators to attempt to promote the slots of.
+        region.walk([&](PromotableAllocationOpInterface allocator) {
+          allocators.emplace_back(allocator);
+        });
 
-    if (failed(applyPatternsAndFoldGreedily(scopeOp, frozen, config)))
-      signalPassFailure();
+        // Attempt promoting until no promotion succeeds.
+        if (failed(tryToPromoteMemorySlots(allocators, rewriter, statistics)))
+          break;
+
+        changed = true;
+        getAnalysisManager().invalidate({});
+      }
+    }
+    if (!changed)
+      markAllAnalysesPreserved();
   }
 };
 
diff --git a/mlir/lib/Transforms/SROA.cpp b/mlir/lib/Transforms/SROA.cpp
index 3ceda51e1c894..6111489bdebef 100644
--- a/mlir/lib/Transforms/SROA.cpp
+++ b/mlir/lib/Transforms/SROA.cpp
@@ -9,7 +9,6 @@
 #include "mlir/Transforms/SROA.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 
 namespace mlir {
@@ -205,13 +204,6 @@ LogicalResult mlir::tryToDestructureMemorySlots(
   return success(destructuredAny);
 }
 
-LogicalResult
-SROAPattern::matchAndRewrite(DestructurableAllocationOpInterface allocator,
-                             PatternRewriter &rewriter) const {
-  hasBoundedRewriteRecursion();
-  return tryToDestructureMemorySlots({allocator}, rewriter, statistics);
-}
-
 namespace {
 
 struct SROA : public impl::SROABase<SROA> {
@@ -223,12 +215,35 @@ struct SROA : public impl::SROABase<SROA> {
     SROAStatistics statistics{&destructuredAmount, &slotsWithMemoryBenefit,
                               &maxSubelementAmount};
 
-    RewritePatternSet rewritePatterns(&getContext());
-    rewritePatterns.add<SROAPattern>(&getContext(), statistics);
-    FrozenRewritePatternSet frozen(std::move(rewritePatterns));
+    bool changed = false;
+
+    for (Region &region : scopeOp->getRegions()) {
+      if (region.getBlocks().empty())
+        continue;
 
-    if (failed(applyPatternsAndFoldGreedily(scopeOp, frozen)))
-      signalPassFailure();
+      OpBuilder builder(&region.front(), region.front().begin());
+      IRRewriter rewriter(builder);
+
+      // Destructuring a slot can allow for further destructuring of other
+      // slots, destructuring is tried until no destructuring succeeds.
+      while (true) {
+        SmallVector<DestructurableAllocationOpInterface> allocators;
+        // Build a list of allocators to attempt to destructure the slots of.
+        // TODO: Update list on the fly to avoid repeated visiting of the same
+        // allocators.
+        region.walk([&](DestructurableAllocationOpInterface allocator) {
+          allocators.emplace_back(allocator);
+        });
+
+        if (failed(
+                tryToDestructureMemorySlots(allocators, rewriter, statistics)))
+          break;
+
+        changed = true;
+      }
+    }
+    if (!changed)
+      markAllAnalysesPreserved();
   }
 };
 
diff --git a/mlir/test/CAPI/llvm.c b/mlir/test/CAPI/llvm.c
index 2fd98b29f487c..48b887ee4a954 100644
--- a/mlir/test/CAPI/llvm.c
+++ b/mlir/test/CAPI/llvm.c
@@ -301,7 +301,7 @@ static void testDebugInfoAttributes(MlirContext ctx) {
 
   // CHECK: #llvm.di_composite_type<{{.*}}>
   mlirAttributeDump(mlirLLVMDICompositeTypeAttrGet(
-      ctx, 0, foo, file, 1, compile_unit, di_type, 0, 64, 8, 1, &di_type));
+      ctx, 0, id, foo, file, 1, compile_unit, di_type, 0, 64, 8, 1, &di_type));
 
   MlirAttribute subroutine_type =
       mlirLLVMDISubroutineTypeAttrGet(ctx, 0x0, 1, &di_type);
diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
index c80ba48cf8c1a..5918ff2e0f36c 100644
--- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
+++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
@@ -708,11 +708,60 @@ func.func @complex_tanh(%arg: complex<f32>) -> complex<f32> {
 // -----
 
 // CHECK-LABEL: func @complex_sqrt
+// CHECK-SAME: %[[ARG:.*]]: complex<f32>
 func.func @complex_sqrt(%arg: complex<f32>) -> complex<f32> {
   %sqrt = complex.sqrt %arg : complex<f32>
   return %sqrt : complex<f32>
 }
 
+// CHECK: %[[CST:.*]]  = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAR0:.*]] = complex.re %[[ARG]] : complex<f32>
+// CHECK: %[[VAR1:.*]] = complex.im %[[ARG]] : complex<f32>
+// CHECK: %[[VAR2:.*]] = math.absf %[[VAR0]] : f32
+// CHECK: %[[CST0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[CST1:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK: %[[VAR3:.*]] = complex.re %[[ARG]] : complex<f32>
+// CHECK: %[[VAR4:.*]] = complex.im %[[ARG]] : complex<f32>
+// CHECK: %[[VAR5:.*]] = arith.cmpf oeq, %[[VAR3]], %[[CST0]] : f32
+// CHECK: %[[VAR6:.*]] = arith.cmpf oeq, %[[VAR4]], %[[CST0]] : f32
+// CHECK: %[[VAR7:.*]] = arith.divf %[[VAR4]], %[[VAR3]] : f32
+// CHECK: %[[VAR8:.*]] = arith.mulf %[[VAR7]], %[[VAR7]] : f32
+// CHECK: %[[VAR9:.*]] = arith.addf %[[VAR8]], %[[CST1]] : f32
+// CHECK: %[[VAR10:.*]] = math.sqrt %[[VAR9]] : f32
+// CHECK: %[[VAR11:.*]] = math.absf %[[VAR3]] : f32
+// CHECK: %[[VAR12:.*]] = arith.mulf %[[VAR10]], %[[VAR11]] : f32
+// CHECK: %[[VAR13:.*]] = arith.divf %[[VAR3]], %[[VAR4]] : f32
+// CHECK: %[[VAR14:.*]] = arith.mulf %[[VAR13]], %[[VAR13]] : f32
+// CHECK: %[[VAR15:.*]] = arith.addf %[[VAR14]], %[[CST1]] : f32
+// CHECK: %[[VAR16:.*]] = math.sqrt %[[VAR15]] : f32
+// CHECK: %[[VAR17:.*]] = math.absf %[[VAR4]] : f32
+// CHECK: %[[VAR18:.*]] = arith.mulf %[[VAR16]], %[[VAR17]] : f32
+// CHECK: %[[VAR19:.*]] = arith.cmpf ogt, %[[VAR3]], %[[VAR4]] : f32
+// CHECK: %[[VAR20:.*]] = arith.select %[[VAR19]], %[[VAR12]], %[[VAR18]] : f32
+// CHECK: %[[VAR21:.*]] = arith.select %[[VAR6]], %[[VAR11]], %[[VAR20]] : f32
+// CHECK: %[[VAR22:.*]] = arith.select %[[VAR5]], %[[VAR17]], %[[VAR21]] : f32
+// CHECK: %[[VAR23:.*]] = arith.addf %[[VAR2]], %[[VAR22]] : f32
+// CHECK: %[[CST2:.*]]  = arith.constant 5.000000e-01 : f32
+// CHECK: %[[VAR24:.*]] = arith.mulf %[[VAR23]], %[[CST2]] : f32
+// CHECK: %[[VAR25:.*]] = math.sqrt %[[VAR24]] : f32
+// CHECK: %[[VAR26:.*]] = arith.cmpf olt, %[[VAR0]], %cst : f32
+// CHECK: %[[VAR27:.*]] = arith.cmpf olt, %[[VAR1]], %cst : f32
+// CHECK: %[[VAR28:.*]] = arith.addf %[[VAR25]], %[[VAR25]] : f32
+// CHECK: %[[VAR29:.*]] = arith.divf %[[VAR1]], %[[VAR28]] : f32
+// CHECK: %[[VAR30:.*]] = arith.negf %[[VAR25]] : f32
+// CHECK: %[[VAR31:.*]] = arith.select %[[VAR27]], %[[VAR30]], %[[VAR25]] : f32
+// CHECK: %[[VAR32:.*]] = arith.select %[[VAR26]], %[[VAR31]], %[[VAR29]] : f32
+// CHECK: %[[VAR33:.*]] = arith.addf %[[VAR32]], %[[VAR32]] : f32
+// CHECK: %[[VAR34:.*]] = arith.divf %[[VAR1]], %[[VAR33]] : f32
+// CHECK: %[[VAR35:.*]] = arith.select %[[VAR26]], %[[VAR34]], %[[VAR25]] : f32
+// CHECK: %[[VAR36:.*]] = arith.cmpf oeq, %[[VAR0]], %cst : f32
+// CHECK: %[[VAR37:.*]] = arith.cmpf oeq, %[[VAR1]], %cst : f32
+// CHECK: %[[VAR38:.*]] = arith.andi %[[VAR36]], %[[VAR37]] : i1
+// CHECK: %[[VAR39:.*]] = arith.select %[[VAR38]], %cst, %[[VAR35]] : f32
+// CHECK: %[[VAR40:.*]] = arith.select %[[VAR38]], %cst, %[[VAR32]] : f32
+// CHECK: %[[VAR41:.*]] = complex.create %[[VAR39]], %[[VAR40]] : complex<f32>
+// CHECK: return %[[VAR41]] : complex<f32>
+
 // -----
 
 // CHECK-LABEL: func @complex_conj
@@ -1254,42 +1303,42 @@ func.func @complex_atan2_with_fmf(%lhs: complex<f32>,
 // CHECK: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK: %[[VAR187:.*]] = complex.re %[[VAR186]] : complex<f32>
 // CHECK: %[[VAR188:.*]] = complex.im %[[VAR186]] : complex<f32>
-// CHECK: %[[VAR189:.*]] = math.absf %[[VAR187]] : f32
+// CHECK: %[[VAR189:.*]] = math.absf %[[VAR187]] fastmath<nnan,contract> : f32
 // CHECK: %[[CST_7:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK: %[[CST_8:.*]] = arith.constant 1.000000e+00 : f32
 // CHECK: %[[VAR190:.*]] = complex.re %[[VAR186]] : complex<f32>
 // CHECK: %[[VAR191:.*]] = complex.im %[[VAR186]] : complex<f32>
 // CHECK: %[[VAR192:.*]] = arith.cmpf oeq, %[[VAR190]], %[[CST_7]] : f32
 // CHECK: %[[VAR193:.*]] = arith.cmpf oeq, %[[VAR191]], %[[CST_7]] : f32
-// CHECK: %[[VAR194:.*]] = arith.divf %[[VAR191]], %[[VAR190]] : f32
-// CHECK: %[[VAR195:.*]] = arith.mulf %[[VAR194]], %[[VAR194]] : f32
-// CHECK: %[[VAR196:.*]] = arith.addf %[[VAR195]], %[[CST_8]] : f32
-// CHECK: %[[VAR197:.*]] = math.sqrt %[[VAR196]] : f32
-// CHECK: %[[VAR198:.*]] = math.absf %[[VAR190]] : f32
-// CHECK: %[[VAR199:.*]] = arith.mulf %[[VAR197]], %[[VAR198]] : f32
-// CHECK: %[[VAR200:.*]] = arith.divf %[[VAR190]], %[[VAR191]] : f32
-// CHECK: %[[VAR201:.*]] = arith.mulf %[[VAR200]], %[[VAR200]] : f32
-// CHECK: %[[VAR202:.*]] = arith.addf %[[VAR201]], %[[CST_8]] : f32
-// CHECK: %[[VAR203:.*]] = math.sqrt %[[VAR202]] : f32
-// CHECK: %[[VAR204:.*]] = math.absf %[[VAR191]] : f32
-// CHECK: %[[VAR205:.*]] = arith.mulf %[[VAR203]], %[[VAR204]] : f32
+// CHECK: %[[VAR194:.*]] = arith.divf %[[VAR191]], %[[VAR190]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR195:.*]] = arith.mulf %[[VAR194]], %[[VAR194]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR196:.*]] = arith.addf %[[VAR195]], %[[CST_8]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR197:.*]] = math.sqrt %[[VAR196]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR198:.*]] = math.absf %[[VAR190]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR199:.*]] = arith.mulf %[[VAR197]], %[[VAR198]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR200:.*]] = arith.divf %[[VAR190]], %[[VAR191]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR201:.*]] = arith.mulf %[[VAR200]], %[[VAR200]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR202:.*]] = arith.addf %[[VAR201]], %[[CST_8]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR203:.*]] = math.sqrt %[[VAR202]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR204:.*]] = math.absf %[[VAR191]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR205:.*]] = arith.mulf %[[VAR203]], %[[VAR204]] fastmath<nnan,contract> : f32
 // CHECK: %[[VAR206:.*]] = arith.cmpf ogt, %[[VAR190]], %[[VAR191]] : f32
 // CHECK: %[[VAR207:.*]] = arith.select %[[VAR206]], %[[VAR199]], %[[VAR205]] : f32
 // CHECK: %[[VAR208:.*]] = arith.select %[[VAR193]], %[[VAR198]], %[[VAR207]] : f32
 // CHECK: %[[VAR209:.*]] = arith.select %[[VAR192]], %[[VAR204]], %[[VAR208]] : f32
-// CHECK: %[[VAR210:.*]] = arith.addf %[[VAR189]], %[[VAR209]] : f32
+// CHECK: %[[VAR210:.*]] = arith.addf %[[VAR189]], %[[VAR209]] fastmath<nnan,contract> : f32
 // CHECK: %[[CST_9:.*]] = arith.constant 5.000000e-01 : f32
-// CHECK: %[[VAR211:.*]] = arith.mulf %[[VAR210]], %[[CST_9]] : f32
-// CHECK: %[[VAR212:.*]] = math.sqrt %[[VAR211]] : f32
+// CHECK: %[[VAR211:.*]] = arith.mulf %[[VAR210]], %[[CST_9]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR212:.*]] = math.sqrt %[[VAR211]] fastmath<nnan,contract> : f32
 // CHECK: %[[VAR213:.*]] = arith.cmpf olt, %[[VAR187]], %[[CST_6]] : f32
 // CHECK: %[[VAR214:.*]] = arith.cmpf olt, %[[VAR188]], %[[CST_6]] : f32
-// CHECK: %[[VAR215:.*]] = arith.addf %[[VAR212]], %[[VAR212]] : f32
-// CHECK: %[[VAR216:.*]] = arith.divf %[[VAR188]], %[[VAR215]] : f32
+// CHECK: %[[VAR215:.*]] = arith.addf %[[VAR212]], %[[VAR212]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR216:.*]] = arith.divf %[[VAR188]], %[[VAR215]] fastmath<nnan,contract> : f32
 // CHECK: %[[VAR217:.*]] = arith.negf %[[VAR212]] : f32
 // CHECK: %[[VAR218:.*]] = arith.select %[[VAR214]], %[[VAR217]], %[[VAR212]] : f32
 // CHECK: %[[VAR219:.*]] = arith.select %[[VAR213]], %[[VAR218]], %[[VAR216]] : f32
-// CHECK: %[[VAR220:.*]] = arith.addf %[[VAR219]], %[[VAR219]] : f32
-// CHECK: %[[VAR221:.*]] = arith.divf %[[VAR188]], %[[VAR220]] : f32
+// CHECK: %[[VAR220:.*]] = arith.addf %[[VAR219]], %[[VAR219]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR221:.*]] = arith.divf %[[VAR188]], %[[VAR220]] fastmath<nnan,contract> : f32
 // CHECK: %[[VAR222:.*]] = arith.select %[[VAR213]], %[[VAR221]], %[[VAR212]] : f32
 // CHECK: %[[VAR223:.*]] = arith.cmpf oeq, %[[VAR187]], %[[CST_6]] : f32
 // CHECK: %[[VAR224:.*]] = arith.cmpf oeq, %[[VAR188]], %[[CST_6]] : f32
@@ -1728,3 +1777,60 @@ func.func @complex_div_with_fmf(%lhs: complex<f32>, %rhs: complex<f32>) -> compl
 // CHECK: %[[RESULT_IMAG_WITH_SPECIAL_CASES:.*]] = arith.select %[[RESULT_IS_NAN]], %[[RESULT_IMAG_SPECIAL_CASE_1]], %[[RESULT_IMAG]] : f32
 // CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL_WITH_SPECIAL_CASES]], %[[RESULT_IMAG_WITH_SPECIAL_CASES]] : complex<f32>
 // CHECK: return %[[RESULT]] : complex<f32>
+
+// -----
+
+// CHECK-LABEL: func @complex_sqrt_with_fmf
+// CHECK-SAME: %[[ARG:.*]]: complex<f32>
+func.func @complex_sqrt_with_fmf(%arg: complex<f32>) -> complex<f32> {
+  %sqrt = complex.sqrt %arg fastmath<nnan,contract> : complex<f32>
+  return %sqrt : complex<f32>
+}
+
+// CHECK: %[[CST:.*]]  = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAR0:.*]] = complex.re %[[ARG]] : complex<f32>
+// CHECK: %[[VAR1:.*]] = complex.im %[[ARG]] : complex<f32>
+// CHECK: %[[VAR2:.*]] = math.absf %[[VAR0]] fastmath<nnan,contract> : f32
+// CHECK: %[[CST0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[CST1:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK: %[[VAR3:.*]] = complex.re %[[ARG]] : complex<f32>
+// CHECK: %[[VAR4:.*]] = complex.im %[[ARG]] : complex<f32>
+// CHECK: %[[VAR5:.*]] = arith.cmpf oeq, %[[VAR3]], %[[CST0]] : f32
+// CHECK: %[[VAR6:.*]] = arith.cmpf oeq, %[[VAR4]], %[[CST0]] : f32
+// CHECK: %[[VAR7:.*]] = arith.divf %[[VAR4]], %[[VAR3]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR8:.*]] = arith.mulf %[[VAR7]], %[[VAR7]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR9:.*]] = arith.addf %[[VAR8]], %[[CST1]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR10:.*]] = math.sqrt %[[VAR9]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR11:.*]] = math.absf %[[VAR3]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR12:.*]] = arith.mulf %[[VAR10]], %[[VAR11]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR13:.*]] = arith.divf %[[VAR3]], %[[VAR4]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR14:.*]] = arith.mulf %[[VAR13]], %[[VAR13]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR15:.*]] = arith.addf %[[VAR14]], %[[CST1]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR16:.*]] = math.sqrt %[[VAR15]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR17:.*]] = math.absf %[[VAR4]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR18:.*]] = arith.mulf %[[VAR16]], %[[VAR17]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR19:.*]] = arith.cmpf ogt, %[[VAR3]], %[[VAR4]] : f32
+// CHECK: %[[VAR20:.*]] = arith.select %[[VAR19]], %[[VAR12]], %[[VAR18]] : f32
+// CHECK: %[[VAR21:.*]] = arith.select %[[VAR6]], %[[VAR11]], %[[VAR20]] : f32
+// CHECK: %[[VAR22:.*]] = arith.select %[[VAR5]], %[[VAR17]], %[[VAR21]] : f32
+// CHECK: %[[VAR23:.*]] = arith.addf %[[VAR2]], %[[VAR22]] fastmath<nnan,contract> : f32
+// CHECK: %[[CST2:.*]]  = arith.constant 5.000000e-01 : f32
+// CHECK: %[[VAR24:.*]] = arith.mulf %[[VAR23]], %[[CST2]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR25:.*]] = math.sqrt %[[VAR24]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR26:.*]] = arith.cmpf olt, %[[VAR0]], %cst : f32
+// CHECK: %[[VAR27:.*]] = arith.cmpf olt, %[[VAR1]], %cst : f32
+// CHECK: %[[VAR28:.*]] = arith.addf %[[VAR25]], %[[VAR25]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR29:.*]] = arith.divf %[[VAR1]], %[[VAR28]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR30:.*]] = arith.negf %[[VAR25]] : f32
+// CHECK: %[[VAR31:.*]] = arith.select %[[VAR27]], %[[VAR30]], %[[VAR25]] : f32
+// CHECK: %[[VAR32:.*]] = arith.select %[[VAR26]], %[[VAR31]], %[[VAR29]] : f32
+// CHECK: %[[VAR33:.*]] = arith.addf %[[VAR32]], %[[VAR32]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR34:.*]] = arith.divf %[[VAR1]], %[[VAR33]] fastmath<nnan,contract> : f32
+// CHECK: %[[VAR35:.*]] = arith.select %[[VAR26]], %[[VAR34]], %[[VAR25]] : f32
+// CHECK: %[[VAR36:.*]] = arith.cmpf oeq, %[[VAR0]], %cst : f32
+// CHECK: %[[VAR37:.*]] = arith.cmpf oeq, %[[VAR1]], %cst : f32
+// CHECK: %[[VAR38:.*]] = arith.andi %[[VAR36]], %[[VAR37]] : i1
+// CHECK: %[[VAR39:.*]] = arith.select %[[VAR38]], %cst, %[[VAR35]] : f32
+// CHECK: %[[VAR40:.*]] = arith.select %[[VAR38]], %cst, %[[VAR32]] : f32
+// CHECK: %[[VAR41:.*]] = complex.create %[[VAR39]], %[[VAR40]] : complex<f32>
+// CHECK: return %[[VAR41]] : complex<f32>
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 0ac7331e1f698..8920bf86d89b1 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -680,3 +680,15 @@ func.func @fence_proxy() {
   nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cluster>}
   func.return
 }
+
+// -----
+
+// CHECK-LABEL: @llvm_nvvm_barrier_arrive
+// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
+llvm.func @llvm_nvvm_barrier_arrive(%barID : i32, %numberOfThreads : i32) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive 0, $0;", "r" %[[numberOfThreads]] : (i32) -> ()
+  nvvm.barrier.arrive number_of_threads = %numberOfThreads
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive $0, $1'", "r,r" %[[barId]], %[[numberOfThreads]] : (i32, i32) -> ()
+  nvvm.barrier.arrive id = %barID number_of_threads = %numberOfThreads
+  llvm.return
+}
diff --git a/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir
index 9fe1e532dfc77..31da59dcdc726 100644
--- a/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir
+++ b/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir
@@ -43,18 +43,18 @@ spirv.func @composite_insert_vector(%arg0: vector<3xf32>, %arg1: f32) "None" {
 //===----------------------------------------------------------------------===//
 
 // CHECK-LABEL: @select_scalar
-spirv.func @select_scalar(%arg0: i1, %arg1: vector<3xi32>, %arg2: f32) "None" {
+spirv.func @select_scalar(%arg0: i1, %arg1: vector<3xi32>, %arg2: vector<3xi32>, %arg3: f32, %arg4: f32) "None" {
   // CHECK: llvm.select %{{.*}}, %{{.*}}, %{{.*}} : i1, vector<3xi32>
-  %0 = spirv.Select %arg0, %arg1, %arg1 : i1, vector<3xi32>
+  %0 = spirv.Select %arg0, %arg1, %arg2 : i1, vector<3xi32>
   // CHECK: llvm.select %{{.*}}, %{{.*}}, %{{.*}} : i1, f32
-  %1 = spirv.Select %arg0, %arg2, %arg2 : i1, f32
+  %1 = spirv.Select %arg0, %arg3, %arg4 : i1, f32
   spirv.Return
 }
 
 // CHECK-LABEL: @select_vector
-spirv.func @select_vector(%arg0: vector<2xi1>, %arg1: vector<2xi32>) "None" {
+spirv.func @select_vector(%arg0: vector<2xi1>, %arg1: vector<2xi32>, %arg2: vector<2xi32>) "None" {
   // CHECK: llvm.select %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi1>, vector<2xi32>
-  %0 = spirv.Select %arg0, %arg1, %arg1 : vector<2xi1>, vector<2xi32>
+  %0 = spirv.Select %arg0, %arg1, %arg2 : vector<2xi1>, vector<2xi32>
   spirv.Return
 }
 
diff --git a/mlir/test/Dialect/Affine/access-analysis.mlir b/mlir/test/Dialect/Affine/access-analysis.mlir
index 68310b9323535..789de646a8f9e 100644
--- a/mlir/test/Dialect/Affine/access-analysis.mlir
+++ b/mlir/test/Dialect/Affine/access-analysis.mlir
@@ -1,13 +1,14 @@
 // RUN: mlir-opt %s -split-input-file -test-affine-access-analysis -verify-diagnostics | FileCheck %s
 
-// CHECK-LABEL: func @loop_1d
-func.func @loop_1d(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+// CHECK-LABEL: func @loop_simple
+func.func @loop_simple(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
    %c0 = arith.constant 0 : index
    %M = memref.dim %A, %c0 : memref<?x?xf32>
    affine.for %i = 0 to %M {
      affine.for %j = 0 to %M {
        affine.load %A[%c0, %i] : memref<?x?xf32>
        // expected-remark@above {{contiguous along loop 0}}
+       // expected-remark@above {{invariant along loop 1}}
        affine.load %A[%c0, 8 * %i + %j] : memref<?x?xf32>
        // expected-remark@above {{contiguous along loop 1}}
        // Note/FIXME: access stride isn't being checked.
@@ -15,6 +16,7 @@ func.func @loop_1d(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 
        // These are all non-contiguous along both loops. Nothing is emitted.
        affine.load %A[%i, %c0] : memref<?x?xf32>
+       // expected-remark@above {{invariant along loop 1}}
        // Note/FIXME: access stride isn't being checked.
        affine.load %A[%i, 8 * %j] : memref<?x?xf32>
        // expected-remark@above {{contiguous along loop 1}}
@@ -27,6 +29,22 @@ func.func @loop_1d(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
 
 // -----
 
+// CHECK-LABEL: func @loop_unsimplified
+func.func @loop_unsimplified(%A : memref<100xf32>) {
+   affine.for %i = 0 to 100 {
+     affine.load %A[2 * %i - %i - %i] : memref<100xf32>
+     // expected-remark@above {{invariant along loop 0}}
+
+     %m = affine.apply affine_map<(d0) -> (-2 * d0)>(%i)
+     %n = affine.apply affine_map<(d0) -> (2 * d0)>(%i)
+     affine.load %A[(%m + %n) floordiv 2] : memref<100xf32>
+     // expected-remark@above {{invariant along loop 0}}
+   }
+   return
+}
+
+// -----
+
 #map = affine_map<(d0) -> (d0 * 16)>
 #map1 = affine_map<(d0) -> (d0 * 16 + 16)>
 #map2 = affine_map<(d0) -> (d0)>
@@ -41,11 +59,19 @@ func.func @tiled(%arg0: memref<*xf32>) {
         %alloc_0 = memref.alloc() : memref<1x16x1x16xf32>
         affine.for %arg4 = #map(%arg1) to #map1(%arg1) {
           affine.for %arg5 = #map(%arg3) to #map1(%arg3) {
+            // TODO: here and below, the access isn't really invariant
+            // along tile-space IVs where the intra-tile IVs' bounds
+            // depend on them.
             %0 = affine.load %cast[%arg4] : memref<64xf32>
             // expected-remark@above {{contiguous along loop 3}}
+            // expected-remark@above {{invariant along loop 0}}
+            // expected-remark@above {{invariant along loop 1}}
+            // expected-remark@above {{invariant along loop 2}}
+            // expected-remark@above {{invariant along loop 4}}
             affine.store %0, %alloc_0[0, %arg1 * -16 + %arg4, 0, %arg3 * -16 + %arg5] : memref<1x16x1x16xf32>
             // expected-remark@above {{contiguous along loop 4}}
             // expected-remark@above {{contiguous along loop 2}}
+            // expected-remark@above {{invariant along loop 1}}
           }
         }
         affine.for %arg4 = #map(%arg1) to #map1(%arg1) {
@@ -56,6 +82,9 @@ func.func @tiled(%arg0: memref<*xf32>) {
               // expected-remark@above {{contiguous along loop 2}}
               affine.store %0, %alloc[0, %arg5, %arg6, %arg4] : memref<1x224x224x64xf32>
               // expected-remark@above {{contiguous along loop 3}}
+              // expected-remark@above {{invariant along loop 0}}
+              // expected-remark@above {{invariant along loop 1}}
+              // expected-remark@above {{invariant along loop 2}}
             }
           }
         }
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index cb98a10048a30..bdc6c91d92677 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -116,18 +116,6 @@ func.func @selToNot(%arg0: i1) -> i1 {
   return %res : i1
 }
 
-// CHECK-LABEL: @selToArith
-//       CHECK-NEXT:       %[[trueval:.+]] = arith.constant true
-//       CHECK-NEXT:       %[[notcmp:.+]] = arith.xori %arg0, %[[trueval]] : i1
-//       CHECK-NEXT:       %[[condtrue:.+]] = arith.andi %arg0, %arg1 : i1
-//       CHECK-NEXT:       %[[condfalse:.+]] = arith.andi %[[notcmp]], %arg2 : i1
-//       CHECK-NEXT:       %[[res:.+]] = arith.ori %[[condtrue]], %[[condfalse]] : i1
-//       CHECK:   return %[[res]]
-func.func @selToArith(%arg0: i1, %arg1 : i1, %arg2 : i1) -> i1 {
-  %res = arith.select %arg0, %arg1, %arg2 : i1
-  return %res : i1
-}
-
 // CHECK-LABEL: @redundantSelectTrue
 //       CHECK-NEXT: %[[res:.+]] = arith.select %arg0, %arg1, %arg3
 //       CHECK-NEXT: return %[[res]]
@@ -160,74 +148,6 @@ func.func @selNotCond(%arg0: i1, %arg1 : i32, %arg2 : i32, %arg3 : i32, %arg4 :
   return %res1, %res2 : i32, i32
 }
 
-// CHECK-LABEL: @selAndCond
-//       CHECK-NEXT: %[[and:.+]] = arith.andi %arg1, %arg0
-//       CHECK-NEXT: %[[res:.+]] = arith.select %[[and]], %arg2, %arg3
-//       CHECK-NEXT: return %[[res]]
-func.func @selAndCond(%arg0: i1, %arg1: i1, %arg2 : i32, %arg3 : i32) -> i32 {
-  %sel = arith.select %arg0, %arg2, %arg3 : i32
-  %res = arith.select %arg1, %sel, %arg3 : i32
-  return %res : i32
-}
-
-// CHECK-LABEL: @selAndNotCond
-//       CHECK-NEXT: %[[one:.+]] = arith.constant true
-//       CHECK-NEXT: %[[not:.+]] = arith.xori %arg0, %[[one]]
-//       CHECK-NEXT: %[[and:.+]] = arith.andi %arg1, %[[not]]
-//       CHECK-NEXT: %[[res:.+]] = arith.select %[[and]], %arg3, %arg2
-//       CHECK-NEXT: return %[[res]]
-func.func @selAndNotCond(%arg0: i1, %arg1: i1, %arg2 : i32, %arg3 : i32) -> i32 {
-  %sel = arith.select %arg0, %arg2, %arg3 : i32
-  %res = arith.select %arg1, %sel, %arg2 : i32
-  return %res : i32
-}
-
-// CHECK-LABEL: @selAndNotCondVec
-//       CHECK-NEXT: %[[one:.+]] = arith.constant dense<true> : vector<4xi1>
-//       CHECK-NEXT: %[[not:.+]] = arith.xori %arg0, %[[one]]
-//       CHECK-NEXT: %[[and:.+]] = arith.andi %arg1, %[[not]]
-//       CHECK-NEXT: %[[res:.+]] = arith.select %[[and]], %arg3, %arg2
-//       CHECK-NEXT: return %[[res]]
-func.func @selAndNotCondVec(%arg0: vector<4xi1>, %arg1: vector<4xi1>, %arg2 : vector<4xi32>, %arg3 : vector<4xi32>) -> vector<4xi32> {
-  %sel = arith.select %arg0, %arg2, %arg3 : vector<4xi1>, vector<4xi32>
-  %res = arith.select %arg1, %sel, %arg2 : vector<4xi1>, vector<4xi32>
-  return %res : vector<4xi32>
-}
-
-// CHECK-LABEL: @selOrCond
-//       CHECK-NEXT: %[[or:.+]] = arith.ori %arg1, %arg0
-//       CHECK-NEXT: %[[res:.+]] = arith.select %[[or]], %arg2, %arg3
-//       CHECK-NEXT: return %[[res]]
-func.func @selOrCond(%arg0: i1, %arg1: i1, %arg2 : i32, %arg3 : i32) -> i32 {
-  %sel = arith.select %arg0, %arg2, %arg3 : i32
-  %res = arith.select %arg1, %arg2, %sel : i32
-  return %res : i32
-}
-
-// CHECK-LABEL: @selOrNotCond
-//       CHECK-NEXT: %[[one:.+]] = arith.constant true
-//       CHECK-NEXT: %[[not:.+]] = arith.xori %arg0, %[[one]]
-//       CHECK-NEXT: %[[or:.+]] = arith.ori %arg1, %[[not]]
-//       CHECK-NEXT: %[[res:.+]] = arith.select %[[or]], %arg3, %arg2
-//       CHECK-NEXT: return %[[res]]
-func.func @selOrNotCond(%arg0: i1, %arg1: i1, %arg2 : i32, %arg3 : i32) -> i32 {
-  %sel = arith.select %arg0, %arg2, %arg3 : i32
-  %res = arith.select %arg1, %arg3, %sel : i32
-  return %res : i32
-}
-
-// CHECK-LABEL: @selOrNotCondVec
-//       CHECK-NEXT: %[[one:.+]] = arith.constant dense<true> : vector<4xi1>
-//       CHECK-NEXT: %[[not:.+]] = arith.xori %arg0, %[[one]]
-//       CHECK-NEXT: %[[or:.+]] = arith.ori %arg1, %[[not]]
-//       CHECK-NEXT: %[[res:.+]] = arith.select %[[or]], %arg3, %arg2
-//       CHECK-NEXT: return %[[res]]
-func.func @selOrNotCondVec(%arg0: vector<4xi1>, %arg1: vector<4xi1>, %arg2 : vector<4xi32>, %arg3 : vector<4xi32>) -> vector<4xi32> {
-  %sel = arith.select %arg0, %arg2, %arg3 : vector<4xi1>, vector<4xi32>
-  %res = arith.select %arg1, %arg3, %sel : vector<4xi1>, vector<4xi32>
-  return %res : vector<4xi32>
-}
-
 // Test case: Folding of comparisons with equal operands.
 // CHECK-LABEL: @cmpi_equal_operands
 //   CHECK-DAG:   %[[T:.*]] = arith.constant true
diff --git a/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir b/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir
index cba7b00ba77a8..e2be87453bf6f 100644
--- a/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir
+++ b/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir
@@ -40,3 +40,97 @@ func.func @test_lower_vector_arm_neon_without_extsi(%lhs: vector<2x8xi32>, %rhs:
   %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs, %rhs, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32>
   return %res : vector<2x2xi32>
 }
+
+// -----
+
+// CHECK-LABEL: test_lower_vector_arm_neon_unroll
+// CHECK-SAME: %[[VAL_0:.*]]: vector<4x8xi8>, %[[VAL_1:.*]]: vector<4x8xi8>, %[[VAL_2:.*]]: vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_3:.*]] = arith.constant dense<0> : vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_4:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_5:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_6:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_7:.*]] = vector.shape_cast %[[VAL_4]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_8:.*]] = vector.shape_cast %[[VAL_5]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_9:.*]] = vector.shape_cast %[[VAL_6]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_10:.*]] = arm_neon.intr.smmla %[[VAL_9]], %[[VAL_7]], %[[VAL_8]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_11:.*]] = vector.shape_cast %[[VAL_10]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_12:.*]] = vector.insert_strided_slice %[[VAL_11]], %[[VAL_3]] {offsets = [0, 0], strides = [1, 1]} : vector<2x2xi32> into vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_13:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_14:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_15:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_16:.*]] = vector.shape_cast %[[VAL_13]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_17:.*]] = vector.shape_cast %[[VAL_14]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_18:.*]] = vector.shape_cast %[[VAL_15]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_19:.*]] = arm_neon.intr.smmla %[[VAL_18]], %[[VAL_16]], %[[VAL_17]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_20:.*]] = vector.shape_cast %[[VAL_19]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_21:.*]] = vector.insert_strided_slice %[[VAL_20]], %[[VAL_12]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xi32> into vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_22:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_23:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_24:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_25:.*]] = vector.shape_cast %[[VAL_22]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_26:.*]] = vector.shape_cast %[[VAL_23]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_27:.*]] = vector.shape_cast %[[VAL_24]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_28:.*]] = arm_neon.intr.smmla %[[VAL_27]], %[[VAL_25]], %[[VAL_26]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_29:.*]] = vector.shape_cast %[[VAL_28]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_30:.*]] = vector.insert_strided_slice %[[VAL_29]], %[[VAL_21]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xi32> into vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_31:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_32:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_33:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_34:.*]] = vector.shape_cast %[[VAL_31]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_35:.*]] = vector.shape_cast %[[VAL_32]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_36:.*]] = vector.shape_cast %[[VAL_33]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_37:.*]] = arm_neon.intr.smmla %[[VAL_36]], %[[VAL_34]], %[[VAL_35]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_38:.*]] = vector.shape_cast %[[VAL_37]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_39:.*]] = vector.insert_strided_slice %[[VAL_38]], %[[VAL_30]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xi32> into vector<4x4xi32>
+// CHECK-DAG:  return %[[VAL_39]] : vector<4x4xi32>
+// CHECK-DAG:  }
+func.func @test_lower_vector_arm_neon_unroll(%lhs: vector<4x8xi8>, %rhs: vector<4x8xi8>, %acc : vector<4x4xi32>) -> vector<4x4xi32> {
+  %lhs_extsi = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %rhs_extsi = arith.extsi %rhs : vector<4x8xi8> to vector<4x8xi32>
+  %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_extsi, %rhs_extsi, %acc : vector<4x8xi32>, vector<4x8xi32> into vector<4x4xi32>
+  return %res : vector<4x4xi32>
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @test_lower_vector_arm_neon_mixed_unroll(
+// CHECK-SAME:                                                       %[[VAL_0:.*]]: vector<4x8xi8>,
+// CHECK-SAME:                                                       %[[VAL_1:.*]]: vector<2x8xi4>,
+// CHECK-SAME:                                                       %[[VAL_2:.*]]: vector<4x2xi32>) -> vector<4x2xi32> {
+// CHECK-DAG:  %[[VAL_3:.*]] = arith.constant dense<0> : vector<4x2xi32>
+// CHECK-DAG:  %[[VAL_4:.*]] = arith.extsi %[[VAL_1]] : vector<2x8xi4> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_5:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_6:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_7:.*]] = vector.shape_cast %[[VAL_5]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_8:.*]] = vector.shape_cast %[[VAL_4]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_9:.*]] = vector.shape_cast %[[VAL_6]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_10:.*]] = arm_neon.intr.smmla %[[VAL_9]], %[[VAL_7]], %[[VAL_8]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_11:.*]] = vector.shape_cast %[[VAL_10]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_12:.*]] = vector.insert_strided_slice %[[VAL_11]], %[[VAL_3]] {offsets = [0, 0], strides = [1, 1]} : vector<2x2xi32> into vector<4x2xi32>
+// CHECK-DAG:  %[[VAL_13:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_14:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_15:.*]] = vector.shape_cast %[[VAL_13]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_16:.*]] = vector.shape_cast %[[VAL_4]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_17:.*]] = vector.shape_cast %[[VAL_14]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_18:.*]] = arm_neon.intr.smmla %[[VAL_17]], %[[VAL_15]], %[[VAL_16]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_19:.*]] = vector.shape_cast %[[VAL_18]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_20:.*]] = vector.insert_strided_slice %[[VAL_19]], %[[VAL_12]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xi32> into vector<4x2xi32>
+// CHECK-DAG:  return %[[VAL_20]] : vector<4x2xi32>
+// CHECK-DAG:  }
+func.func @test_lower_vector_arm_neon_mixed_unroll(%lhs: vector<4x8xi8>, %rhs: vector<2x8xi4>, %acc : vector<4x2xi32>) -> vector<4x2xi32> {
+  %lhs_extsi = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %rhs_extsi = arith.extsi %rhs : vector<2x8xi4> to vector<2x8xi32>
+  %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_extsi, %rhs_extsi, %acc : vector<4x8xi32>, vector<2x8xi32> into vector<4x2xi32>
+  return %res : vector<4x2xi32>
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @test_lower_vector_arm_neon_unroll_incompatible_shape(
+// CHECK-DAG:  %[[result:.*]] = vector.contract
+func.func @test_lower_vector_arm_neon_unroll_incompatible_shape(%lhs: vector<4x12xi8>, %rhs: vector<4x12xi8>, %acc : vector<4x4xi32>) -> vector<4x4xi32> {
+  %lhs_extsi = arith.extsi %lhs : vector<4x12xi8> to vector<4x12xi32>
+  %rhs_extsi = arith.extsi %rhs : vector<4x12xi8> to vector<4x12xi32>
+  %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_extsi, %rhs_extsi, %acc : vector<4x12xi32>, vector<4x12xi32> into vector<4x4xi32>
+  return %res : vector<4x4xi32>
+}
diff --git a/mlir/test/Dialect/Bufferization/invalid.mlir b/mlir/test/Dialect/Bufferization/invalid.mlir
index 83f8ef7861543..4ebdb0a8f0490 100644
--- a/mlir/test/Dialect/Bufferization/invalid.mlir
+++ b/mlir/test/Dialect/Bufferization/invalid.mlir
@@ -26,29 +26,6 @@ func.func @alloc_tensor_copy_and_dims(%t: tensor<?xf32>, %sz: index) {
 
 // -----
 
-#DCSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed, d1 : compressed) }>
-
-func.func @sparse_alloc_direct_return() -> tensor<20x40xf32, #DCSR> {
-  // expected-error @+1{{sparse tensor allocation should not escape function}}
-  %0 = bufferization.alloc_tensor() : tensor<20x40xf32, #DCSR>
-  return %0 : tensor<20x40xf32, #DCSR>
-}
-
-// -----
-
-#DCSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed, d1 : compressed) }>
-
-func.func private @foo(tensor<20x40xf32, #DCSR>) -> ()
-
-func.func @sparse_alloc_call() {
-  // expected-error @+1{{sparse tensor allocation should not escape function}}
-  %0 = bufferization.alloc_tensor() : tensor<20x40xf32, #DCSR>
-  call @foo(%0) : (tensor<20x40xf32, #DCSR>) -> ()
-  return
-}
-
-// -----
-
 // expected-error @+1{{invalid value for 'bufferization.access'}}
 func.func private @invalid_buffer_access_type(tensor<*xf32> {bufferization.access = "foo"})
 
diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir
index 58b3a11ed93e1..6294c853d9993 100644
--- a/mlir/test/Dialect/EmitC/invalid_ops.mlir
+++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir
@@ -235,7 +235,7 @@ func.func @test_misplaced_yield() {
 // -----
 
 func.func @test_assign_to_non_variable(%arg1: f32, %arg2: f32) {
-  // expected-error @+1 {{'emitc.assign' op requires first operand (<block argument> of type 'f32' at index: 1) to be a Variable}}
+  // expected-error @+1 {{'emitc.assign' op requires first operand (<block argument> of type 'f32' at index: 1) to be a Variable or subscript}}
   emitc.assign %arg1 : f32 to %arg2 : f32
   return
 }
@@ -387,3 +387,11 @@ func.func @logical_or_resulterror(%arg0: i32, %arg1: i32) {
   %0 = "emitc.logical_or"(%arg0, %arg1) : (i32, i32) -> i32
   return
 }
+
+// -----
+
+func.func @test_subscript_indices_mismatch(%arg0: !emitc.array<4x8xf32>, %arg2: index) {
+  // expected-error @+1 {{'emitc.subscript' op requires number of indices (1) to match the rank of the array type (2)}}
+  %0 = emitc.subscript %arg0[%arg2] : <4x8xf32>, index
+  return
+}
diff --git a/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir b/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir
index ce6338fb34883..4fc80a87f20df 100644
--- a/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir
+++ b/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(llvm.func(mem2reg{region-simplify=false}))" --split-input-file | FileCheck %s
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(llvm.func(mem2reg))" --split-input-file | FileCheck %s
 
 // CHECK-LABEL: llvm.func @basic_memset
 // CHECK-SAME: (%[[MEMSET_VALUE:.*]]: i8)
@@ -6,13 +6,13 @@ llvm.func @basic_memset(%memset_value: i8) -> i32 {
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr
   %memset_len = llvm.mlir.constant(4 : i32) : i32
-  // CHECK-DAG: %[[C8:.*]] = llvm.mlir.constant(8 : i32) : i32
-  // CHECK-DAG: %[[C16:.*]] = llvm.mlir.constant(16 : i32) : i32
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
   // CHECK-NOT: "llvm.intr.memset"
   // CHECK: %[[VALUE_8:.*]] = llvm.zext %[[MEMSET_VALUE]] : i8 to i32
+  // CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[SHIFTED_8:.*]] = llvm.shl %[[VALUE_8]], %[[C8]]
   // CHECK: %[[VALUE_16:.*]] = llvm.or %[[VALUE_8]], %[[SHIFTED_8]]
+  // CHECK: %[[C16:.*]] = llvm.mlir.constant(16 : i32) : i32
   // CHECK: %[[SHIFTED_16:.*]] = llvm.shl %[[VALUE_16]], %[[C16]]
   // CHECK: %[[VALUE_32:.*]] = llvm.or %[[VALUE_16]], %[[SHIFTED_16]]
   // CHECK-NOT: "llvm.intr.memset"
@@ -31,7 +31,14 @@ llvm.func @basic_memset_constant() -> i32 {
   %memset_len = llvm.mlir.constant(4 : i32) : i32
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
   %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32
-  // CHECK: %[[RES:.*]] = llvm.mlir.constant(707406378 : i32) : i32
+  // CHECK: %[[C42:.*]] = llvm.mlir.constant(42 : i8) : i8
+  // CHECK: %[[VALUE_42:.*]] = llvm.zext %[[C42]] : i8 to i32
+  // CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : i32) : i32
+  // CHECK: %[[SHIFTED_42:.*]] = llvm.shl %[[VALUE_42]], %[[C8]]  : i32
+  // CHECK: %[[OR0:.*]] = llvm.or %[[VALUE_42]], %[[SHIFTED_42]]  : i32
+  // CHECK: %[[C16:.*]] = llvm.mlir.constant(16 : i32) : i32
+  // CHECK: %[[SHIFTED:.*]] = llvm.shl %[[OR0]], %[[C16]]  : i32
+  // CHECK: %[[RES:..*]] = llvm.or %[[OR0]], %[[SHIFTED]]  : i32
   // CHECK: llvm.return %[[RES]] : i32
   llvm.return %2 : i32
 }
@@ -44,16 +51,16 @@ llvm.func @exotic_target_memset(%memset_value: i8) -> i40 {
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x i40 {alignment = 4 : i64} : (i32) -> !llvm.ptr
   %memset_len = llvm.mlir.constant(5 : i32) : i32
-  // CHECK-DAG: %[[C8:.*]] = llvm.mlir.constant(8 : i40) : i40
-  // CHECK-DAG: %[[C16:.*]] = llvm.mlir.constant(16 : i40) : i40
-  // CHECK-DAG: %[[C32:.*]] = llvm.mlir.constant(32 : i40) : i40
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
   // CHECK-NOT: "llvm.intr.memset"
   // CHECK: %[[VALUE_8:.*]] = llvm.zext %[[MEMSET_VALUE]] : i8 to i40
+  // CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : i40) : i40
   // CHECK: %[[SHIFTED_8:.*]] = llvm.shl %[[VALUE_8]], %[[C8]]
   // CHECK: %[[VALUE_16:.*]] = llvm.or %[[VALUE_8]], %[[SHIFTED_8]]
+  // CHECK: %[[C16:.*]] = llvm.mlir.constant(16 : i40) : i40
   // CHECK: %[[SHIFTED_16:.*]] = llvm.shl %[[VALUE_16]], %[[C16]]
   // CHECK: %[[VALUE_32:.*]] = llvm.or %[[VALUE_16]], %[[SHIFTED_16]]
+  // CHECK: %[[C32:.*]] = llvm.mlir.constant(32 : i40) : i40
   // CHECK: %[[SHIFTED_COMPL:.*]] = llvm.shl %[[VALUE_32]], %[[C32]]
   // CHECK: %[[VALUE_COMPL:.*]] = llvm.or %[[VALUE_32]], %[[SHIFTED_COMPL]]
   // CHECK-NOT: "llvm.intr.memset"
@@ -64,21 +71,6 @@ llvm.func @exotic_target_memset(%memset_value: i8) -> i40 {
 
 // -----
 
-// CHECK-LABEL: llvm.func @exotic_target_memset_constant
-llvm.func @exotic_target_memset_constant() -> i40 {
-  %0 = llvm.mlir.constant(1 : i32) : i32
-  %1 = llvm.alloca %0 x i40 {alignment = 4 : i64} : (i32) -> !llvm.ptr
-  %memset_value = llvm.mlir.constant(42 : i8) : i8
-  %memset_len = llvm.mlir.constant(5 : i32) : i32
-  "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
-  %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i40
-  // CHECK: %[[RES:.*]] = llvm.mlir.constant(181096032810 : i40) : i40
-  // CHECK: llvm.return %[[RES]] : i40
-  llvm.return %2 : i40
-}
-
-// -----
-
 // CHECK-LABEL: llvm.func @no_volatile_memset
 llvm.func @no_volatile_memset() -> i32 {
   // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
@@ -195,7 +187,7 @@ llvm.func @basic_memcpy_dest(%destination: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @double_memcpy
 llvm.func @double_memcpy() -> i32 {
   %0 = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-NEXT: %[[DATA:.*]] = llvm.mlir.constant(42 : i32) : i32
+  // CHECK: %[[DATA:.*]] = llvm.mlir.constant(42 : i32) : i32
   %data = llvm.mlir.constant(42 : i32) : i32
   %is_volatile = llvm.mlir.constant(false) : i1
   %memcpy_len = llvm.mlir.constant(4 : i32) : i32
@@ -206,7 +198,7 @@ llvm.func @double_memcpy() -> i32 {
   "llvm.intr.memcpy"(%2, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
 
   %res = llvm.load %2 : !llvm.ptr -> i32
-  // CHECK-NEXT: llvm.return %[[DATA]] : i32
+  // CHECK: llvm.return %[[DATA]] : i32
   llvm.return %res : i32
 }
 
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index f35393c5e9574..de2904d15b647 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -55,6 +55,16 @@ llvm.func @llvm_nvvm_barrier(%barId : i32, %numberOfThreads : i32) {
   llvm.return
 }
 
+// CHECK-LABEL: @llvm_nvvm_barrier_arrive
+// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
+llvm.func @llvm_nvvm_barrier_arrive(%barId : i32, %numberOfThreads : i32) {
+  // CHECK: nvvm.barrier.arrive number_of_threads = %[[numberOfThreads]]
+  nvvm.barrier.arrive number_of_threads = %numberOfThreads
+  // CHECK: nvvm.barrier.arrive id = %[[barId]] number_of_threads = %[[numberOfThreads]]
+  nvvm.barrier.arrive id = %barId number_of_threads = %numberOfThreads
+  llvm.return
+}
+
 // CHECK-LABEL: @llvm_nvvm_cluster_arrive
 func.func @llvm_nvvm_cluster_arrive() {
   // CHECK: nvvm.cluster.arrive
diff --git a/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir b/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
index c2e3458134ba4..ba73025814cc0 100644
--- a/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
+++ b/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
@@ -146,12 +146,10 @@ llvm.func @invalid_indirect_memset() -> i32 {
 
 // CHECK-LABEL: llvm.func @memset_double_use
 llvm.func @memset_double_use() -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // CHECK-DAG: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
-  // CHECK-DAG: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
+  // CHECK: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // CHECK: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
   %memset_value = llvm.mlir.constant(42 : i8) : i8
@@ -159,8 +157,11 @@ llvm.func @memset_double_use() -> i32 {
   %memset_len = llvm.mlir.constant(8 : i32) : i32
   // We expect two generated memset, one for each field.
   // CHECK-NOT: "llvm.intr.memset"
-  // CHECK-DAG: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
+  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
   // CHECK-NOT: "llvm.intr.memset"
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
   %2 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f32)>
@@ -208,13 +209,10 @@ llvm.func @memset_considers_alignment() -> i32 {
 
 // CHECK-LABEL: llvm.func @memset_considers_packing
 llvm.func @memset_considers_packing() -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // CHECK-DAG: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
-  // CHECK-DAG: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
-  // After SROA, only 32-bit values will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMSET_LEN_WHOLE:.*]] = llvm.mlir.constant(4 : i32) : i32
-  // CHECK-DAG: %[[MEMSET_LEN_PARTIAL:.*]] = llvm.mlir.constant(3 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
+  // CHECK: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // CHECK: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.struct<"foo", packed (i8, i32, f32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
   %memset_value = llvm.mlir.constant(42 : i8) : i8
@@ -222,7 +220,10 @@ llvm.func @memset_considers_packing() -> i32 {
   %memset_len = llvm.mlir.constant(8 : i32) : i32
   // Now all fields are touched by the memset.
   // CHECK-NOT: "llvm.intr.memset"
+  // After SROA, only 32-bit values will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMSET_LEN_WHOLE:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN_WHOLE]]) <{isVolatile = false}>
+  // CHECK: %[[MEMSET_LEN_PARTIAL:.*]] = llvm.mlir.constant(3 : i32) : i32
   // CHECK: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN_PARTIAL]]) <{isVolatile = false}>
   // CHECK-NOT: "llvm.intr.memset"
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
@@ -241,14 +242,14 @@ llvm.func @memset_considers_packing() -> i32 {
 // CHECK-LABEL: llvm.func @memcpy_dest
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memcpy_dest(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr
   %memcpy_len = llvm.mlir.constant(40 : i32) : i32
   // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
+  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMCPY_LEN]]) <{isVolatile = false}>
   "llvm.intr.memcpy"(%1, %other_array, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
@@ -261,9 +262,8 @@ llvm.func @memcpy_dest(%other_array: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @memcpy_src
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
   // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr
@@ -271,14 +271,18 @@ llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
   // Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from.
   // We can only check that the amount of operations and allocated slots is correct, which should be sufficient
   // as unused slots are not generated.
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
   "llvm.intr.memcpy"(%other_array, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
   %3 = llvm.load %2 : !llvm.ptr -> i32
@@ -289,14 +293,19 @@ llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
 
 // CHECK-LABEL: llvm.func @memcpy_double
 llvm.func @memcpy_double() -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
   %0 = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-COUNT-2: = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // CHECK: = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // TODO: This should also disappear as a GEP with all zero indices should be
+  // ignored.
+  // CHECK: = llvm.alloca %[[ALLOCA_LEN]] x !llvm.array<1 x i32>
   %1 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr
   %2 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr
+  // Match the dead constant, to avoid collision with the newly created one.
+  // CHECK: llvm.mlir.constant
   %memcpy_len = llvm.mlir.constant(4 : i32) : i32
   // CHECK-NOT: "llvm.intr.memcpy"
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
   // CHECK-NOT: "llvm.intr.memcpy"
   "llvm.intr.memcpy"(%1, %2, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
@@ -346,14 +355,14 @@ llvm.func @memcpy_no_volatile(%other_array: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @memmove_dest
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memmove_dest(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr
   %memmove_len = llvm.mlir.constant(40 : i32) : i32
   // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
+  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memmove"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMMOVE_LEN]]) <{isVolatile = false}>
   "llvm.intr.memmove"(%1, %other_array, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
@@ -366,9 +375,7 @@ llvm.func @memmove_dest(%other_array: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @memmove_src
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memmove_src(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr
@@ -376,14 +383,18 @@ llvm.func @memmove_src(%other_array: !llvm.ptr) -> i32 {
   // Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from.
   // We can only check that the amount of operations and allocated slots is correct, which should be sufficient
   // as unused slots are not generated.
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
   "llvm.intr.memmove"(%other_array, %1, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
   %3 = llvm.load %2 : !llvm.ptr -> i32
diff --git a/mlir/test/Dialect/LLVMIR/sroa.mlir b/mlir/test/Dialect/LLVMIR/sroa.mlir
index f56ea53ac029e..02d25f27f978a 100644
--- a/mlir/test/Dialect/LLVMIR/sroa.mlir
+++ b/mlir/test/Dialect/LLVMIR/sroa.mlir
@@ -197,3 +197,21 @@ llvm.func @no_dynamic_indexing(%arg: i32) -> i32 {
   // CHECK: llvm.return %[[RES]] : i32
   llvm.return %3 : i32
 }
+
+// -----
+
+// CHECK-LABEL: llvm.func @no_nested_dynamic_indexing
+// CHECK-SAME: (%[[ARG:.*]]: i32)
+llvm.func @no_nested_dynamic_indexing(%arg: i32) -> i32 {
+  // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32)
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x !llvm.struct<(array<10 x i32>, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %1 = llvm.alloca %0 x !llvm.struct<(array<10 x i32>, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  // CHECK-NOT: = llvm.alloca
+  // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 0, %[[ARG]]]
+  %2 = llvm.getelementptr %1[0, 0, %arg] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.struct<(array<10 x i32>, i32)>
+  // CHECK: %[[RES:.*]] = llvm.load %[[GEP]]
+  %3 = llvm.load %2 : !llvm.ptr -> i32
+  // CHECK: llvm.return %[[RES]] : i32
+  llvm.return %3 : i32
+}
diff --git a/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir b/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir
index ee66073a9a419..a1a0c413da0c1 100644
--- a/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-matmul-to-outerproduct.mlir
@@ -1,38 +1,51 @@
 // RUN: mlir-opt %s -transform-interpreter | FileCheck %s
 
-func.func @outerproduct_matmul(%A: memref<3x3xf32>, %B: memref<3x3xf32>, %C: memref<3x3xf32>) {
-  linalg.matmul ins(%A, %B: memref<3x3xf32>, memref<3x3xf32>)
+func.func @matmul_to_outerproduct(%A: memref<3x4xf32>, %B: memref<4x3xf32>, %C: memref<3x3xf32>) {
+  linalg.matmul ins(%A, %B: memref<3x4xf32>, memref<4x3xf32>)
             outs(%C: memref<3x3xf32>)
   return
 }
 
-// CHECK-LABEL:   func.func @outerproduct_matmul(
-// CHECK-SAME:      %[[VAL_0:.*]]: memref<3x3xf32>, %[[VAL_1:.*]]: memref<3x3xf32>, %[[VAL_2:.*]]: memref<3x3xf32>) {
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[VAL_5:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : memref<3x3xf32>, vector<3x3xf32>
-// CHECK:           %[[VAL_6:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : memref<3x3xf32>, vector<3x3xf32>
-// CHECK:           %[[VAL_7:.*]] = vector.transfer_read %[[VAL_2]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : memref<3x3xf32>, vector<3x3xf32>
-// CHECK:           %[[VAL_8:.*]] = vector.transpose %[[VAL_5]], [1, 0] : vector<3x3xf32> to vector<3x3xf32>
-// CHECK:           %[[VAL_9:.*]] = vector.extract %[[VAL_8]][0] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_10:.*]] = vector.extract %[[VAL_6]][0] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_11:.*]] = vector.outerproduct %[[VAL_9]], %[[VAL_10]], %[[VAL_7]] {kind = #vector.kind<add>} : vector<3xf32>, vector<3xf32>
-// CHECK:           %[[VAL_12:.*]] = vector.extract %[[VAL_8]][1] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_13:.*]] = vector.extract %[[VAL_6]][1] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_14:.*]] = vector.outerproduct %[[VAL_12]], %[[VAL_13]], %[[VAL_11]] {kind = #vector.kind<add>} : vector<3xf32>, vector<3xf32>
-// CHECK:           %[[VAL_15:.*]] = vector.extract %[[VAL_8]][2] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_16:.*]] = vector.extract %[[VAL_6]][2] : vector<3xf32> from vector<3x3xf32>
-// CHECK:           %[[VAL_17:.*]] = vector.outerproduct %[[VAL_15]], %[[VAL_16]], %[[VAL_14]] {kind = #vector.kind<add>} : vector<3xf32>, vector<3xf32>
-// CHECK:           vector.transfer_write %[[VAL_17]], %[[VAL_2]]{{\[}}%[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true]} : vector<3x3xf32>, memref<3x3xf32>
-// CHECK:           return
-// CHECK:         }
+// CHECK-LABEL:   func.func @matmul_to_outerproduct(
+// CHECK-SAME:      %[[A:.*]]: memref<3x4xf32>,
+// CHECK-SAME:      %[[B:.*]]: memref<4x3xf32>,
+// CHECK-SAME:      %[[C:.*]]: memref<3x3xf32>) {
+// CHECK:           %[[VEC_A:.*]] = vector.transfer_read %[[A]]
+// CHECK:           %[[VEC_B:.*]] = vector.transfer_read %[[B]]
+// CHECK:           %[[VEC_C:.*]] = vector.transfer_read %[[C]]
+// CHECK:           %[[VEC_A_T:.*]] = vector.transpose %[[VEC_A]], [1, 0] : vector<3x4xf32> to vector<4x3xf32>
+// CHECK:           %[[A0:.*]] = vector.extract %[[VEC_A_T]][0] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[B0:.*]] = vector.extract %[[VEC_B]][0] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[OP_0:.*]] = vector.outerproduct %[[A0]], %[[B0]], %[[VEC_C]]
+// CHECK:           %[[A1:.*]] = vector.extract %[[VEC_A_T]][1] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[B1:.*]] = vector.extract %[[VEC_B]][1] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[OP_1:.*]] = vector.outerproduct %[[A1]], %[[B1]], %[[OP_0]]
+// CHECK:           %[[A_2:.*]] = vector.extract %[[VEC_A_T]][2] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[B_2:.*]] = vector.extract %[[VEC_B]][2] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[OP_2:.*]] = vector.outerproduct %[[A_2]], %[[B_2]], %[[OP_1]]
+// CHECK:           %[[A_3:.*]] = vector.extract %[[VEC_A_T]][3] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[B_3:.*]] = vector.extract %[[VEC_B]][3] : vector<3xf32> from vector<4x3xf32>
+// CHECK:           %[[RES:.*]] = vector.outerproduct %[[A_3]], %[[B_3]], %[[OP_2]]
+// CHECK:           vector.transfer_write %[[RES]], %[[C]]{{.*}} : vector<3x3xf32>, memref<3x3xf32>
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %2 {
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.any_op
+
+    // Vectorize: linalg.matmul -> vector.multi_reduction
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %func : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %matmul : !transform.any_op
+
+    // vector.multi_reduction --> vector.contract
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.reduction_to_contract
+      // Reduce the rank of xfer ops. This transform vector.contract to be more
+      // more matmul-like and to enable the lowering to outer product Ops.
+      transform.apply_patterns.vector.transfer_permutation_patterns
+    } : !transform.any_op
+
+    // vector.contract --> vector.outerproduct
+    transform.apply_patterns to %func {
       transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
     } : !transform.any_op
     transform.yield
diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
index a1a52397386c9..9127eac5da951 100644
--- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -19,10 +19,49 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @depthwise_conv1d_nwc_wc_dyn_ch_dim(%input: memref<3x5x?xf32>, %filter: memref<2x?xf32>, %output: memref<3x2x?xf32>) {
+// Masked vectorisation of 1D depthwise CW convs is not yet supported
+
+func.func @depthwise_conv1d_ncw_cw(%input: memref<3x?x4xf32>, %filter: memref<?x1xf32>, %output: memref<3x?x4xf32>) {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  linalg.depthwise_conv_1d_ncw_cw
+    {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %filter : memref<3x?x4xf32>, memref<?x1xf32>)
+    outs(%output : memref<3x?x4xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_ncw_cw"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [3, 4, 5, 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_dyn_w_dim(%input: memref<3x?x4xf32>, %filter: memref<?x4xf32>, %output: memref<3x?x4xf32>) {
   // expected-error @+1 {{Attempted to vectorize, but failed}}
   linalg.depthwise_conv_1d_nwc_wc
     {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %filter : memref<3x?x4xf32>, memref<?x4xf32>)
+    outs(%output : memref<3x?x4xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [3, 2, 4, 2] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_dyn_ch_dim(%input: memref<3x5x?xf32>, %filter: memref<2x?xf32>, %output: memref<3x2x?xf32>) {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  linalg.depthwise_conv_1d_nwc_wc
     ins(%input, %filter : memref<3x5x?xf32>, memref<2x?xf32>)
     outs(%output : memref<3x2x?xf32>)
   return
@@ -41,7 +80,6 @@ module attributes {transform.with_named_sequence} {
 func.func @depthwise_conv1d_nwc_wc_dyn_w_dim(%input: memref<3x?x3xf32>, %filter: memref<2x3xf32>, %output: memref<3x?x3xf32>) {
   // expected-error @+1 {{Attempted to vectorize, but failed}}
   linalg.depthwise_conv_1d_nwc_wc
-    {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %filter : memref<3x?x3xf32>, memref<2x3xf32>)
     outs(%output : memref<3x?x3xf32>)
   return
diff --git a/mlir/test/Dialect/Linalg/vectorize-conv-masked-and-scalable.mlir b/mlir/test/Dialect/Linalg/vectorize-conv-masked-and-scalable.mlir
new file mode 100644
index 0000000000000..84b556d80887c
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/vectorize-conv-masked-and-scalable.mlir
@@ -0,0 +1,185 @@
+// RUN: mlir-opt -split-input-file -transform-interpreter -cse %s | FileCheck %s
+
+func.func @depthwise_conv1d_nwc_wc_1x8x3xi8_tensor(%input: tensor<1x8x?xi8>,
+                                                   %filter: tensor<1x?xi8>,
+                                                   %output: tensor<1x8x?xi8>) -> (tensor<1x8x?xi8>) {
+  %res = linalg.depthwise_conv_1d_nwc_wc
+    {dilations = dense<1> : vector<1xi64>,
+    strides = dense<1> : vector<1xi64>}
+    ins(%input, %filter : tensor<1x8x?xi8>, tensor<1x?xi8>)
+    outs(%output : tensor<1x8x?xi8>) -> tensor<1x8x?xi8>
+  return %res : tensor<1x8x?xi8>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [1, 8, 4, 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK-LABEL:   func.func @depthwise_conv1d_nwc_wc_1x8x3xi8_tensor(
+// CHECK-SAME:      %[[INPUT:.*]]: tensor<1x8x?xi8>,
+// CHECK-SAME:      %[[FILTER:.*]]: tensor<1x?xi8>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: tensor<1x8x?xi8>) -> tensor<1x8x?xi8> {
+
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0 : i8
+
+/// Create a mask for the input tensor
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+// CHECK:           %[[CH_DIM_IN:.*]] = tensor.dim %[[INPUT]], %[[C2]] : tensor<1x8x?xi8>
+// CHECK:           %[[C8:.*]] = arith.constant 8 : index
+// CHECK:           %[[MASK_IN:.*]] = vector.create_mask %[[C1]], %[[C8]], %[[CH_DIM_IN]] : vector<1x8x4xi1>
+/// Read the input tensor
+// CHECK:           %[[VEC_IN:.*]] = vector.mask %[[MASK_IN]] { vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : tensor<1x8x?xi8>, vector<1x8x4xi8> } : vector<1x8x4xi1> -> vector<1x8x4xi8>
+
+/// Create a mask for the filter tensor
+// CHECK:           %[[CH_DIM_FLT:.*]] = tensor.dim %[[FILTER]], %[[C1]] : tensor<1x?xi8>
+// CHECK:           %[[MASK_FLT:.*]] = vector.create_mask %[[C1]], %[[CH_DIM_FLT]] : vector<1x4xi1>
+/// Read the filter tensor
+// CHECK:           %[[VEC_FLT:.*]] = vector.mask %[[MASK_FLT]] { vector.transfer_read %[[FILTER]]{{\[}}%[[C0]], %[[C0]]], %[[PAD]] : tensor<1x?xi8>, vector<1x4xi8> } : vector<1x4xi1> -> vector<1x4xi8>
+
+/// Create a mask for the output tensor
+// CHECK:           %[[CH_DIM_OUT:.*]] = tensor.dim %[[OUTPUT]], %[[C2]] : tensor<1x8x?xi8>
+// CHECK:           %[[MASK_OUT:.*]] = vector.create_mask %[[C1]], %[[C8]], %[[CH_DIM_OUT]] : vector<1x8x4xi1>
+// CHECK:           %[[VEC_OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_read %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : tensor<1x8x?xi8>, vector<1x8x4xi8> } : vector<1x8x4xi1> -> vector<1x8x4xi8>
+
+/// Convolution
+// CHECK:           %[[IN_1:.*]] = vector.extract_strided_slice %[[VEC_IN]] {offsets = [0, 0, 0], sizes = [1, 8, 4], strides = [1, 1, 1]} : vector<1x8x4xi8> to vector<1x8x4xi8>
+// CHECK:           %[[FLT_1:.*]] = vector.extract %[[VEC_FLT]][0] : vector<4xi8> from vector<1x4xi8>
+// CHECK:           %[[OUT_1:.*]] = vector.extract_strided_slice %[[VEC_OUT]] {offsets = [0, 0, 0], sizes = [1, 8, 4], strides = [1, 1, 1]} : vector<1x8x4xi8> to vector<1x8x4xi8>
+// CHECK:           %[[FLT_1_B:.*]] = vector.broadcast %[[FLT_1]] : vector<4xi8> to vector<1x8x4xi8>
+// CHECK:           %[[MULI:.*]] = arith.muli %[[IN_1]], %[[FLT_1_B]] : vector<1x8x4xi8>
+// CHECK:           %[[ADDI:.*]] = arith.addi %[[MULI]], %[[OUT_1]] : vector<1x8x4xi8>
+// CHECK:           %[[OUT_INS:.*]] = vector.insert_strided_slice %[[ADDI]], %[[VEC_OUT]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x8x4xi8> into vector<1x8x4xi8>
+// CHECK:           %[[OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_write %[[OUT_INS]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] : vector<1x8x4xi8>, tensor<1x8x?xi8> } : vector<1x8x4xi1> -> tensor<1x8x?xi8>
+// CHECK:           return %[[OUT]] : tensor<1x8x?xi8>
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_1x8x3xi8_tensor_scalable(
+      %input: tensor<1x8x?xi8>,
+      %filter: tensor<1x?xi8>,
+      %output: tensor<1x8x?xi8>) -> (tensor<1x8x?xi8>) {
+  %res = linalg.depthwise_conv_1d_nwc_wc
+    {dilations = dense<1> : vector<1xi64>,
+    strides = dense<1> : vector<1xi64>}
+    ins(%input, %filter : tensor<1x8x?xi8>, tensor<1x?xi8>)
+    outs(%output : tensor<1x8x?xi8>) -> tensor<1x8x?xi8>
+  return %res : tensor<1x8x?xi8>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [1, 8, [4], 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK-LABEL:   func.func @depthwise_conv1d_nwc_wc_1x8x3xi8_tensor_scalable(
+// CHECK-SAME:      %[[INPUT:.*]]: tensor<1x8x?xi8>,
+// CHECK-SAME:      %[[FILTER:.*]]: tensor<1x?xi8>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: tensor<1x8x?xi8>) -> tensor<1x8x?xi8> {
+
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0 : i8
+
+/// Create a mask for the input tensor
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+// CHECK:           %[[CH_DIM_IN:.*]] = tensor.dim %[[INPUT]], %[[C2]] : tensor<1x8x?xi8>
+// CHECK:           %[[C8:.*]] = arith.constant 8 : index
+// CHECK:           %[[MASK_IN:.*]] = vector.create_mask %[[C1]], %[[C8]], %[[CH_DIM_IN]] : vector<1x8x[4]xi1>
+/// Read the input tensor
+// CHECK:           %[[VEC_IN:.*]] = vector.mask %[[MASK_IN]] { vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : tensor<1x8x?xi8>, vector<1x8x[4]xi8> } : vector<1x8x[4]xi1> -> vector<1x8x[4]xi8>
+
+/// Create a mask for the filter tensor
+// CHECK:           %[[CH_DIM_FLT:.*]] = tensor.dim %[[FILTER]], %[[C1]] : tensor<1x?xi8>
+// CHECK:           %[[MASK_FLT:.*]] = vector.create_mask %[[C1]], %[[CH_DIM_FLT]] : vector<1x[4]xi1>
+/// Read the filter tensor
+// CHECK:           %[[VEC_FLT:.*]] = vector.mask %[[MASK_FLT]] { vector.transfer_read %[[FILTER]]{{\[}}%[[C0]], %[[C0]]], %[[PAD]] : tensor<1x?xi8>, vector<1x[4]xi8> } : vector<1x[4]xi1> -> vector<1x[4]xi8>
+
+/// Create a mask for the output tensor
+// CHECK:           %[[CH_DIM_OUT:.*]] = tensor.dim %[[OUTPUT]], %[[C2]] : tensor<1x8x?xi8>
+// CHECK:           %[[MASK_OUT:.*]] = vector.create_mask %[[C1]], %[[C8]], %[[CH_DIM_OUT]] : vector<1x8x[4]xi1>
+/// Read the output tensor
+// CHECK:           %[[VEC_OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_read %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : tensor<1x8x?xi8>, vector<1x8x[4]xi8> } : vector<1x8x[4]xi1> -> vector<1x8x[4]xi8>
+
+/// Convolution
+// CHECK:           %[[IN_1:.*]] = vector.extract_strided_slice %[[VEC_IN]] {offsets = [0, 0, 0], sizes = [1, 8, 4], strides = [1, 1, 1]} : vector<1x8x[4]xi8> to vector<1x8x[4]xi8>
+// CHECK:           %[[FLT_1:.*]] = vector.extract %[[VEC_FLT]][0] : vector<[4]xi8> from vector<1x[4]xi8>
+// CHECK:           %[[OUT_1:.*]] = vector.extract_strided_slice %[[VEC_OUT]] {offsets = [0, 0, 0], sizes = [1, 8, 4], strides = [1, 1, 1]} : vector<1x8x[4]xi8> to vector<1x8x[4]xi8>
+// CHECK:           %[[FLT_1_B:.*]] = vector.broadcast %[[FLT_1]] : vector<[4]xi8> to vector<1x8x[4]xi8>
+// CHECK:           %[[MULI:.*]] = arith.muli %[[IN_1]], %[[FLT_1_B]] : vector<1x8x[4]xi8>
+// CHECK:           %[[ADDI:.*]] = arith.addi %[[MULI]], %[[OUT_1]] : vector<1x8x[4]xi8>
+// CHECK:           %[[OUT_INS:.*]] = vector.insert_strided_slice %[[ADDI]], %[[VEC_OUT]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x8x[4]xi8> into vector<1x8x[4]xi8>
+// CHECK:           %[[OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_write %[[OUT_INS]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] : vector<1x8x[4]xi8>, tensor<1x8x?xi8> } : vector<1x8x[4]xi1> -> tensor<1x8x?xi8>
+// CHECK:           return %[[OUT]] : tensor<1x8x?xi8>
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_3x5x4xf32_memref_dilation_2(
+      %input: memref<3x5x?xf32>,
+      %filter: memref<2x?xf32>,
+      %output: memref<3x2x?xf32>) {
+  linalg.depthwise_conv_1d_nwc_wc
+    {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %filter : memref<3x5x?xf32>, memref<2x?xf32>)
+    outs(%output : memref<3x2x?xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [3, 2, [4], 2] : !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK-LABEL:   func.func @depthwise_conv1d_nwc_wc_3x5x4xf32_memref_dilation_2(
+// CHECK-SAME:      %[[INPUT:.*]]: memref<3x5x?xf32>,
+// CHECK-SAME:      %[[FILTER:.*]]: memref<2x?xf32>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: memref<3x2x?xf32>) {
+
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+
+/// Create a mask for the input tensor
+// CHECK:           %[[CH_DIM_IN:.*]] = memref.dim %[[INPUT]], %[[C2]] : memref<3x5x?xf32>
+// CHECK:           %[[C3:.*]] = arith.constant 3 : index
+// CHECK:           %[[C5:.*]] = arith.constant 5 : index
+// CHECK:           %[[MASK_IN:.*]] = vector.create_mask %[[C3]], %[[C5]], %[[CH_DIM_IN]] : vector<3x4x[4]xi1>
+/// Read the input tensor
+// CHECK:           %[[VEC_IN:.*]] = vector.mask %[[MASK_IN]] { vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : memref<3x5x?xf32>, vector<3x4x[4]xf32> } : vector<3x4x[4]xi1> -> vector<3x4x[4]xf32>
+
+/// Create a mask for the filter tensor
+// CHECK:           %[[CH_DIM_FLT:.*]] = memref.dim %[[FILTER]], %[[C1]] : memref<2x?xf32>
+// CHECK:           %[[MASK_FLT:.*]] = vector.create_mask %[[C2]], %[[CH_DIM_FLT]] : vector<2x[4]xi1>
+/// Read the filter tensor
+// CHECK:           %[[VEC_FLT:.*]] = vector.mask %[[MASK_FLT]] { vector.transfer_read %[[FILTER]]{{\[}}%[[C0]], %[[C0]]], %[[PAD]] : memref<2x?xf32>, vector<2x[4]xf32> } : vector<2x[4]xi1> -> vector<2x[4]xf32>
+
+/// Create a mask for the output tensor
+// CHECK:           %[[CH_DIM_OUT:.*]] = memref.dim %[[OUTPUT]], %[[C2]] : memref<3x2x?xf32>
+// CHECK:           %[[MASK_OUT:.*]] = vector.create_mask %[[C3]], %[[C2]], %[[CH_DIM_OUT]] : vector<3x2x[4]xi1>
+/// Read the output tensor
+// CHECK:           %[[VEC_OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_read %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : memref<3x2x?xf32>, vector<3x2x[4]xf32> } : vector<3x2x[4]xi1> -> vector<3x2x[4]xf32>
+
+/// Convolution
+// CHECK:           %[[IN_1:.*]] = vector.extract_strided_slice %[[VEC_IN]] {offsets = [0, 0, 0], sizes = [3, 2, 4], strides = [1, 1, 1]} : vector<3x4x[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[IN_2:.*]] = vector.extract_strided_slice %[[VEC_IN]] {offsets = [0, 2, 0], sizes = [3, 2, 4], strides = [1, 1, 1]} : vector<3x4x[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[FLT_1:.*]] = vector.extract %[[VEC_FLT]][0] : vector<[4]xf32> from vector<2x[4]xf32>
+// CHECK:           %[[FLT_2:.*]] = vector.extract %[[VEC_FLT]][1] : vector<[4]xf32> from vector<2x[4]xf32>
+// CHECK:           %[[OUT_1:.*]] = vector.extract_strided_slice %[[VEC_OUT]] {offsets = [0, 0, 0], sizes = [3, 2, 4], strides = [1, 1, 1]} : vector<3x2x[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[FLT_1_B:.*]] = vector.broadcast %[[FLT_1]] : vector<[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[FMA_1:.*]] = vector.fma %[[IN_1]], %[[FLT_1_B]], %[[OUT_1]] : vector<3x2x[4]xf32>
+// CHECK:           %[[FLT_2_B:.*]] = vector.broadcast %[[FLT_2]] : vector<[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[FMA_2:.*]] = vector.fma %[[IN_2]], %[[FLT_2_B]], %[[FMA_1]] : vector<3x2x[4]xf32>
+// CHECK:           %[[OUT_INS:.*]] = vector.insert_strided_slice %[[FMA_2]], %[[VEC_OUT]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<3x2x[4]xf32> into vector<3x2x[4]xf32>
+// CHECK:           vector.mask %[[MASK_OUT]] { vector.transfer_write %[[OUT_INS]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] : vector<3x2x[4]xf32>, memref<3x2x?xf32> } : vector<3x2x[4]xi1>
diff --git a/mlir/test/Dialect/MLProgram/inlining.mlir b/mlir/test/Dialect/MLProgram/inlining.mlir
new file mode 100644
index 0000000000000..ac3677e1b9288
--- /dev/null
+++ b/mlir/test/Dialect/MLProgram/inlining.mlir
@@ -0,0 +1,19 @@
+// RUN: mlir-opt %s -inline | FileCheck %s
+
+// Verifies that regions with operations from the ml_program dialect can
+// be inlined.
+
+ml_program.global private @global(dense<4> : tensor<4xi32>) : tensor<4xi32>
+
+// CHECK: @inline_into
+func.func @inline_into() -> tensor<4xi32> {
+  // CHECK-NOT: @inline_from
+  // CHECK: ml_program.global_load_const
+  %0 = call @inline_from() : () -> tensor<4xi32>
+  return %0 : tensor<4xi32>
+}
+
+func.func @inline_from() -> tensor<4xi32> {
+  %0 = ml_program.global_load_const @global : tensor<4xi32>
+  return %0 : tensor<4xi32>
+}
diff --git a/mlir/test/Dialect/Math/expand-math.mlir b/mlir/test/Dialect/Math/expand-math.mlir
index 6ee65b085dad1..6326d3a71874b 100644
--- a/mlir/test/Dialect/Math/expand-math.mlir
+++ b/mlir/test/Dialect/Math/expand-math.mlir
@@ -7,19 +7,18 @@ func.func @tanh(%arg: f32) -> f32 {
 }
 // CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG: %[[ONE:.+]] = arith.constant 1.000000e+00 : f32
-// CHECK-DAG: %[[TWO:.+]] = arith.constant 2.000000e+00 : f32
-// CHECK: %[[DOUBLEDX:.+]] = arith.mulf %arg0, %[[TWO]] : f32
-// CHECK: %[[NEGDOUBLEDX:.+]] = arith.negf %[[DOUBLEDX]] : f32
+// CHECK-DAG: %[[TWO:.+]] = arith.constant -2.000000e+00 : f32
+// CHECK: %[[VAL0:.+]] = arith.cmpf olt, %arg0, %[[ZERO]] : f32
+// CHECK: %[[VAL1:.+]] = arith.uitofp %[[VAL0]] : i1 to f32
+// CHECK: %[[VAL2:.+]] = arith.mulf %[[VAL1]], %[[TWO]] : f32
+// CHECK: %[[SIGN:.+]] = arith.addf %[[VAL2]], %[[ONE]] : f32
+// CHECK: %[[POSX:.+]] = arith.mulf %[[SIGN]], %arg0 : f32
+// CHECK: %[[NEGDOUBLEDX:.+]] = arith.mulf %[[POSX]], %[[TWO]] : f32
 // CHECK: %[[EXP1:.+]] = math.exp %[[NEGDOUBLEDX]] : f32
 // CHECK: %[[DIVIDEND1:.+]] = arith.subf %[[ONE]], %[[EXP1]] : f32
 // CHECK: %[[DIVISOR1:.+]] = arith.addf %[[EXP1]], %[[ONE]] : f32
-// CHECK: %[[RES1:.+]] = arith.divf %[[DIVIDEND1]], %[[DIVISOR1]] : f32
-// CHECK: %[[EXP2:.+]] = math.exp %[[DOUBLEDX]] : f32
-// CHECK: %[[DIVIDEND2:.+]] = arith.subf %[[EXP2]], %[[ONE]] : f32
-// CHECK: %[[DIVISOR2:.+]] = arith.addf %[[EXP2]], %[[ONE]] : f32
-// CHECK: %[[RES2:.+]] = arith.divf %[[DIVIDEND2]], %[[DIVISOR2]] : f32
-// CHECK: %[[COND:.+]] = arith.cmpf oge, %arg0, %[[ZERO]] : f32
-// CHECK: %[[RESULT:.+]] = arith.select %[[COND]], %[[RES1]], %[[RES2]] : f32
+// CHECK: %[[POSRES:.+]] = arith.divf %[[DIVIDEND1]], %[[DIVISOR1]] : f32
+// CHECK: %[[RESULT:.+]] = arith.mulf %[[SIGN]], %[[POSRES]] : f32
 // CHECK: return %[[RESULT]]
 
 // -----
diff --git a/mlir/test/Dialect/Math/polynomial-approximation.mlir b/mlir/test/Dialect/Math/polynomial-approximation.mlir
index 834a7dc0af66d..93ecd67f14dd3 100644
--- a/mlir/test/Dialect/Math/polynomial-approximation.mlir
+++ b/mlir/test/Dialect/Math/polynomial-approximation.mlir
@@ -94,6 +94,20 @@ func.func @erf_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @erf_scalable_vector(
+// CHECK-SAME:                     %[[arg0:.*]]: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[zero:.*]] = arith.constant dense<0.000000e+00> : vector<[8]xf32>
+// CHECK-NOT:       erf
+// CHECK-NOT:       vector<8xf32>
+// CHECK-COUNT-20:  select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[res:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[res]] : vector<[8]xf32>
+// CHECK:         }
+func.func @erf_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.erf %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // CHECK-LABEL:   func @exp_scalar(
 // CHECK-SAME:                     %[[VAL_0:.*]]: f32) -> f32 {
 // CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 5.000000e-01 : f32
@@ -151,6 +165,17 @@ func.func @exp_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @exp_scalable_vector
+// CHECK-NOT:      math.exp
+// CHECK-NOT:      vector<8xf32>
+// CHECK-COUNT-46: vector<[8]x{{(i32)|(f32)}}>
+// CHECK-NOT:      vector<8xf32>
+// CHECK-NOT:      math.exp
+func.func @exp_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.exp %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // CHECK-LABEL:   func @expm1_scalar(
 // CHECK-SAME:                       %[[X:.*]]: f32) -> f32 {
 // CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f32
@@ -277,6 +302,22 @@ func.func @expm1_vector(%arg0: vector<8x8xf32>) -> vector<8x8xf32> {
   return %0 : vector<8x8xf32>
 }
 
+// CHECK-LABEL:   func @expm1_scalable_vector(
+// CHECK-SAME:                       %{{.*}}: vector<8x[8]xf32>) -> vector<8x[8]xf32> {
+// CHECK-NOT:       vector<8x8xf32>
+// CHECK-NOT:       exp
+// CHECK-NOT:       log
+// CHECK-NOT:       expm1
+// CHECK-COUNT-127: vector<8x[8]x{{(i32)|(f32)|(i1)}}>
+// CHECK-NOT:       vector<8x8xf32>
+// CHECK-NOT:       exp
+// CHECK-NOT:       log
+// CHECK-NOT:       expm1
+func.func @expm1_scalable_vector(%arg0: vector<8x[8]xf32>) -> vector<8x[8]xf32> {
+  %0 = math.expm1 %arg0 : vector<8x[8]xf32>
+  return %0 : vector<8x[8]xf32>
+}
+
 // CHECK-LABEL:   func @log_scalar(
 // CHECK-SAME:                             %[[X:.*]]: f32) -> f32 {
 // CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
@@ -357,6 +398,18 @@ func.func @log_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @log_scalable_vector(
+// CHECK-SAME:                     %{{.*}}: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[CST_LN2:.*]] = arith.constant dense<0.693147182> : vector<[8]xf32>
+// CHECK-COUNT-5:   select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[VAL_71:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[VAL_71]] : vector<[8]xf32>
+// CHECK:         }
+func.func @log_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.log %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // CHECK-LABEL:   func @log2_scalar(
 // CHECK-SAME:                      %[[VAL_0:.*]]: f32) -> f32 {
 // CHECK:           %[[CST_LOG2E:.*]] = arith.constant 1.44269502 : f32
@@ -381,6 +434,18 @@ func.func @log2_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @log2_scalable_vector(
+// CHECK-SAME:                      %{{.*}}: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[CST_LOG2E:.*]] = arith.constant dense<1.44269502> : vector<[8]xf32>
+// CHECK-COUNT-5:   select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[VAL_71:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[VAL_71]] : vector<[8]xf32>
+// CHECK:         }
+func.func @log2_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.log2 %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // CHECK-LABEL:   func @log1p_scalar(
 // CHECK-SAME:                       %[[X:.*]]: f32) -> f32 {
 // CHECK:           %[[CST_ONE:.*]] = arith.constant 1.000000e+00 : f32
@@ -414,6 +479,17 @@ func.func @log1p_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @log1p_scalable_vector(
+// CHECK-SAME:                       %[[VAL_0:.*]]: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[CST_ONE:.*]] = arith.constant dense<1.000000e+00> : vector<[8]xf32>
+// CHECK-COUNT-6:   select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[VAL_79:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[VAL_79]] : vector<[8]xf32>
+// CHECK:         }
+func.func @log1p_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.log1p %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
 
 // CHECK-LABEL:   func @tanh_scalar(
 // CHECK-SAME:                      %[[VAL_0:.*]]: f32) -> f32 {
@@ -470,6 +546,19 @@ func.func @tanh_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @tanh_scalable_vector(
+// CHECK-SAME:                      %[[VAL_0:.*]]: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[VAL_1:.*]] = arith.constant dense<-7.99881172> : vector<[8]xf32>
+// CHECK-NOT:       tanh
+// CHECK-COUNT-2:   select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[VAL_33:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[VAL_33]] : vector<[8]xf32>
+// CHECK:         }
+func.func @tanh_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.tanh %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // We only approximate rsqrt for vectors and when the AVX2 option is enabled.
 // CHECK-LABEL:   func @rsqrt_scalar
 // AVX2-LABEL:    func @rsqrt_scalar
diff --git a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
index 1cb69891a70ed..de21d114e9fc4 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
@@ -1346,6 +1346,52 @@ func.func @convert_logical_or_true_false_vector(%arg: vector<3xi1>) -> (vector<3
 
 // -----
 
+//===----------------------------------------------------------------------===//
+// spirv.Select
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: @convert_select_scalar
+// CHECK-SAME: %[[ARG1:.+]]: i32, %[[ARG2:.+]]: i32
+func.func @convert_select_scalar(%arg1: i32, %arg2: i32) -> (i32, i32) {
+  %true = spirv.Constant true
+  %false = spirv.Constant false
+  %0 = spirv.Select %true, %arg1, %arg2 : i1, i32
+  %1 = spirv.Select %false, %arg1, %arg2 : i1, i32
+
+  // CHECK: return %[[ARG1]], %[[ARG2]]
+  return %0, %1 : i32, i32
+}
+
+// CHECK-LABEL: @convert_select_vector
+// CHECK-SAME: %[[ARG1:.+]]: vector<3xi32>, %[[ARG2:.+]]: vector<3xi32>
+func.func @convert_select_vector(%arg1: vector<3xi32>, %arg2: vector<3xi32>) -> (vector<3xi32>, vector<3xi32>) {
+  %true = spirv.Constant dense<true> : vector<3xi1>
+  %false = spirv.Constant dense<false> : vector<3xi1>
+  %0 = spirv.Select %true, %arg1, %arg2 : vector<3xi1>, vector<3xi32>
+  %1 = spirv.Select %false, %arg1, %arg2 : vector<3xi1>, vector<3xi32>
+
+  // CHECK: return %[[ARG1]], %[[ARG2]]
+  return %0, %1: vector<3xi32>, vector<3xi32>
+}
+
+// CHECK-LABEL: @convert_select_vector_extra
+// CHECK-SAME: %[[CONDITIONS:.+]]: vector<2xi1>, %[[ARG1:.+]]: vector<2xi32>
+func.func @convert_select_vector_extra(%conditions: vector<2xi1>, %arg1: vector<2xi32>) -> (vector<2xi32>, vector<2xi32>) {
+  %true_false = spirv.Constant dense<[true, false]> : vector<2xi1>
+  %cvec_1 = spirv.Constant dense<[42, -132]> : vector<2xi32>
+  %cvec_2 = spirv.Constant dense<[0, 42]> : vector<2xi32>
+
+  // CHECK: %[[RES:.+]] = spirv.Constant dense<42>
+  %0 = spirv.Select %true_false, %cvec_1, %cvec_2: vector<2xi1>, vector<2xi32>
+
+  %1 = spirv.Select %conditions, %arg1, %arg1 : vector<2xi1>, vector<2xi32>
+
+  // CHECK: return %[[RES]], %[[ARG1]]
+  return %0, %1: vector<2xi32>, vector<2xi32>
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.IEqual
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir
index 48f28ef390ed5..18851f29d8eaa 100644
--- a/mlir/test/Dialect/SparseTensor/invalid.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid.mlir
@@ -868,16 +868,6 @@ func.func @sparse_sort_coo_no_perm(%arg0: index, %arg1: memref<?xindex>) -> (mem
 
 // -----
 
-#CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}>
-
-func.func @sparse_alloc_escapes(%arg0: index) -> tensor<10x?xf64, #CSR> {
-  // expected-error@+1 {{sparse tensor allocation should not escape function}}
-  %0 = bufferization.alloc_tensor(%arg0) : tensor<10x?xf64, #CSR>
-  return %0: tensor<10x?xf64, #CSR>
-}
-
-// -----
-
 #UnorderedCOO = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : compressed(nonunique, nonordered), d1 : singleton(nonordered))}>
 #OrderedCOOPerm = #sparse_tensor.encoding<{map = (d0, d1) -> (d1 : compressed(nonunique), d0 : singleton)}>
 
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
index e9e458e805ba4..a47a3d5119f96 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
@@ -728,3 +728,13 @@ func.func @sparse_print(%arg0: tensor<10x10xf64, #CSR>) {
   sparse_tensor.print %arg0 : tensor<10x10xf64, #CSR>
   return
 }
+
+// -----
+
+// CHECK-LABEL:   func.func @sparse_has_runtime() -> i1
+// CHECK:           %[[H:.*]] = sparse_tensor.has_runtime_library
+// CHECK:           return %[[H]] : i1
+func.func @sparse_has_runtime() -> i1 {
+  %has_runtime = sparse_tensor.has_runtime_library
+  return %has_runtime : i1
+}
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index 4c73a6271786e..627ac54cf145b 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -2483,6 +2483,18 @@ func.func @all_true_vector_mask(%a : vector<3x4xf32>) -> vector<3x4xf32> {
 
 // -----
 
+// CHECK-LABEL: func @all_true_vector_mask_no_result(
+func.func @all_true_vector_mask_no_result(%a : vector<3x4xf32>, %m : memref<3x4xf32>) {
+//   CHECK-NOT:   vector.mask
+//       CHECK:   vector.transfer_write
+  %c0 = arith.constant 0 : index
+  %all_true = vector.constant_mask [3, 4] : vector<3x4xi1>
+  vector.mask %all_true { vector.transfer_write %a, %m[%c0, %c0] : vector<3x4xf32>, memref<3x4xf32> } : vector<3x4xi1>
+  return
+}
+
+// -----
+
 // CHECK-LABEL:   func.func @fold_shape_cast_with_mask(
 // CHECK-SAME:     %[[VAL_0:.*]]: tensor<1x?xf32>) -> vector<1x4xi1> {
 func.func @fold_shape_cast_with_mask(%arg0: tensor<1x?xf32>) -> vector<1x4xi1> {
diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
index a42ed6968d370..441f7500538f7 100644
--- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
+++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
@@ -9,9 +9,6 @@
 // RUN:  -shared-libs=%mlir_c_runner_utils |\
 // RUN: FileCheck %s
 
-// XFAIL: target=aarch64{{.*}}
-// See: https://github.com/llvm/llvm-project/issues/58531
-
 func.func @test_unary(%input: tensor<?xcomplex<f32>>,
                       %func: (complex<f32>) -> complex<f32>) {
   %c0 = arith.constant 0 : index
@@ -189,8 +186,9 @@ func.func @entry() {
     // CHECK-NEXT:  0
     // CHECK-NEXT:  0
     (0.0, 0.0), (-1.0, 0.0),
-    // CHECK-NEXT:  -nan
-    // CHECK-NEXT:  -nan
+    // Ignoring the sign of nan as that can't be tested in platform agnostic manner. See: #58531
+    // CHECK-NEXT:  nan
+    // CHECK-NEXT:  nan
     (1.0, 1.0), (1.0, 1.0)
     // CHECK-NEXT:  0.273
     // CHECK-NEXT:  0.583
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
index 8ee4e1fb48fef..92c7039c84960 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
@@ -1,6 +1,6 @@
 // DEFINE: %{compile} =  mlir-opt %s \
 // DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
-// DEFINE:    -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm -o %t
+// DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm -o %t
 // DEFINE: %{entry_point} = mmt4d
 // DEFINE: %{run} = mlir-cpu-runner %t -e %{entry_point} -entry-point-result=void \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-matmul-masked-vec.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-matmul-masked-vec.mlir
index 0378d638df61a..a70d794506c48 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-matmul-masked-vec.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-matmul-masked-vec.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule -one-shot-bufferize -func-bufferize -lower-vector-mask --test-lower-to-llvm | \
+// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -lower-vector-mask --test-lower-to-llvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_c_runner_utils,%mlir_runner_utils | \
 // RUN: FileCheck %s
 
diff --git a/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
index 3700291540547..2239ba50b6626 100644
--- a/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
@@ -34,7 +34,8 @@ func.func @main() {
   %5 = arith.constant 5 : index
 
   %alloca_1 = memref.alloca() : memref<1xf32>
-  %alloc_4 = memref.alloc(%4) : memref<?xf32>
+  %alloca_4 = memref.alloca() : memref<4xf32>
+  %alloca_4_dyn = memref.cast %alloca_4 : memref<4xf32> to memref<?xf32>
 
   // Offset is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
@@ -55,20 +56,20 @@ func.func @main() {
   // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
   // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %5, %1) : (memref<?xf32>, index, index, index) -> ()
+  func.call @reinterpret_cast_fully_dynamic(%alloca_4_dyn, %0, %5, %1) : (memref<?xf32>, index, index, index) -> ()
 
   // Stride is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
   // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
   // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %4) : (memref<?xf32>, index, index, index) -> ()
+  func.call @reinterpret_cast_fully_dynamic(%alloca_4_dyn, %0, %4, %4) : (memref<?xf32>, index, index, index) -> ()
 
   //  CHECK-NOT: ERROR: Runtime op verification failed
   func.call @reinterpret_cast(%alloca_1, %0) : (memref<1xf32>, index) -> ()
 
   //  CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %1) : (memref<?xf32>, index, index, index) -> ()
+  func.call @reinterpret_cast_fully_dynamic(%alloca_4_dyn, %0, %4, %1) : (memref<?xf32>, index, index, index) -> ()
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
index 48987ce216f1a..3ccf8b1be6d7b 100644
--- a/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
@@ -38,14 +38,15 @@ func.func @main() {
   %5 = arith.constant 5 : index
 
   %alloca = memref.alloca() : memref<1xf32>
-  %alloc = memref.alloc(%4) : memref<?x4xf32>
+  %alloca_4 = memref.alloca() : memref<4x4xf32>
+  %alloca_4_dyn = memref.cast %alloca_4 : memref<4x4xf32> to memref<?x4xf32>
 
   // Offset is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
   // CHECK-NEXT: "memref.subview"
   // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @subview_dynamic_rank_reduce(%alloc, %5, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic_rank_reduce(%alloca_4_dyn, %5, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
 
   // Offset is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
@@ -66,23 +67,23 @@ func.func @main() {
   // CHECK-NEXT: "memref.subview"
   // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @subview_dynamic(%alloc, %0, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic(%alloca_4_dyn, %0, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
 
   // Stride is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
   // CHECK-NEXT: "memref.subview"
   // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @subview_dynamic(%alloc, %0, %4, %4) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic(%alloca_4_dyn, %0, %4, %4) : (memref<?x4xf32>, index, index, index) -> ()
 
   // CHECK-NOT: ERROR: Runtime op verification failed
   func.call @subview(%alloca, %0) : (memref<1xf32>, index) -> ()
 
   // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @subview_dynamic(%alloc, %0, %4, %1) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic(%alloca_4_dyn, %0, %4, %1) : (memref<?x4xf32>, index, index, index) -> ()
 
   // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @subview_dynamic_rank_reduce(%alloc, %0, %1, %0) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic_rank_reduce(%alloca_4_dyn, %0, %1, %0) : (memref<?x4xf32>, index, index, index) -> ()
 
 
   return
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
index 350b5b41dafc0..c645ca6567209 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
@@ -222,6 +222,7 @@ module {
     bufferization.dealloc_tensor %sparse_filter_CD : tensor<3x3xi32, #CDR>
     bufferization.dealloc_tensor %sparse_filter_CSC : tensor<3x3xi32, #CSC>
 
+    bufferization.dealloc_tensor %0 : tensor<6x6xi32>
     bufferization.dealloc_tensor %2 : tensor<6x6xi32, #DCSR>
     bufferization.dealloc_tensor %3 : tensor<6x6xi32, #CSR>
     bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CDR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
index ebf9f4392d859..f7975e0738fa8 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
@@ -35,8 +35,8 @@
 #COO_3D = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : compressed(nonunique), d1 : singleton(nonunique), d2 : singleton), posWidth = 32, crdWidth = 32 }>
 
 module {
-  func.func private @printMemref3dF32(%ptr : tensor<?x?x?xf32>) attributes { llvm.emit_c_interface }
-  func.func private @printMemref2dF32(%ptr : tensor<?x?xf32>) attributes { llvm.emit_c_interface }
+  func.func private @printMemref3dF32(%ptr : tensor<?x?x?xf32> {bufferization.access = "read"}) attributes { llvm.emit_c_interface }
+  func.func private @printMemref2dF32(%ptr : tensor<?x?xf32> {bufferization.access = "read"}) attributes { llvm.emit_c_interface }
 
   func.func @test_sparse_rhs(%arg0: tensor<5x6xf32>, %arg1: tensor<6x2x3xf32, #COO_3D>) -> tensor<?x?x?xf32> {
     %collapsed = tensor.collapse_shape %arg1 [[0], [1, 2]] : tensor<6x2x3xf32, #COO_3D> into tensor<6x6xf32, #COO_2D>
@@ -46,6 +46,11 @@ module {
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
+
+    // Note: tensor.collapse_shape is a metadata-only operation on dense tensors
+    // but requires reallocation on sparse tensors.
+    bufferization.dealloc_tensor %collapsed : tensor<6x6xf32, #COO_2D>
+
     return %ret1 : tensor<?x?x?xf32>
   }
 
@@ -57,6 +62,11 @@ module {
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32, #COO_2D>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
+
+    // Note: tensor.collapse_shape is a metadata-only operation on dense tensors
+    // but requires reallocation on sparse tensors.
+    bufferization.dealloc_tensor %collapsed : tensor<6x6xf32, #COO_2D>
+
     return %ret1 : tensor<?x?x?xf32>
   }
 
@@ -80,6 +90,11 @@ module {
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32, #COO_2D>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
+
+    // Note: tensor.collapse_shape is a metadata-only operation on dense tensors
+    // but requires reallocation on sparse tensors.
+    bufferization.dealloc_tensor %collapsed : tensor<6x6xf32, #COO_2D>
+
     return %ret1 : tensor<?x?x?xf32>
   }
 
@@ -192,6 +207,7 @@ module {
     bufferization.dealloc_tensor %so1 : tensor<?x?x?xf32>
     bufferization.dealloc_tensor %so2 : tensor<?x?x?xf32>
     bufferization.dealloc_tensor %so3 : tensor<?x?x?xf32>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir
index 464de9c8a2c3a..efef01155cc78 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir
@@ -161,6 +161,14 @@ module {
     call @dump_dense_f64(%s24)  : (tensor<4x4xf64>) -> ()
     call @dump_dense_f64(%scsr) : (tensor<4x4xf64>) -> ()
 
+    bufferization.dealloc_tensor %a : tensor<4x8xf64, #BSR>
+    bufferization.dealloc_tensor %b : tensor<4x8xf64, #NV_24>
+    bufferization.dealloc_tensor %c : tensor<4x8xf64, #CSR>
+    bufferization.dealloc_tensor %d : tensor<4x4xf64>
+    bufferization.dealloc_tensor %s : tensor<4x4xf64>
+    bufferization.dealloc_tensor %s24 : tensor<4x4xf64>
+    bufferization.dealloc_tensor %scsr : tensor<4x4xf64>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
index f8fb8fdf53e35..55d4caeb7eb30 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
@@ -273,11 +273,14 @@ module {
     bufferization.dealloc_tensor %sparse_input_CSC : tensor<8x8xi32, #CSC>
     bufferization.dealloc_tensor %sparse_input_CD : tensor<8x8xi32, #CDR>
 
+    bufferization.dealloc_tensor %0 : tensor<6x6xi32>
     bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR>
     bufferization.dealloc_tensor %2 : tensor<6x6xi32, #DCSR>
     bufferization.dealloc_tensor %3 : tensor<6x6xi32, #CSR>
     bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CDR>
     bufferization.dealloc_tensor %5 : tensor<6x6xi32, #CSC>
+    bufferization.dealloc_tensor %6 : tensor<6x6xi32>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir
index 9b05f9bf3a29c..e145c4542a7bf 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir
@@ -233,6 +233,19 @@ module {
     // bufferization.dealloc_tensor %s2pp4 : tensor<2x?x?xf64, #Tensor4>
     // bufferization.dealloc_tensor %s2pp5 : tensor<2x?x?xf64, #Tensor5>
     // bufferization.dealloc_tensor %s2pp6 : tensor<2x?x?xf64, #Tensor6>
+
+    bufferization.dealloc_tensor %d2341 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2342 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2343 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2344 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2345 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2346 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %dp344 : tensor<?x3x4xf64>
+    bufferization.dealloc_tensor %d2p45 : tensor<2x?x4xf64>
+    bufferization.dealloc_tensor %d23p6 : tensor<2x3x?xf64>
+    bufferization.dealloc_tensor %dp3p4 : tensor<?x3x?xf64>
+    bufferization.dealloc_tensor %dpp45 : tensor<?x?x4xf64>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir
index 0f9dfb9da7204..12f8e34500d81 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir
@@ -114,12 +114,14 @@ module {
     call @dump(%d31) : (tensor<2x3x4xf64>) -> ()
 
     //
-    // Release sparse tensors.
+    // Release the resources.
     //
     bufferization.dealloc_tensor %t13 : tensor<2x3x4xf64, #Tensor3>
     bufferization.dealloc_tensor %t31 : tensor<2x3x4xf64, #Tensor1>
     bufferization.dealloc_tensor %s1 : tensor<2x3x4xf64, #Tensor1>
     bufferization.dealloc_tensor %s3 : tensor<2x3x4xf64, #Tensor3>
+    bufferization.dealloc_tensor %d13 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d31 : tensor<2x3x4xf64>
 
     return
   }
@@ -167,12 +169,14 @@ module {
     call @dump(%d31) : (tensor<2x3x4xf64>) -> ()
 
     //
-    // Release sparse tensors.
+    // Release the resources.
     //
     bufferization.dealloc_tensor %t13 : tensor<2x3x4xf64, #SingletonTensor3>
     bufferization.dealloc_tensor %t31 : tensor<2x3x4xf64, #SingletonTensor1>
     bufferization.dealloc_tensor %s1 : tensor<2x3x4xf64, #SingletonTensor1>
     bufferization.dealloc_tensor %s3 : tensor<2x3x4xf64, #SingletonTensor3>
+    bufferization.dealloc_tensor %d13 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d31 : tensor<2x3x4xf64>
 
     return
   }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir
new file mode 100755
index 0000000000000..bcd71f7bd674b
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir
@@ -0,0 +1,144 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+//  do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
+// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and VLA vectorization.
+// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
+
+
+#map = affine_map<(d0) -> (d0)>
+
+#SV  = #sparse_tensor.encoding<{
+  map = (d0) -> (d0 : compressed)
+}>
+
+module {
+
+  // This directly yields an empty sparse vector.
+  func.func @empty() -> tensor<10xf32, #SV> {
+    %0 = tensor.empty() : tensor<10xf32, #SV>
+    return %0 : tensor<10xf32, #SV>
+  }
+
+  // This also directly yields an empty sparse vector.
+  func.func @empty_alloc() -> tensor<10xf32, #SV> {
+    %0 = bufferization.alloc_tensor() : tensor<10xf32, #SV>
+    return %0 : tensor<10xf32, #SV>
+  }
+
+  // This yields a hidden empty sparse vector (all zeros).
+  func.func @zeros() -> tensor<10xf32, #SV> {
+    %cst = arith.constant 0.0 : f32
+    %0 = bufferization.alloc_tensor() : tensor<10xf32, #SV>
+    %1 = linalg.generic {
+        indexing_maps = [#map],
+	iterator_types = ["parallel"]}
+      outs(%0 : tensor<10xf32, #SV>) {
+         ^bb0(%out: f32):
+            linalg.yield %cst : f32
+    } -> tensor<10xf32, #SV>
+    return %1 : tensor<10xf32, #SV>
+  }
+
+  // This yields a filled sparse vector (all ones).
+  func.func @ones() -> tensor<10xf32, #SV> {
+    %cst = arith.constant 1.0 : f32
+    %0 = bufferization.alloc_tensor() : tensor<10xf32, #SV>
+    %1 = linalg.generic {
+        indexing_maps = [#map],
+	iterator_types = ["parallel"]}
+      outs(%0 : tensor<10xf32, #SV>) {
+         ^bb0(%out: f32):
+            linalg.yield %cst : f32
+    } -> tensor<10xf32, #SV>
+    return %1 : tensor<10xf32, #SV>
+  }
+
+  //
+  // Main driver.
+  //
+  func.func @main() {
+
+    %0 = call @empty()       : () -> tensor<10xf32, #SV>
+    %1 = call @empty_alloc() : () -> tensor<10xf32, #SV>
+    %2 = call @zeros()       : () -> tensor<10xf32, #SV>
+    %3 = call @ones()        : () -> tensor<10xf32, #SV>
+
+    //
+    // Verify the output. In particular, make sure that
+    // all empty sparse vector data structures are properly
+    // finalized with a pair (0,0) for positions.
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 0
+    // CHECK-NEXT: dim = ( 10 )
+    // CHECK-NEXT: lvl = ( 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 0,
+    // CHECK-NEXT: crd[0] : (
+    // CHECK-NEXT: values : (
+    // CHECK-NEXT: ----
+    //
+    // CHECK-NEXT: ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 0
+    // CHECK-NEXT: dim = ( 10 )
+    // CHECK-NEXT: lvl = ( 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 0,
+    // CHECK-NEXT: crd[0] : (
+    // CHECK-NEXT: values : (
+    // CHECK-NEXT: ----
+    //
+    // CHECK-NEXT: ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 0
+    // CHECK-NEXT: dim = ( 10 )
+    // CHECK-NEXT: lvl = ( 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 0,
+    // CHECK-NEXT: crd[0] : (
+    // CHECK-NEXT: values : (
+    // CHECK-NEXT: ----
+    //
+    // CHECK-NEXT: ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 10
+    // CHECK-NEXT: dim = ( 10 )
+    // CHECK-NEXT: lvl = ( 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 10,
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %0 : tensor<10xf32, #SV>
+    sparse_tensor.print %1 : tensor<10xf32, #SV>
+    sparse_tensor.print %2 : tensor<10xf32, #SV>
+    sparse_tensor.print %3 : tensor<10xf32, #SV>
+
+    bufferization.dealloc_tensor %0 : tensor<10xf32, #SV>
+    bufferization.dealloc_tensor %1 : tensor<10xf32, #SV>
+    bufferization.dealloc_tensor %2 : tensor<10xf32, #SV>
+    bufferization.dealloc_tensor %3 : tensor<10xf32, #SV>
+    return
+  }
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
index 7cde6b93d3250..34d450c2403f6 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
@@ -279,6 +279,31 @@ module {
     %si = tensor.extract %li[] : tensor<i64>
     vector.print %si : i64
 
+    // TODO: This check is no longer needed once the codegen path uses the
+    // buffer deallocation pass. "dealloc_tensor" turn into a no-op in the
+    // codegen path.
+    %has_runtime = sparse_tensor.has_runtime_library
+    scf.if %has_runtime {
+      // sparse_tensor.assemble copies buffers when running with the runtime
+      // library. Deallocations are needed not needed when running in codgen
+      // mode.
+      bufferization.dealloc_tensor %s4 : tensor<10x10xf64, #SortedCOO>
+      bufferization.dealloc_tensor %s5 : tensor<10x10xf64, #SortedCOOI32>
+      bufferization.dealloc_tensor %csr : tensor<2x2xf64, #CSR>
+      bufferization.dealloc_tensor %bs : tensor<2x10x10xf64, #BCOO>
+    }
+
+    bufferization.dealloc_tensor %li : tensor<i64>
+    bufferization.dealloc_tensor %od : tensor<3xf64>
+    bufferization.dealloc_tensor %op : tensor<2xi32>
+    bufferization.dealloc_tensor %oi : tensor<3x2xi32>
+    bufferization.dealloc_tensor %d_csr : tensor<4xf64>
+    bufferization.dealloc_tensor %p_csr : tensor<3xi32>
+    bufferization.dealloc_tensor %i_csr : tensor<3xi32>
+    bufferization.dealloc_tensor %bod : tensor<6xf64>
+    bufferization.dealloc_tensor %bop : tensor<4xindex>
+    bufferization.dealloc_tensor %boi : tensor<6x2xindex>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
index aa1bd04fde87d..fe8836266a479 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
@@ -140,10 +140,19 @@ module {
     sparse_tensor.print %s1 : tensor<4x3x2xf32, #BatchedCSR>
     sparse_tensor.print %s2 : tensor<4x3x2xf32, #CSRDense>
 
-    // FIXME: doing this explicitly crashes runtime
-    // bufferization.dealloc_tensor %s0 : tensor<4x3x2xf32, #CCC>
-    // bufferization.dealloc_tensor %s1 : tensor<4x3x2xf32, #BatchedCSR>
-    // bufferization.dealloc_tensor %s2 : tensor<4x3x2xf32, #CSRDense>
+    // TODO: This check is no longer needed once the codegen path uses the
+    // buffer deallocation pass. "dealloc_tensor" turn into a no-op in the
+    // codegen path.
+    %has_runtime = sparse_tensor.has_runtime_library
+    scf.if %has_runtime {
+      // sparse_tensor.assemble copies buffers when running with the runtime
+      // library. Deallocations are needed not needed when running in codgen
+      // mode.
+      bufferization.dealloc_tensor %s0 : tensor<4x3x2xf32, #CCC>
+      bufferization.dealloc_tensor %s1 : tensor<4x3x2xf32, #BatchedCSR>
+      bufferization.dealloc_tensor %s2 : tensor<4x3x2xf32, #CSRDense>
+    }
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir b/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir
index bf178c826574e..258b1b4f2fab4 100644
--- a/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir
+++ b/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith))" | \
-// RUN: mlir-opt -one-shot-bufferize -func-bufferize -test-lower-to-llvm | \
+// RUN: mlir-opt -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -test-lower-to-llvm | \
 // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_runner_utils \
 // RUN: | FileCheck %s
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir
new file mode 100644
index 0000000000000..4d41539448202
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir
@@ -0,0 +1,148 @@
+// RUN: mlir-opt %s \
+// RUN:  -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
+// RUN:  | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN:  | FileCheck %s
+
+// CHECK: Correct Results :8192
+// CHECK: Incorrect Results :0
+
+module {
+  func.func @main() {
+    %c10000000 = arith.constant 10000000 : index
+    %false = arith.constant false
+    %c32768 = arith.constant 32768 : index
+    %c31_i32 = arith.constant 31 : i32
+    %c-1_i32 = arith.constant -1 : i32
+    %c5_i32 = arith.constant 5 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c2 = arith.constant 2 : index
+    %c32768_i32 = arith.constant 32768 : i32
+    %c128 = arith.constant 128 : index
+    %c1 = arith.constant 1 : index
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.mlir.constant(128 : i64) : i64
+    %2 = llvm.mlir.constant(0 : i64) : i64
+    %f0 = arith.constant 0.0 : f16
+    %f123 = arith.constant 1.123 : f16
+    
+    %srcMemref_host = memref.alloc() : memref<128x128xf16>
+    %dstMemref_host = memref.alloc() : memref<128x128xf16>
+    scf.for %arg0 = %c0 to %c128 step %c1 {
+      scf.for %arg1 = %c0 to %c64 step %c1 {
+        %d1 = arith.index_cast %arg0 : index to i32
+        %d2 = arith.index_cast %arg1 : index to i32
+        %d3 = arith.sitofp %d1 : i32 to f16
+        %d4 = arith.sitofp %d2 : i32 to f16
+        %d5 = arith.addf %d3, %f123 : f16
+        %d6 = arith.constant 3.12 : f16
+        %d7 = arith.mulf %d5, %d6 : f16
+        %d8 = arith.addf %d7, %d5 : f16
+        %d9 = arith.constant 0.178 : f16
+        %d10 = arith.divf %d9, %d8 : f16
+        memref.store %d10, %srcMemref_host[%arg0, %arg1] : memref<128x128xf16>
+        memref.store %f0, %dstMemref_host[%arg0, %arg1] : memref<128x128xf16>
+      }
+    }
+
+    %s1 = gpu.wait async
+    %srcMemref, %s2 = gpu.alloc async [%s1] () : memref<128x128xf16>
+    %dstMemref, %s3 = gpu.alloc async [%s2] () : memref<128x128xf16>
+    %s4 = gpu.memcpy async [%s3] %srcMemref, %srcMemref_host : memref<128x128xf16>, memref<128x128xf16>
+    %s5 = gpu.memcpy async [%s4] %dstMemref, %dstMemref_host : memref<128x128xf16>, memref<128x128xf16>
+
+    %expand_shape = memref.expand_shape %srcMemref [[0, 1], [2, 3]] : memref<128x128xf16> into memref<2x64x2x64xf16>
+    %transpose = memref.transpose %expand_shape (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<2x64x2x64xf16> to memref<2x2x64x64xf16, strided<[8192, 64, 128, 1]>>
+    %cast = memref.cast %transpose : memref<2x2x64x64xf16, strided<[8192, 64, 128, 1]>> to memref<*xf16>
+    %24 = nvgpu.tma.create.descriptor %cast box[%c2, %c2, %c64, %c64] : memref<*xf16> -> <tensor = memref<2x2x64x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+    
+    gpu.launch 
+      blocks(%arg2, %arg3, %arg4) in (%arg8 = %c1, %arg9 = %c1, %arg10 = %c1) 
+      threads(%arg5, %arg6, %arg7) in (%arg11 = %c128, %arg12 = %c1, %arg13 = %c1) 
+      dynamic_shared_memory_size %c32768_i32 
+    {
+      %26 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+      %view = memref.view %26[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<2x2x64x64xf16, #gpu.address_space<workgroup>>
+      %27 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+      %thread_id_x = gpu.thread_id  x
+      %28 = arith.index_cast %thread_id_x : index to i32
+      %29 = arith.shrui %28, %c5_i32 : i32
+      %30 = nvvm.shfl.sync  idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32
+      %31 = arith.cmpi eq, %30, %c0_i32 : i32
+      %32 = nvvm.elect.sync -> i1
+      %33 = arith.andi %31, %32 : i1
+      scf.if %33 {
+        nvgpu.mbarrier.init %27[%c0], %c1 : <memorySpace = #gpu.address_space<workgroup>>
+      }
+      %34 = nvvm.shfl.sync  idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32
+      %35 = arith.cmpi eq, %34, %c0_i32 : i32
+      %36 = nvvm.elect.sync -> i1
+      %37 = arith.andi %35, %36 : i1
+      scf.if %37 {
+        nvgpu.mbarrier.arrive.expect_tx %27[%c0], %c32768 : <memorySpace = #gpu.address_space<workgroup>>
+        nvgpu.tma.async.load %24[%c0, %c0, %c0, %c0], %27[%c0] to %view : <tensor = memref<2x2x64x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<2x2x64x64xf16, #gpu.address_space<workgroup>>
+      }
+      nvgpu.mbarrier.try_wait.parity %27[%c0], %false, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
+      scf.for %arg14 = %c0 to %c2 step %c1 {
+        scf.for %arg15 = %c0 to %c2 step %c1 {
+          %38 = arith.muli %arg14, %c64 : index
+          %39 = arith.muli %arg15, %c64 : index
+          %subview = memref.subview %view[%arg14, %arg15, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>
+          %subview_0 = memref.subview %dstMemref[%38, %39] [64, 64] [1, 1] : memref<128x128xf16> to memref<64x64xf16, strided<[128, 1], offset: ?>>
+          %block_dim_x = gpu.block_dim  x
+          %thread_id_y = gpu.thread_id  y
+          %40 = arith.muli %thread_id_y, %block_dim_x : index
+          %41 = arith.addi %thread_id_x, %40 : index
+          %block_dim_y = gpu.block_dim  y
+          %42 = arith.muli %block_dim_x, %block_dim_y : index
+          %thread_id_z = gpu.thread_id  z
+          %43 = arith.muli %thread_id_z, %42 : index
+          %44 = arith.addi %41, %43 : index
+          %45 = arith.cmpi eq, %44, %c0 : index
+          scf.if %45 {
+            scf.for %arg16 = %c0 to %c64 step %c1 {
+              scf.for %arg17 = %c0 to %c64 step %c1 {
+                %46 = memref.load %subview[%arg16, %arg17] : memref<64x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>
+                memref.store %46, %subview_0[%arg16, %arg17] : memref<64x64xf16, strided<[128, 1], offset: ?>>
+              }
+            }
+          }
+          gpu.barrier
+        }
+      }
+      gpu.terminator
+    }
+
+    %s6 = gpu.memcpy async [%s5] %dstMemref_host, %dstMemref  : memref<128x128xf16>, memref<128x128xf16>
+    gpu.wait [%s6]
+
+   %errorCount, %correctCount =  scf.for %arg0 = %c0 to %c128 step %c1 iter_args(%ec1 = %c0, %cc1 = %c0) -> (index,index) {
+      %ec2, %cc2 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%ec2 = %ec1, %cc2 = %cc1) -> (index, index) { 
+        %v1 = memref.load %dstMemref_host[%arg0, %arg1] : memref<128x128xf16>
+        %v2 = memref.load %srcMemref_host[%arg0, %arg1] : memref<128x128xf16>
+        %p = arith.cmpf one, %v1, %v2 : f16        
+        %ec3, %cc3 = scf.if %p -> (index, index) {
+          %ec3 = arith.addi %ec2, %c1 : index
+          scf.yield %ec3, %cc2 : index, index
+        } else {
+          %cc3 = arith.addi %cc2, %c1 : index
+          scf.yield %ec2, %cc3 : index, index
+        }
+      scf.yield %ec3, %cc3 : index,index
+      }
+      scf.yield %ec2, %cc2 : index,index
+    }
+    
+    vector.print str "Correct Results :"
+    vector.print %correctCount : index
+    vector.print str "Incorrect Results :"
+    vector.print %errorCount : index
+    return
+  }
+}
diff --git a/mlir/test/Target/Cpp/common-cpp.mlir b/mlir/test/Target/Cpp/common-cpp.mlir
index a87b33a10844d..0e24bdd19993f 100644
--- a/mlir/test/Target/Cpp/common-cpp.mlir
+++ b/mlir/test/Target/Cpp/common-cpp.mlir
@@ -36,7 +36,7 @@ func.func @test_multiple_return() -> (i32, i32) {
 
 // CHECK: test_float
 func.func @test_float() {
-  // CHECK: foo::constant({(float)0.0e+00, (float)1.000000000e+00})
+  // CHECK: foo::constant({0.0e+00f, 1.000000000e+00f})
   %0 = emitc.call_opaque "foo::constant"() {args = [dense<[0.000000e+00, 1.000000e+00]> : tensor<2xf32>]} : () -> f32
   return
 }
diff --git a/mlir/test/Target/Cpp/const.mlir b/mlir/test/Target/Cpp/const.mlir
index 524d564b3b943..3658455d66943 100644
--- a/mlir/test/Target/Cpp/const.mlir
+++ b/mlir/test/Target/Cpp/const.mlir
@@ -10,6 +10,7 @@ func.func @emitc_constant() {
   %c5 = "emitc.constant"(){value = #emitc.opaque<"CHAR_MIN">} : () -> !emitc.opaque<"char">
   %c6 = "emitc.constant"(){value = 2 : index} : () -> index
   %c7 = "emitc.constant"(){value = 2.0 : f32} : () -> f32
+  %f64 = "emitc.constant"(){value = 4.0 : f64} : () -> f64
   %c8 = "emitc.constant"(){value = dense<0> : tensor<i32>} : () -> tensor<i32>
   %c9 = "emitc.constant"(){value = dense<[0, 1]> : tensor<2xindex>} : () -> tensor<2xindex>
   %c10 = "emitc.constant"(){value = dense<[[0.0, 1.0], [2.0, 3.0]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32>
@@ -23,10 +24,11 @@ func.func @emitc_constant() {
 // CPP-DEFAULT-NEXT: uint8_t [[V4:[^ ]*]] = 255;
 // CPP-DEFAULT-NEXT: char [[V5:[^ ]*]] = CHAR_MIN;
 // CPP-DEFAULT-NEXT: size_t [[V6:[^ ]*]] = 2;
-// CPP-DEFAULT-NEXT: float [[V7:[^ ]*]] = (float)2.000000000e+00;
+// CPP-DEFAULT-NEXT: float [[V7:[^ ]*]] = 2.000000000e+00f;
+// CPP-DEFAULT-NEXT: double [[F64:[^ ]*]] = 4.00000000000000000e+00;
 // CPP-DEFAULT-NEXT: Tensor<int32_t> [[V8:[^ ]*]] = {0};
 // CPP-DEFAULT-NEXT: Tensor<size_t, 2> [[V9:[^ ]*]] = {0, 1};
-// CPP-DEFAULT-NEXT: Tensor<float, 2, 2> [[V10:[^ ]*]] = {(float)0.0e+00, (float)1.000000000e+00, (float)2.000000000e+00, (float)3.000000000e+00};
+// CPP-DEFAULT-NEXT: Tensor<float, 2, 2> [[V10:[^ ]*]] = {0.0e+00f, 1.000000000e+00f, 2.000000000e+00f, 3.000000000e+00f};
 
 // CPP-DECLTOP: void emitc_constant() {
 // CPP-DECLTOP-NEXT: int32_t [[V0:[^ ]*]];
@@ -37,6 +39,7 @@ func.func @emitc_constant() {
 // CPP-DECLTOP-NEXT: char [[V5:[^ ]*]];
 // CPP-DECLTOP-NEXT: size_t [[V6:[^ ]*]];
 // CPP-DECLTOP-NEXT: float [[V7:[^ ]*]];
+// CPP-DECLTOP-NEXT: double [[F64:[^ ]*]];
 // CPP-DECLTOP-NEXT: Tensor<int32_t> [[V8:[^ ]*]];
 // CPP-DECLTOP-NEXT: Tensor<size_t, 2> [[V9:[^ ]*]];
 // CPP-DECLTOP-NEXT: Tensor<float, 2, 2> [[V10:[^ ]*]];
@@ -47,7 +50,8 @@ func.func @emitc_constant() {
 // CPP-DECLTOP-NEXT: [[V4]] = 255;
 // CPP-DECLTOP-NEXT: [[V5]] = CHAR_MIN;
 // CPP-DECLTOP-NEXT: [[V6]] = 2;
-// CPP-DECLTOP-NEXT: [[V7]] = (float)2.000000000e+00;
+// CPP-DECLTOP-NEXT: [[V7]] = 2.000000000e+00f;
+// CPP-DECLTOP-NEXT: [[F64]] = 4.00000000000000000e+00;
 // CPP-DECLTOP-NEXT: [[V8]] = {0};
 // CPP-DECLTOP-NEXT: [[V9]] = {0, 1};
-// CPP-DECLTOP-NEXT: [[V10]] = {(float)0.0e+00, (float)1.000000000e+00, (float)2.000000000e+00, (float)3.000000000e+00};
+// CPP-DECLTOP-NEXT: [[V10]] = {0.0e+00f, 1.000000000e+00f, 2.000000000e+00f, 3.000000000e+00f};
diff --git a/mlir/test/Target/Cpp/for.mlir b/mlir/test/Target/Cpp/for.mlir
index 5225f3ddaff25..60988bcb46556 100644
--- a/mlir/test/Target/Cpp/for.mlir
+++ b/mlir/test/Target/Cpp/for.mlir
@@ -63,7 +63,7 @@ func.func @test_for_yield() {
 // CPP-DEFAULT-NEXT: size_t [[STOP:[^ ]*]] = 10;
 // CPP-DEFAULT-NEXT: size_t [[STEP:[^ ]*]] = 1;
 // CPP-DEFAULT-NEXT: int32_t [[S0:[^ ]*]] = 0;
-// CPP-DEFAULT-NEXT: float [[P0:[^ ]*]] = (float)1.000000000e+00;
+// CPP-DEFAULT-NEXT: float [[P0:[^ ]*]] = 1.000000000e+00f;
 // CPP-DEFAULT-NEXT: int32_t [[SE:[^ ]*]];
 // CPP-DEFAULT-NEXT: float [[PE:[^ ]*]];
 // CPP-DEFAULT-NEXT: int32_t [[SI:[^ ]*]];
@@ -96,7 +96,7 @@ func.func @test_for_yield() {
 // CPP-DECLTOP-NEXT: [[STOP]] = 10;
 // CPP-DECLTOP-NEXT: [[STEP]] = 1;
 // CPP-DECLTOP-NEXT: [[S0]] = 0;
-// CPP-DECLTOP-NEXT: [[P0]] = (float)1.000000000e+00;
+// CPP-DECLTOP-NEXT: [[P0]] = 1.000000000e+00f;
 // CPP-DECLTOP-NEXT: ;
 // CPP-DECLTOP-NEXT: ;
 // CPP-DECLTOP-NEXT: ;
diff --git a/mlir/test/Target/Cpp/invalid.mlir b/mlir/test/Target/Cpp/invalid.mlir
index 552c04a9b07f7..513371a09cde1 100644
--- a/mlir/test/Target/Cpp/invalid.mlir
+++ b/mlir/test/Target/Cpp/invalid.mlir
@@ -10,10 +10,10 @@ func.func @multiple_blocks() {
 
 // -----
 
-func.func @unsupported_std_op(%arg0: f64) -> f64 {
-  // expected-error@+1 {{'math.absf' op unable to find printer for op}}
-  %0 = math.absf %arg0 : f64
-  return %0 : f64
+func.func @unsupported_op(%arg0: i1) {
+  // expected-error@+1 {{'cf.assert' op unable to find printer for op}}
+  cf.assert %arg0, "assertion foo"
+  return
 }
 
 // -----
diff --git a/mlir/test/Target/Cpp/stdops.mlir b/mlir/test/Target/Cpp/stdops.mlir
index cc6bdbe376984..589e5f2e96aff 100644
--- a/mlir/test/Target/Cpp/stdops.mlir
+++ b/mlir/test/Target/Cpp/stdops.mlir
@@ -60,14 +60,14 @@ func.func @two_results() -> (i32, f32) {
 }
 // CPP-DEFAULT: std::tuple<int32_t, float> two_results() {
 // CPP-DEFAULT: int32_t [[V0:[^ ]*]] = 0;
-// CPP-DEFAULT: float [[V1:[^ ]*]] = (float)1.000000000e+00;
+// CPP-DEFAULT: float [[V1:[^ ]*]] = 1.000000000e+00f;
 // CPP-DEFAULT: return std::make_tuple([[V0]], [[V1]]);
 
 // CPP-DECLTOP: std::tuple<int32_t, float> two_results() {
 // CPP-DECLTOP: int32_t [[V0:[^ ]*]];
 // CPP-DECLTOP: float [[V1:[^ ]*]];
 // CPP-DECLTOP: [[V0]] = 0;
-// CPP-DECLTOP: [[V1]] = (float)1.000000000e+00;
+// CPP-DECLTOP: [[V1]] = 1.000000000e+00f;
 // CPP-DECLTOP: return std::make_tuple([[V0]], [[V1]]);
 
 
diff --git a/mlir/test/Target/Cpp/subscript.mlir b/mlir/test/Target/Cpp/subscript.mlir
new file mode 100644
index 0000000000000..a6c82df9111a7
--- /dev/null
+++ b/mlir/test/Target/Cpp/subscript.mlir
@@ -0,0 +1,32 @@
+// RUN: mlir-translate -mlir-to-cpp %s | FileCheck %s
+// RUN: mlir-translate -mlir-to-cpp -declare-variables-at-top %s | FileCheck %s
+
+func.func @load_store(%arg0: !emitc.array<4x8xf32>, %arg1: !emitc.array<3x5xf32>, %arg2: index, %arg3: index) {
+  %0 = emitc.subscript %arg0[%arg2, %arg3] : <4x8xf32>, index, index
+  %1 = emitc.subscript %arg1[%arg2, %arg3] : <3x5xf32>, index, index
+  emitc.assign %0 : f32 to %1 : f32
+  return
+}
+// CHECK: void load_store(float [[ARR1:[^ ]*]][4][8], float [[ARR2:[^ ]*]][3][5],
+// CHECK-SAME:            size_t [[I:[^ ]*]], size_t [[J:[^ ]*]])
+// CHECK-NEXT: [[ARR2]][[[I]]][[[J]]] = [[ARR1]][[[I]]][[[J]]];
+
+emitc.func @func1(%arg0 : f32) {
+  emitc.return
+}
+
+emitc.func @call_arg(%arg0: !emitc.array<4x8xf32>, %i: i32, %j: i16,
+                     %k: i8) {
+  %0 = emitc.subscript %arg0[%i, %j] : <4x8xf32>, i32, i16
+  %1 = emitc.subscript %arg0[%j, %k] : <4x8xf32>, i16, i8
+
+  emitc.call @func1 (%0) : (f32) -> ()
+  emitc.call_opaque "func2" (%1) : (f32) -> ()
+  emitc.call_opaque "func3" (%0, %1) { args = [1 : index, 0 : index] } : (f32, f32) -> ()
+  emitc.return
+}
+// CHECK: void call_arg(float [[ARR1:[^ ]*]][4][8], int32_t [[I:[^ ]*]],
+// CHECK-SAME:          int16_t [[J:[^ ]*]], int8_t [[K:[^ ]*]])
+// CHECK-NEXT: func1([[ARR1]][[[I]]][[[J]]]);
+// CHECK-NEXT: func2([[ARR1]][[[J]]][[[K]]]);
+// CHECK-NEXT: func3([[ARR1]][[[J]]][[[K]]], [[ARR1]][[[I]]][[[J]]]);
diff --git a/mlir/test/Target/LLVMIR/Import/debug-info.ll b/mlir/test/Target/LLVMIR/Import/debug-info.ll
index 9ef6580bcf240..9af40d8c8d3ee 100644
--- a/mlir/test/Target/LLVMIR/Import/debug-info.ll
+++ b/mlir/test/Target/LLVMIR/Import/debug-info.ll
@@ -296,12 +296,17 @@ define void @class_method() {
   ret void, !dbg !9
 }
 
-; Verify the elements parameter is dropped due to the cyclic dependencies.
-; CHECK: #[[COMP:.+]] = #llvm.di_composite_type<tag = DW_TAG_class_type, name = "class_name", file = #{{.*}}, line = 42, flags = "TypePassByReference|NonTrivial">
-; CHECK: #[[COMP_PTR:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP]], sizeInBits = 64>
+; Verify the cyclic composite type is identified, even though conversion begins from the subprogram type.
+; CHECK: #[[COMP_SELF:.+]] = #llvm.di_composite_type<tag = DW_TAG_null, recId = [[REC_ID:.+]]>
+; CHECK: #[[COMP_PTR:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP_SELF]], sizeInBits = 64>
 ; CHECK: #[[SP_TYPE:.+]] = #llvm.di_subroutine_type<types = #{{.*}}, #[[COMP_PTR]]>
-; CHECK: #[[SP:.+]] = #llvm.di_subprogram<id = distinct[{{.*}}]<>, compileUnit = #{{.*}}, scope = #[[COMP]], name = "class_method", file = #{{.*}}, subprogramFlags = Definition, type = #[[SP_TYPE]]>
-; CHECK: #[[LOC]] = loc(fused<#[[SP]]>
+; CHECK: #[[SP_INNER:.+]] = #llvm.di_subprogram<id = [[SP_ID:.+]], compileUnit = #{{.*}}, scope = #[[COMP_SELF]], name = "class_method", file = #{{.*}}, subprogramFlags = Definition, type = #[[SP_TYPE]]>
+; CHECK: #[[COMP:.+]] = #llvm.di_composite_type<tag = DW_TAG_class_type, recId = [[REC_ID]], name = "class_name", file = #{{.*}}, line = 42, flags = "TypePassByReference|NonTrivial", elements = #[[SP_INNER]]>
+
+; CHECK: #[[COMP_PTR_OUTER:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP]], sizeInBits = 64>
+; CHECK: #[[SP_TYPE_OUTER:.+]] = #llvm.di_subroutine_type<types = #{{.*}}, #[[COMP_PTR_OUTER]]>
+; CHECK: #[[SP_OUTER:.+]] = #llvm.di_subprogram<id = [[SP_ID]], compileUnit = #{{.*}}, scope = #[[COMP]], name = "class_method", file = #{{.*}}, subprogramFlags = Definition, type = #[[SP_TYPE_OUTER]]>
+; CHECK: #[[LOC]] = loc(fused<#[[SP_OUTER]]>
 
 !llvm.dbg.cu = !{!1}
 !llvm.module.flags = !{!0}
@@ -318,15 +323,18 @@ define void @class_method() {
 
 ; // -----
 
-; Verify the elements parameter is dropped due to the cyclic dependencies.
-; CHECK: #[[$COMP:.+]] = #llvm.di_composite_type<tag = DW_TAG_class_type, name = "class_field", file = #{{.*}}, line = 42, flags = "TypePassByReference|NonTrivial">
-; CHECK: #[[$COMP_PTR:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[$COMP]]>
-; CHECK: #[[$VAR0:.+]] = #llvm.di_local_variable<scope = #{{.*}}, name = "class_field", file = #{{.*}}, type = #[[$COMP_PTR]]>
+; Verify the cyclic composite type is handled correctly.
+; CHECK: #[[COMP_SELF:.+]] = #llvm.di_composite_type<tag = DW_TAG_null, recId = [[REC_ID:.+]]>
+; CHECK: #[[COMP_PTR_INNER:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP_SELF]]>
+; CHECK: #[[FIELD:.+]] = #llvm.di_derived_type<tag = DW_TAG_member, name = "call_field", baseType = #[[COMP_PTR_INNER]]>
+; CHECK: #[[COMP:.+]] = #llvm.di_composite_type<tag = DW_TAG_class_type, recId = [[REC_ID]], name = "class_field", file = #{{.*}}, line = 42, flags = "TypePassByReference|NonTrivial", elements = #[[FIELD]]>
+; CHECK: #[[COMP_PTR_OUTER:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP]]>
+; CHECK: #[[VAR0:.+]] = #llvm.di_local_variable<scope = #{{.*}}, name = "class_field", file = #{{.*}}, type = #[[COMP_PTR_OUTER]]>
 
-; CHECK-LABEL: @class_field
+; CHECK: @class_field
 ; CHECK-SAME:  %[[ARG0:[a-zA-Z0-9]+]]
 define void @class_field(ptr %arg1) {
-  ; CHECK: llvm.intr.dbg.value #[[$VAR0]] = %[[ARG0]] : !llvm.ptr
+  ; CHECK: llvm.intr.dbg.value #[[VAR0]] = %[[ARG0]] : !llvm.ptr
   call void @llvm.dbg.value(metadata ptr %arg1, metadata !7, metadata !DIExpression()), !dbg !9
   ret void
 }
@@ -563,35 +571,6 @@ define void @func_in_module(ptr %arg) !dbg !8 {
 
 ; // -----
 
-; Verifies that array types that have an unimportable base type are removed to
-; avoid producing invalid IR.
-; CHECK: #[[DI_LOCAL_VAR:.+]] = #llvm.di_local_variable<
-; CHECK-NOT: type =
-
-; CHECK-LABEL: @array_with_cyclic_base_type
-define i32 @array_with_cyclic_base_type(ptr %0) !dbg !3 {
-  call void @llvm.dbg.value(metadata ptr %0, metadata !4, metadata !DIExpression()), !dbg !7
-  ret i32 0
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare void @llvm.dbg.value(metadata, metadata, metadata)
-
-
-!llvm.module.flags = !{!0}
-!llvm.dbg.cu = !{!1}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2)
-!2 = !DIFile(filename: "debug-info.ll", directory: "/")
-!3 = distinct !DISubprogram(name: "func", scope: !2, file: !2, line: 46, scopeLine: 48, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
-!4 = !DILocalVariable(name: "op", arg: 5, scope: !3, file: !2, line: 47, type: !5)
-!5 = !DICompositeType(tag: DW_TAG_array_type, size: 42, baseType: !6)
-!6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5)
-!7 = !DILocation(line: 0, scope: !3)
-
-; // -----
-
 ; Verifies that import compile units respect the distinctness of the input.
 ; CHECK-LABEL: @distinct_cu_func0
 define void @distinct_cu_func0() !dbg !4 {
diff --git a/mlir/test/Target/LLVMIR/Import/import-failure.ll b/mlir/test/Target/LLVMIR/Import/import-failure.ll
index 9a4e939d10651..3f4efab70e1c0 100644
--- a/mlir/test/Target/LLVMIR/Import/import-failure.ll
+++ b/mlir/test/Target/LLVMIR/Import/import-failure.ll
@@ -85,38 +85,6 @@ define void @unsupported_argument(i64 %arg1) {
 
 ; // -----
 
-; Check that debug intrinsics that depend on cyclic metadata are dropped.
-
-declare void @llvm.dbg.value(metadata, metadata, metadata)
-
-; CHECK:      import-failure.ll
-; CHECK-SAME: warning: dropped instruction: call void @llvm.dbg.label(metadata !{{.*}})
-; CHECK:      import-failure.ll
-; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata i64 %{{.*}}, metadata !3, metadata !DIExpression())
-define void @cylic_metadata(i64 %arg1) {
-  call void @llvm.dbg.value(metadata i64 %arg1, metadata !10, metadata !DIExpression()), !dbg !14
-  call void @llvm.dbg.label(metadata !13), !dbg !14
-  ret void
-}
-
-!llvm.dbg.cu = !{!1}
-!llvm.module.flags = !{!0}
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2)
-!2 = !DIFile(filename: "import-failure.ll", directory: "/")
-!3 = !DICompositeType(tag: DW_TAG_array_type, size: 42, baseType: !4)
-!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !3)
-!5 = distinct !DISubprogram(name: "class_method", scope: !2, file: !2, type: !6, spFlags: DISPFlagDefinition, unit: !1)
-!6 = !DISubroutineType(types: !7)
-!7 = !{!3}
-!10 = !DILocalVariable(scope: !5, name: "arg1", file: !2, line: 1, arg: 1, align: 64);
-!11 = !DILexicalBlock(scope: !5)
-!12 = !DILexicalBlockFile(scope: !11, discriminator: 0)
-!13 = !DILabel(scope: !12, name: "label", file: !2, line: 42)
-!14 = !DILocation(line: 1, column: 2, scope: !5)
-
-; // -----
-
 ; global_dtors with non-null data fields cannot be represented in MLIR.
 ; CHECK:      <unknown>
 ; CHECK-SAME: error: unhandled global variable: @llvm.global_dtors
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
index cfd5239515c9c..c042628334d4c 100644
--- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
@@ -342,3 +342,83 @@ llvm.func @func_line_tables() {
 llvm.func @func_debug_directives() {
   llvm.return
 } loc(fused<#di_subprogram_2>["foo2.mlir":0:0])
+
+// -----
+
+// Ensure recursive types with multiple external references work.
+
+// Common base nodes.
+#di_file = #llvm.di_file<"test.mlir" in "/">
+#di_null_type = #llvm.di_null_type
+#di_compile_unit = #llvm.di_compile_unit<id = distinct[1]<>, sourceLanguage = DW_LANG_C, file = #di_file, isOptimized = false, emissionKind = None>
+
+// Recursive type itself.
+#di_struct_self = #llvm.di_composite_type<tag = DW_TAG_null, recId = distinct[0]<>>
+#di_ptr_inner = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #di_struct_self, sizeInBits = 64>
+#di_subroutine_inner = #llvm.di_subroutine_type<types = #di_null_type, #di_ptr_inner>
+#di_subprogram_inner = #llvm.di_subprogram<
+  id = distinct[2]<>,
+  compileUnit = #di_compile_unit,
+  scope = #di_struct_self,
+  name = "class_method",
+  file = #di_file,
+  subprogramFlags = Definition,
+  type = #di_subroutine_inner>
+#di_struct = #llvm.di_composite_type<
+  tag = DW_TAG_class_type,
+  recId = distinct[0]<>,
+  name = "class_name",
+  file = #di_file,
+  line = 42,
+  flags = "TypePassByReference|NonTrivial",
+  elements = #di_subprogram_inner>
+
+// Outer types referencing the entire recursive type.
+#di_ptr_outer = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #di_struct, sizeInBits = 64>
+#di_subroutine_outer = #llvm.di_subroutine_type<types = #di_null_type, #di_ptr_outer>
+#di_subprogram_outer = #llvm.di_subprogram<
+  id = distinct[2]<>,
+  compileUnit = #di_compile_unit,
+  scope = #di_struct,
+  name = "class_method",
+  file = #di_file,
+  subprogramFlags = Definition,
+  type = #di_subroutine_outer>
+
+#loc3 = loc(fused<#di_subprogram_outer>["test.mlir":1:1])
+
+// CHECK: @class_method
+// CHECK: ret void, !dbg ![[LOC:.*]]
+
+// CHECK: ![[CU:.*]] = distinct !DICompileUnit(
+// CHECK: ![[SP:.*]] = distinct !DISubprogram(name: "class_method", scope: ![[STRUCT:.*]], file: !{{.*}}, type: ![[SUBROUTINE:.*]], spFlags: DISPFlagDefinition, unit: ![[CU]])
+// CHECK: ![[STRUCT]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "class_name", {{.*}}, elements: ![[ELEMS:.*]])
+// CHECK: ![[ELEMS]] = !{![[SP]]}
+// CHECK: ![[SUBROUTINE]] = !DISubroutineType(types: ![[SUBROUTINE_ELEMS:.*]])
+// CHECK: ![[SUBROUTINE_ELEMS]] = !{null, ![[PTR:.*]]}
+// CHECK: ![[PTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[STRUCT]], size: 64)
+// CHECK: ![[LOC]] = !DILocation(line: 1, column: 1, scope: ![[SP]])
+
+llvm.func @class_method() {
+  llvm.return loc(#loc3)
+} loc(#loc3)
+
+// -----
+
+// Ensures composite types with a recursive scope work.
+
+#di_composite_type_self = #llvm.di_composite_type<tag = DW_TAG_null, recId = distinct[0]<>>
+#di_file = #llvm.di_file<"test.mlir" in "/">
+#di_subroutine_type = #llvm.di_subroutine_type<types = #di_composite_type_self>
+#di_subprogram = #llvm.di_subprogram<scope = #di_file, file = #di_file, subprogramFlags = Optimized, type = #di_subroutine_type>
+#di_composite_type = #llvm.di_composite_type<tag = DW_TAG_class_type, recId = distinct[0]<>, scope = #di_subprogram>
+#di_global_variable = #llvm.di_global_variable<file = #di_file, line = 1, type = #di_composite_type>
+#di_global_variable_expression = #llvm.di_global_variable_expression<var = #di_global_variable>
+
+llvm.mlir.global @global_variable() {dbg_expr = #di_global_variable_expression} : !llvm.struct<()>
+
+// CHECK: distinct !DIGlobalVariable({{.*}}type: ![[COMP:[0-9]+]],
+// CHECK: ![[COMP]] = distinct !DICompositeType({{.*}}scope: ![[SCOPE:[0-9]+]],
+// CHECK: ![[SCOPE]] = !DISubprogram({{.*}}type: ![[SUBROUTINE:[0-9]+]],
+// CHECK: ![[SUBROUTINE]] = !DISubroutineType(types: ![[SR_TYPES:[0-9]+]])
+// CHECK: ![[SR_TYPES]] = !{![[COMP]]}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 93550f5c7bd5a..ce6b56d48437a 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -88,13 +88,20 @@ llvm.func @rocdl.bpermute(%src : i32) -> i32 {
   llvm.return %0 : i32
 }
 
-llvm.func @rocdl.ballot(%pred : i1) -> i32 {
-  // CHECK-LABEL: rocdl.ballot
+llvm.func @rocdl.ballot32(%pred : i1) -> i32 {
+  // CHECK-LABEL: rocdl.ballot32
   // CHECK: call i32 @llvm.amdgcn.ballot
   %0 = rocdl.ballot %pred : i32
   llvm.return %0 : i32
 }
 
+llvm.func @rocdl.ballot64(%pred : i1) -> i64 {
+  // CHECK-LABEL: rocdl.ballot64
+  // CHECK: call i64 @llvm.amdgcn.ballot
+  %0 = rocdl.ballot %pred : i64
+  llvm.return %0 : i64
+}
+
 llvm.func @rocdl.waitcnt() {
   // CHECK-LABEL: rocdl.waitcnt
   // CHECK-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0)
diff --git a/mlir/test/Transforms/buffer-results-to-out-params-add-result-attr.mlir b/mlir/test/Transforms/buffer-results-to-out-params-add-result-attr.mlir
new file mode 100644
index 0000000000000..f4a95c73e2953
--- /dev/null
+++ b/mlir/test/Transforms/buffer-results-to-out-params-add-result-attr.mlir
@@ -0,0 +1,17 @@
+// RUN: mlir-opt -p 'builtin.module(buffer-results-to-out-params{add-result-attr})' -split-input-file -verify-diagnostics %s | FileCheck %s
+
+// CHECK-LABEL: @basic({{.*}}: memref<f32> {bufferize.result})
+func.func @basic() -> (memref<f32>) {
+  %0 = "test.source"() : () -> (memref<f32>)
+  return %0 : memref<f32>
+}
+
+// -----
+
+// CHECK-LABEL: multiple_results
+// CHECK-SAME:  memref<1xf32> {bufferize.result}
+// CHECK-SAME:  memref<2xf32> {bufferize.result}
+func.func @multiple_results() -> (memref<1xf32>, memref<2xf32>) {
+  %0, %1 = "test.source"() : () -> (memref<1xf32>, memref<2xf32>)
+  return %0, %1 : memref<1xf32>, memref<2xf32>
+}
diff --git a/mlir/test/lib/Dialect/Affine/TestAccessAnalysis.cpp b/mlir/test/lib/Dialect/Affine/TestAccessAnalysis.cpp
index b38046299d504..751302550092d 100644
--- a/mlir/test/lib/Dialect/Affine/TestAccessAnalysis.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestAccessAnalysis.cpp
@@ -59,18 +59,25 @@ void TestAccessAnalysis::runOnOperation() {
       enclosingOps.clear();
       getAffineForIVs(*memOp, &enclosingOps);
       for (unsigned d = 0, e = enclosingOps.size(); d < e; d++) {
+        AffineForOp loop = enclosingOps[d];
         int memRefDim;
-        bool isContiguous;
+        bool isContiguous, isInvariant;
         if (auto read = dyn_cast<AffineReadOpInterface>(memOp)) {
-          isContiguous = isContiguousAccess(enclosingOps[d].getInductionVar(),
-                                            read, &memRefDim);
+          isContiguous =
+              isContiguousAccess(loop.getInductionVar(), read, &memRefDim);
+          isInvariant = isInvariantAccess(read, loop);
         } else {
-          isContiguous = isContiguousAccess(enclosingOps[d].getInductionVar(),
-                                            cast<AffineWriteOpInterface>(memOp),
-                                            &memRefDim);
+          auto write = cast<AffineWriteOpInterface>(memOp);
+          isContiguous =
+              isContiguousAccess(loop.getInductionVar(), write, &memRefDim);
+          isInvariant = isInvariantAccess(write, loop);
         }
+        // Check for contiguity for the innermost memref dimension to avoid
+        // emitting too many diagnostics.
         if (isContiguous && memRefDim == 0)
           memOp->emitRemark("contiguous along loop ") << d << '\n';
+        if (isInvariant)
+          memOp->emitRemark("invariant along loop ") << d << '\n';
       }
     }
   }
diff --git a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
index 541a201c94c58..e2229a392bbf7 100644
--- a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
+++ b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
@@ -683,6 +683,24 @@ func.func @cosh() {
  return
 }
 
+// -------------------------------------------------------------------------- //
+// Tanh.
+// -------------------------------------------------------------------------- //
+
+func.func @tanh_8xf32(%a : vector<8xf32>) {
+  %r = math.tanh %a : vector<8xf32>
+  vector.print %r : vector<8xf32>
+  return
+}
+
+func.func @tanh() {
+  // CHECK: -1, -0.761594, -0.291313, 0, 0.291313, 0.761594, 1, 1
+  %v3 = arith.constant dense<[0xff800000, -1.0, -0.3, 0.0, 0.3, 1.0, 10.0, 0x7f800000]> : vector<8xf32>
+  call @tanh_8xf32(%v3) : (vector<8xf32>) -> ()
+
+ return
+}
+
 func.func @main() {
   call @exp2f() : () -> ()
   call @roundf() : () -> ()
@@ -690,5 +708,6 @@ func.func @main() {
   call @roundeven() : () -> ()
   call @sinh() : () -> ()
   call @cosh() : () -> ()
+  call @tanh() : () -> ()
   return
 }
diff --git a/mlir/test/mlir-opt/nearmiss.mlir b/mlir/test/mlir-opt/split-markers.mlir
similarity index 52%
rename from mlir/test/mlir-opt/nearmiss.mlir
rename to mlir/test/mlir-opt/split-markers.mlir
index 2f695517eca06..665a37f317705 100644
--- a/mlir/test/mlir-opt/nearmiss.mlir
+++ b/mlir/test/mlir-opt/split-markers.mlir
@@ -1,6 +1,13 @@
-// RUN: mlir-opt --split-input-file --verify-diagnostics %s 2> %t &&  FileCheck --input-file %t %s
+// Check near-miss mechanics:
+// RUN: mlir-opt --split-input-file --verify-diagnostics %s 2> %t \
+// RUN: &&  FileCheck --input-file %t %s
 // RUN: cat %t
 
+// Check that (1) custom input splitter and (2) custom output splitters work.
+// RUN: mlir-opt %s -split-input-file="// CHECK: ""----" \
+// RUN:   -output-split-marker="// ---- next split ----" \
+// RUN: | FileCheck --check-prefix=CHECK-SPLITTERS %s
+
 func.func @main() {return}
 
 // -----
@@ -20,3 +27,9 @@ func.func @bar2() {return }
 
 // No error flagged at the end for a near miss.
 // ----
+
+// CHECK-SPLITTERS: module
+// CHECK-SPLITTERS: ---- next split ----
+// CHECK-SPLITTERS: module
+// CHECK-SPLITTERS: ---- next split ----
+// CHECK-SPLITTERS: module
diff --git a/mlir/test/mlir-pdll/split-markers.pdll b/mlir/test/mlir-pdll/split-markers.pdll
new file mode 100644
index 0000000000000..45e409a838369
--- /dev/null
+++ b/mlir/test/mlir-pdll/split-markers.pdll
@@ -0,0 +1,36 @@
+// Check that (1) the default input split marker used if no custom marker is
+// specified and (2) the output file is merged using the default marker.
+// RUN: mlir-pdll %s -split-input-file \
+// RUN: | FileCheck -check-prefix=CHECK-DEFAULT %s
+
+// Check that the custom (3) input and (output) split markers are used if
+// provided.
+// RUN: mlir-pdll %s \
+// RUN:   -split-input-file="// ""=====" -output-split-marker "// #####" \
+// RUN: | FileCheck -check-prefix=CHECK-CUSTOM %s
+
+// CHECK-DEFAULT:      Module
+// CHECK-DEFAULT-NEXT: PatternDecl
+// CHECK-DEFAULT-NOT:  PatternDecl
+// CHECK-DEFAULT:      //{{ }}-----
+// CHECK-DEFAULT-NEXT: Module
+// CHECK-DEFAULT-NEXT: PatternDecl
+// CHECK-DEFAULT:      PatternDecl
+
+// CHECK-CUSTOM:      Module
+// CHECK-CUSTOM-NEXT: PatternDecl
+// CHECK-CUSTOM:      PatternDecl
+// CHECK-CUSTOM:      // #####
+// CHECK-CUSTOM-NEXT: Module
+// CHECK-CUSTOM-NEXT: PatternDecl
+// CHECK-CUSTOM-NOT:  PatternDecl
+
+Pattern => erase op<test.op>;
+
+// -----
+
+Pattern => erase op<test.op2>;
+
+// =====
+
+Pattern => erase op<test.op3>;
diff --git a/mlir/test/mlir-translate/split-markers.mlir b/mlir/test/mlir-translate/split-markers.mlir
new file mode 100644
index 0000000000000..ed576bcd85236
--- /dev/null
+++ b/mlir/test/mlir-translate/split-markers.mlir
@@ -0,0 +1,35 @@
+// Check that (1) the output split marker is inserted and (2) the input file is
+// split using the default split marker.
+// RUN: mlir-translate %s -split-input-file -mlir-to-llvmir \
+// RUN:   -output-split-marker="; -----" \
+// RUN: | FileCheck -check-prefix=CHECK-OUTPUT %s
+
+// With the second command, check that (3) the input split marker is used and
+// (4) the output split marker is empty if not specified.
+// RUN: mlir-translate %s -split-input-file="// ""-----" -mlir-to-llvmir \
+// RUN:   -output-split-marker="; -----" \
+// RUN: | mlir-translate -split-input-file -import-llvm \
+// RUN:   -split-input-file="; -----" \
+// RUN: | FileCheck -check-prefix=CHECK-ROUNDTRIP %s
+
+// Check that (5) the input is not split if `-split-input-file` is not given.
+// RUN: mlir-translate %s -mlir-to-llvmir \
+// RUN: | FileCheck -check-prefix=CHECK-NOSPLIT %s
+
+// CHECK-OUTPUT:      ModuleID
+// CHECK-OUTPUT:      ; -----
+// CHECK-OUTPUT-NEXT: ModuleID
+
+// CHECK-ROUNDTRIP:       module {{.*}} {
+// CHECK-ROUNDTRIP-NEXT:  }
+// CHECK-ROUNDTRIP-EMPTY:
+// CHECK-ROUNDTRIP:       module
+
+// CHECK-NOSPLIT:     ModuleID
+// CHECK-NOSPLIT-NOT: ModuleID
+
+module {}
+
+// -----
+
+module {}
diff --git a/mlir/tools/mlir-pdll/mlir-pdll.cpp b/mlir/tools/mlir-pdll/mlir-pdll.cpp
index 43e4aa5064240..c6ad6c361e994 100644
--- a/mlir/tools/mlir-pdll/mlir-pdll.cpp
+++ b/mlir/tools/mlir-pdll/mlir-pdll.cpp
@@ -136,11 +136,20 @@ int main(int argc, char **argv) {
       llvm::cl::desc(
           "Print out the parsed ODS information from the input file"),
       llvm::cl::init(false));
-  llvm::cl::opt<bool> splitInputFile(
-      "split-input-file",
-      llvm::cl::desc("Split the input file into pieces and process each "
-                     "chunk independently"),
-      llvm::cl::init(false));
+  llvm::cl::opt<std::string> inputSplitMarker{
+      "split-input-file", llvm::cl::ValueOptional,
+      llvm::cl::callback([&](const std::string &str) {
+        // Implicit value: use default marker if flag was used without value.
+        if (str.empty())
+          inputSplitMarker.setValue(kDefaultSplitMarker);
+      }),
+      llvm::cl::desc("Split the input file into chunks using the given or "
+                     "default marker and process each chunk independently"),
+      llvm::cl::init("")};
+  llvm::cl::opt<std::string> outputSplitMarker(
+      "output-split-marker",
+      llvm::cl::desc("Split marker to use for merging the ouput"),
+      llvm::cl::init(kDefaultSplitMarker));
   llvm::cl::opt<enum OutputType> outputType(
       "x", llvm::cl::init(OutputType::AST),
       llvm::cl::desc("The type of output desired"),
@@ -187,7 +196,7 @@ int main(int argc, char **argv) {
                          dumpODS, includedFiles);
   };
   if (failed(splitAndProcessBuffer(std::move(inputFile), processFn, outputStrOS,
-                                   splitInputFile)))
+                                   inputSplitMarker, outputSplitMarker)))
     return 1;
 
   // Write the output.
diff --git a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp
index 9672a02cc08f6..2a7406f42f34b 100644
--- a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp
@@ -533,7 +533,7 @@ void InterfaceGenerator::emitInterfaceDecl(const Interface &interface) {
      << "struct " << interfaceTraitsName << " {\n";
   emitConceptDecl(interface);
   emitModelDecl(interface);
-  os << "};";
+  os << "};\n";
 
   // Emit the derived trait for the interface.
   os << "template <typename " << valueTemplate << ">\n";
diff --git a/mlir/unittests/IR/AffineExprTest.cpp b/mlir/unittests/IR/AffineExprTest.cpp
new file mode 100644
index 0000000000000..ff154eb29807c
--- /dev/null
+++ b/mlir/unittests/IR/AffineExprTest.cpp
@@ -0,0 +1,32 @@
+//===- AffineExprTest.cpp - unit tests for affine expression API ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+
+// Test creating AffineExprs using the overloaded binary operators.
+TEST(AffineExprTest, constructFromBinaryOperators) {
+  MLIRContext ctx;
+  OpBuilder b(&ctx);
+
+  auto d0 = b.getAffineDimExpr(0);
+  auto d1 = b.getAffineDimExpr(1);
+
+  auto sum = d0 + d1;
+  auto difference = d0 - d1;
+  auto product = d0 * d1;
+  auto remainder = d0 % d1;
+
+  ASSERT_EQ(sum.getKind(), AffineExprKind::Add);
+  ASSERT_EQ(difference.getKind(), AffineExprKind::Add);
+  ASSERT_EQ(product.getKind(), AffineExprKind::Mul);
+  ASSERT_EQ(remainder.getKind(), AffineExprKind::Mod);
+}
diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt
index e7e9c3b565169..71f8f449756ec 100644
--- a/mlir/unittests/IR/CMakeLists.txt
+++ b/mlir/unittests/IR/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_unittest(MLIRIRTests
   AdaptorTest.cpp
+  AffineExprTest.cpp
   AffineMapTest.cpp
   AttributeTest.cpp
   DialectTest.cpp
diff --git a/openmp/docs/SupportAndFAQ.rst b/openmp/docs/SupportAndFAQ.rst
index afc532cd73eef..9e6974dfbb13d 100644
--- a/openmp/docs/SupportAndFAQ.rst
+++ b/openmp/docs/SupportAndFAQ.rst
@@ -279,11 +279,12 @@ Q: How to build an OpenMP offload capable compiler with an outdated host compile
 
 Enabling the OpenMP runtime will perform a two-stage build for you.
 If your host compiler is different from your system-wide compiler, you may need
-to set the CMake variable `GCC_INSTALL_PREFIX` so clang will be able to find the
-correct GCC toolchain in the second stage of the build.
+to set ``CMAKE_{C,CXX}_FLAGS`` like
+``--gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/12`` so that clang will be
+able to find the correct GCC toolchain in the second stage of the build.
 
 For example, if your system-wide GCC installation is too old to build LLVM and
-you would like to use a newer GCC, set the CMake variable `GCC_INSTALL_PREFIX`
+you would like to use a newer GCC, set ``--gcc-install-dir=``
 to inform clang of the GCC installation you would like to use in the second stage.
 
 Q: How can I include OpenMP offloading support in my CMake project?
diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index f16f21cfce0a8..c6e11a429b173 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -62,6 +62,8 @@ set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-oldDriver")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-LTO")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-JIT-LTO")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu-LTO")
 
 # Once the plugins for the different targets are validated, they will be added to
 # the list of supported targets in the current system.
diff --git a/openmp/libomptarget/include/OpenMP/Mapping.h b/openmp/libomptarget/include/OpenMP/Mapping.h
index 4bd676fc658a7..b9f5c16582931 100644
--- a/openmp/libomptarget/include/OpenMP/Mapping.h
+++ b/openmp/libomptarget/include/OpenMP/Mapping.h
@@ -424,7 +424,8 @@ typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
                                    map_var_info_t *, void **, AsyncInfoTy &,
                                    bool);
 
-void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device);
+void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
+                               bool toStdOut = false);
 
 int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
diff --git a/openmp/libomptarget/include/Shared/Debug.h b/openmp/libomptarget/include/Shared/Debug.h
index a39626d15386b..7c3db8dbf119f 100644
--- a/openmp/libomptarget/include/Shared/Debug.h
+++ b/openmp/libomptarget/include/Shared/Debug.h
@@ -136,10 +136,12 @@ inline uint32_t getDebugLevel() {
   } while (0)
 
 /// Print a generic information string used if LIBOMPTARGET_INFO=1
-#define INFO_MESSAGE(_num, ...)                                                \
+#define INFO_MESSAGE(_num, ...) INFO_MESSAGE_TO(stderr, _num, __VA_ARGS__)
+
+#define INFO_MESSAGE_TO(_stdDst, _num, ...)                                    \
   do {                                                                         \
-    fprintf(stderr, GETNAME(TARGET_NAME) " device %d info: ", (int)_num);      \
-    fprintf(stderr, __VA_ARGS__);                                              \
+    fprintf(_stdDst, GETNAME(TARGET_NAME) " device %d info: ", (int)_num);     \
+    fprintf(_stdDst, __VA_ARGS__);                                             \
   } while (0)
 
 // Debugging messages
@@ -187,4 +189,13 @@ inline uint32_t getDebugLevel() {
     }                                                                          \
   } while (false)
 
+#define DUMP_INFO(toStdOut, _flags, _id, ...)                                  \
+  do {                                                                         \
+    if (toStdOut) {                                                            \
+      INFO_MESSAGE_TO(stdout, _id, __VA_ARGS__);                               \
+    } else {                                                                   \
+      INFO(_flags, _id, __VA_ARGS__);                                          \
+    }                                                                          \
+  } while (false)
+
 #endif // OMPTARGET_SHARED_DEBUG_H
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 8e0ccf191839d..323dee41630f2 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -273,6 +273,7 @@ struct __tgt_target_non_contig {
 extern "C" {
 #endif
 
+void ompx_dump_mapping_tables(void);
 int omp_get_num_devices(void);
 int omp_get_device_num(void);
 int omp_get_initial_device(void);
diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
index 3ca02368253ef..b6fc136e8a179 100644
--- a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
@@ -12,12 +12,15 @@
 
 add_subdirectory(common)
 
-# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id);
+# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname,
+#                          string tmachine_llvm, string tmachine_triple, string elf_machine_id);
 # - build a plugin for an ELF based generic 64-bit target based on libffi.
 # - tmachine: name of the machine processor as used in the cmake build system.
 # - tmachine_name: name of the machine to be printed with the debug messages.
 # - tmachine_libname: machine name to be appended to the plugin library name.
-macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+# - tmachine_llvm: LLVM triple for the processor
+# - tmachine_triple: GNU target triple
+macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_llvm tmachine_triple elf_machine_id)
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
   # Define macro to be used as prefix of the runtime messages for this target.
   add_definitions("-DTARGET_NAME=${tmachine_name}")
@@ -30,7 +33,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
   add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
 
   # Define target triple
-  add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine}")
+  add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine_llvm}")
 
   add_llvm_library("omptarget.rtl.${tmachine_libname}"
     SHARED
diff --git a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
index 2c2b753590e20..663ab4d60ff91 100644
--- a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
+  build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
 else()
  libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index 5c767995126b7..829b4b7291193 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -104,7 +104,7 @@ class GenericGlobalHandlerTy {
   virtual ~GenericGlobalHandlerTy() {}
 
   /// Helper function for getting an ELF from a device image.
-  Expected<ELF64LEObjectFile> getELFObjectFile(DeviceImageTy &Image);
+  Expected<std::unique_ptr<ObjectFile>> getELFObjectFile(DeviceImageTy &Image);
 
   /// Returns whether the symbol named \p SymName is present in the given \p
   /// Image.
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index 9645ab2a67262..b7be7b645ba33 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -289,7 +289,7 @@ struct GenericKernelTy {
 
   /// Return a device pointer to a new kernel launch environment.
   Expected<KernelLaunchEnvironmentTy *>
-  getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice,
+  getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
                              AsyncInfoWrapperTy &AsyncInfo) const;
 
   /// Indicate whether an execution mode is valid.
@@ -1393,7 +1393,7 @@ template <typename ResourceRef> class GenericDeviceResourceManagerTy {
 };
 
 /// A static check on whether or not we support RPC in libomptarget.
-const bool libomptargetSupportsRPC();
+bool libomptargetSupportsRPC();
 
 } // namespace plugin
 } // namespace target
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/Utils/ELF.h b/openmp/libomptarget/plugins-nextgen/common/include/Utils/ELF.h
index 140a6b6b84aa1..88c83d39b68ce 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/Utils/ELF.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/Utils/ELF.h
@@ -28,17 +28,15 @@ bool isELF(llvm::StringRef Buffer);
 llvm::Expected<bool> checkMachine(llvm::StringRef Object, uint16_t EMachine);
 
 /// Returns a pointer to the given \p Symbol inside of an ELF object.
-llvm::Expected<const void *> getSymbolAddress(
-    const llvm::object::ELFObjectFile<llvm::object::ELF64LE> &ELFObj,
-    const llvm::object::ELF64LE::Sym &Symbol);
+llvm::Expected<const void *>
+getSymbolAddress(const llvm::object::ELFSymbolRef &Symbol);
 
 /// Returns the symbol associated with the \p Name in the \p ELFObj. It will
 /// first search for the hash sections to identify symbols from the hash table.
 /// If that fails it will fall back to a linear search in the case of an
 /// executable file without a hash table.
-llvm::Expected<const typename llvm::object::ELF64LE::Sym *>
-getSymbol(const llvm::object::ELFObjectFile<llvm::object::ELF64LE> &ELFObj,
-          llvm::StringRef Name);
+llvm::Expected<std::optional<llvm::object::ELFSymbolRef>>
+getSymbol(const llvm::object::ObjectFile &ELFObj, llvm::StringRef Name);
 
 } // namespace elf
 } // namespace utils
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index d398f60c55bd1..ba0aa47f8e51c 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -25,14 +25,12 @@ using namespace omp;
 using namespace target;
 using namespace plugin;
 
-Expected<ELF64LEObjectFile>
+Expected<std::unique_ptr<ObjectFile>>
 GenericGlobalHandlerTy::getELFObjectFile(DeviceImageTy &Image) {
   assert(utils::elf::isELF(Image.getMemoryBuffer().getBuffer()) &&
          "Input is not an ELF file");
 
-  Expected<ELF64LEObjectFile> ElfOrErr =
-      ELF64LEObjectFile::create(Image.getMemoryBuffer());
-  return ElfOrErr;
+  return ELFObjectFileBase::createELFObjectFile(Image.getMemoryBuffer());
 }
 
 Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
@@ -91,13 +89,13 @@ bool GenericGlobalHandlerTy::isSymbolInImage(GenericDeviceTy &Device,
   }
 
   // Search the ELF symbol using the symbol name.
-  auto SymOrErr = utils::elf::getSymbol(*ELFObjOrErr, SymName);
+  auto SymOrErr = utils::elf::getSymbol(**ELFObjOrErr, SymName);
   if (!SymOrErr) {
     consumeError(SymOrErr.takeError());
     return false;
   }
 
-  return *SymOrErr;
+  return SymOrErr->has_value();
 }
 
 Error GenericGlobalHandlerTy::getGlobalMetadataFromImage(
@@ -110,17 +108,17 @@ Error GenericGlobalHandlerTy::getGlobalMetadataFromImage(
     return ELFObj.takeError();
 
   // Search the ELF symbol using the symbol name.
-  auto SymOrErr = utils::elf::getSymbol(*ELFObj, ImageGlobal.getName());
+  auto SymOrErr = utils::elf::getSymbol(**ELFObj, ImageGlobal.getName());
   if (!SymOrErr)
     return Plugin::error("Failed ELF lookup of global '%s': %s",
                          ImageGlobal.getName().data(),
                          toString(SymOrErr.takeError()).data());
 
-  if (!*SymOrErr)
+  if (!SymOrErr->has_value())
     return Plugin::error("Failed to find global symbol '%s' in the ELF image",
                          ImageGlobal.getName().data());
 
-  auto AddrOrErr = utils::elf::getSymbolAddress(*ELFObj, **SymOrErr);
+  auto AddrOrErr = utils::elf::getSymbolAddress(**SymOrErr);
   // Get the section to which the symbol belongs.
   if (!AddrOrErr)
     return Plugin::error("Failed to get ELF symbol from global '%s': %s",
@@ -129,7 +127,7 @@ Error GenericGlobalHandlerTy::getGlobalMetadataFromImage(
 
   // Setup the global symbol's address and size.
   ImageGlobal.setPtr(const_cast<void *>(*AddrOrErr));
-  ImageGlobal.setSize((*SymOrErr)->st_size);
+  ImageGlobal.setSize((*SymOrErr)->getSize());
 
   return Plugin::success();
 }
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 8c8c52de0c210..f39f913d85eec 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -468,11 +468,13 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
 
 Expected<KernelLaunchEnvironmentTy *>
 GenericKernelTy::getKernelLaunchEnvironment(
-    GenericDeviceTy &GenericDevice,
+    GenericDeviceTy &GenericDevice, uint32_t Version,
     AsyncInfoWrapperTy &AsyncInfoWrapper) const {
   // Ctor/Dtor have no arguments, replaying uses the original kernel launch
-  // environment.
-  if (isCtorOrDtor() || RecordReplay.isReplaying())
+  // environment. Older versions of the compiler do not generate a kernel
+  // launch environment.
+  if (isCtorOrDtor() || RecordReplay.isReplaying() ||
+      Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
     return nullptr;
 
   if (!KernelEnvironment.Configuration.ReductionDataSize ||
@@ -544,8 +546,8 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
   llvm::SmallVector<void *, 16> Args;
   llvm::SmallVector<void *, 16> Ptrs;
 
-  auto KernelLaunchEnvOrErr =
-      getKernelLaunchEnvironment(GenericDevice, AsyncInfoWrapper);
+  auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
+      GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
   if (!KernelLaunchEnvOrErr)
     return KernelLaunchEnvOrErr.takeError();
 
@@ -586,6 +588,9 @@ void *GenericKernelTy::prepareArgs(
   uint32_t KLEOffset = !!KernelLaunchEnvironment;
   NumArgs += KLEOffset;
 
+  if (NumArgs == 0)
+    return nullptr;
+
   Args.resize(NumArgs);
   Ptrs.resize(NumArgs);
 
@@ -1560,7 +1565,7 @@ Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
   return isELFCompatible(Image);
 }
 
-const bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
+bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
 #ifdef LIBOMPTARGET_RPC_SUPPORT
   return true;
 #else
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp b/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
index c84c3bad5def0..2ae97f0f25892 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
@@ -36,18 +36,10 @@ bool utils::elf::isELF(StringRef Buffer) {
   }
 }
 
-Expected<bool> utils::elf::checkMachine(StringRef Object, uint16_t EMachine) {
-  assert(isELF(Object) && "Input is not an ELF!");
-
-  Expected<ELF64LEObjectFile> ElfOrErr =
-      ELF64LEObjectFile::create(MemoryBufferRef(Object, /*Identifier=*/""),
-                                /*InitContent=*/false);
-  if (!ElfOrErr)
-    return ElfOrErr.takeError();
-
-  const auto Header = ElfOrErr->getELFFile().getHeader();
-  if (Header.e_ident[EI_CLASS] != ELFCLASS64)
-    return createError("Only 64-bit ELF files are supported");
+template <class ELFT>
+static Expected<bool>
+checkMachineImpl(const object::ELFObjectFile<ELFT> &ELFObj, uint16_t EMachine) {
+  const auto Header = ELFObj.getELFFile().getHeader();
   if (Header.e_type != ET_EXEC && Header.e_type != ET_DYN)
     return createError("Only executable ELF files are supported");
 
@@ -71,6 +63,25 @@ Expected<bool> utils::elf::checkMachine(StringRef Object, uint16_t EMachine) {
   return Header.e_machine == EMachine;
 }
 
+Expected<bool> utils::elf::checkMachine(StringRef Object, uint16_t EMachine) {
+  assert(isELF(Object) && "Input is not an ELF!");
+
+  Expected<std::unique_ptr<ObjectFile>> ElfOrErr =
+      ObjectFile::createELFObjectFile(
+          MemoryBufferRef(Object, /*Identifier=*/""),
+          /*InitContent=*/false);
+  if (!ElfOrErr)
+    return ElfOrErr.takeError();
+
+  if (const ELF64LEObjectFile *ELFObj =
+          dyn_cast<ELF64LEObjectFile>(&**ElfOrErr))
+    return checkMachineImpl(*ELFObj, EMachine);
+  if (const ELF64BEObjectFile *ELFObj =
+          dyn_cast<ELF64BEObjectFile>(&**ElfOrErr))
+    return checkMachineImpl(*ELFObj, EMachine);
+  return createError("Only 64-bit ELF files are supported");
+}
+
 template <class ELFT>
 static Expected<const typename ELFT::Sym *>
 getSymbolFromGnuHashTable(StringRef Name, const typename ELFT::GnuHash &HashTab,
@@ -138,9 +149,10 @@ getSymbolFromSysVHashTable(StringRef Name, const typename ELFT::Hash &HashTab,
 }
 
 template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getHashTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
-                   StringRef Name) {
+static Expected<std::optional<ELFSymbolRef>>
+getHashTableSymbol(const ELFObjectFile<ELFT> &ELFObj,
+                   const typename ELFT::Shdr &Sec, StringRef Name) {
+  const ELFFile<ELFT> &Elf = ELFObj.getELFFile();
   if (Sec.sh_type != ELF::SHT_HASH && Sec.sh_type != ELF::SHT_GNU_HASH)
     return createError(
         "invalid sh_type for hash table, expected SHT_HASH or SHT_GNU_HASH");
@@ -179,7 +191,12 @@ getHashTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
                 sizeof(typename ELFT::Word) * HashTab->nbuckets +
                 sizeof(typename ELFT::Word) * (SymTab.size() - HashTab->symndx))
       return createError("section has invalid sh_size: " + Twine(Sec.sh_size));
-    return getSymbolFromGnuHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
+    auto Sym = getSymbolFromGnuHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
+    if (!Sym)
+      return Sym.takeError();
+    if (!*Sym)
+      return std::nullopt;
+    return ELFObj.toSymbolRef(*SymTabOrErr, *Sym - &SymTab[0]);
   }
 
   // If this is a Sys-V hash table we verify its size and search the symbol
@@ -197,16 +214,22 @@ getHashTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
                           sizeof(typename ELFT::Word) * HashTab->nchain)
       return createError("section has invalid sh_size: " + Twine(Sec.sh_size));
 
-    return getSymbolFromSysVHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
+    auto Sym = getSymbolFromSysVHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
+    if (!Sym)
+      return Sym.takeError();
+    if (!*Sym)
+      return std::nullopt;
+    return ELFObj.toSymbolRef(*SymTabOrErr, *Sym - &SymTab[0]);
   }
 
-  return nullptr;
+  return std::nullopt;
 }
 
 template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getSymTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
-                  StringRef Name) {
+static Expected<std::optional<ELFSymbolRef>>
+getSymTableSymbol(const ELFObjectFile<ELFT> &ELFObj,
+                  const typename ELFT::Shdr &Sec, StringRef Name) {
+  const ELFFile<ELFT> &Elf = ELFObj.getELFFile();
   if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM)
     return createError(
         "invalid sh_type for hash table, expected SHT_SYMTAB or SHT_DYNSYM");
@@ -226,13 +249,14 @@ getSymTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
 
   for (const typename ELFT::Sym &Sym : SymTab)
     if (StrTab.drop_front(Sym.st_name).data() == Name)
-      return &Sym;
+      return ELFObj.toSymbolRef(&Sec, &Sym - &SymTab[0]);
 
-  return nullptr;
+  return std::nullopt;
 }
 
-Expected<const typename ELF64LE::Sym *>
-utils::elf::getSymbol(const ELFObjectFile<ELF64LE> &ELFObj, StringRef Name) {
+template <class ELFT>
+static Expected<std::optional<ELFSymbolRef>>
+getSymbolImpl(const ELFObjectFile<ELFT> &ELFObj, StringRef Name) {
   // First try to look up the symbol via the hash table.
   for (ELFSectionRef Sec : ELFObj.sections()) {
     if (Sec.getType() != SHT_HASH && Sec.getType() != SHT_GNU_HASH)
@@ -241,8 +265,7 @@ utils::elf::getSymbol(const ELFObjectFile<ELF64LE> &ELFObj, StringRef Name) {
     auto HashTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex());
     if (!HashTabOrErr)
       return HashTabOrErr.takeError();
-    return getHashTableSymbol<ELF64LE>(ELFObj.getELFFile(), **HashTabOrErr,
-                                       Name);
+    return getHashTableSymbol<ELFT>(ELFObj, **HashTabOrErr, Name);
   }
 
   // If this is an executable file check the entire standard symbol table.
@@ -253,16 +276,31 @@ utils::elf::getSymbol(const ELFObjectFile<ELF64LE> &ELFObj, StringRef Name) {
     auto SymTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex());
     if (!SymTabOrErr)
       return SymTabOrErr.takeError();
-    return getSymTableSymbol<ELF64LE>(ELFObj.getELFFile(), **SymTabOrErr, Name);
+    return getSymTableSymbol<ELFT>(ELFObj, **SymTabOrErr, Name);
   }
 
-  return nullptr;
+  return std::nullopt;
 }
 
-Expected<const void *> utils::elf::getSymbolAddress(
-    const object::ELFObjectFile<object::ELF64LE> &ELFObj,
-    const object::ELF64LE::Sym &Symbol) {
-  const ELFFile<ELF64LE> &ELFFile = ELFObj.getELFFile();
+Expected<std::optional<ELFSymbolRef>>
+utils::elf::getSymbol(const ObjectFile &Obj, StringRef Name) {
+  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(&Obj))
+    return getSymbolImpl(*ELFObj, Name);
+  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(&Obj))
+    return getSymbolImpl(*ELFObj, Name);
+  return createError("Only 64-bit ELF files are supported");
+}
+
+template <class ELFT>
+static Expected<const void *>
+getSymbolAddressImpl(const ELFObjectFile<ELFT> &ELFObj,
+                     const ELFSymbolRef &SymRef) {
+  const ELFFile<ELFT> &ELFFile = ELFObj.getELFFile();
+
+  auto SymOrErr = ELFObj.getSymbol(SymRef.getRawDataRefImpl());
+  if (!SymOrErr)
+    return SymOrErr.takeError();
+  const auto &Symbol = **SymOrErr;
 
   auto SecOrErr = ELFFile.getSection(Symbol.st_shndx);
   if (!SecOrErr)
@@ -283,3 +321,13 @@ Expected<const void *> utils::elf::getSymbolAddress(
 
   return ELFFile.base() + Offset;
 }
+
+Expected<const void *>
+utils::elf::getSymbolAddress(const ELFSymbolRef &SymRef) {
+  const ObjectFile *Obj = SymRef.getObject();
+  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
+    return getSymbolAddressImpl(*ELFObj, SymRef);
+  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
+    return getSymbolAddressImpl(*ELFObj, SymRef);
+  return createError("Only 64-bit ELF files are supported");
+}
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index f85a00cd1cd53..b862bc7490925 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -1166,7 +1166,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
     // Search for all symbols that contain a constructor or destructor.
     SmallVector<std::pair<StringRef, uint16_t>> Funcs;
-    for (ELFSymbolRef Sym : ELFObjOrErr->symbols()) {
+    for (ELFSymbolRef Sym : (*ELFObjOrErr)->symbols()) {
       auto NameOrErr = Sym.getName();
       if (!NameOrErr)
         return NameOrErr.takeError();
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
index 0cccc9cb82e42..77466c111ee0a 100644
--- a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+  build_generic_elf64("ppc64" "PPC64" "ppc64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
 else()
  libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
index 9461d79d145df..91d21627a3272 100644
--- a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
+  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "ppc64le" "powerpc64le-ibm-linux-gnu" "21")
 else()
  libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
index 1b12a29289998..0388a235d2899 100644
--- a/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("SystemZ" "S390X" "s390x" "s390x-ibm-linux-gnu" "22")
+ build_generic_elf64("s390x" "S390X" "s390x" "systemz" "s390x-ibm-linux-gnu" "22")
 else()
  libomptarget_say("Not building s390x NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
index 129d526a2ae78..27cf3e069a377 100644
--- a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
 else()
  libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt
index 9bc3f3339583d..d0971bd4ef079 100644
--- a/openmp/libomptarget/src/CMakeLists.txt
+++ b/openmp/libomptarget/src/CMakeLists.txt
@@ -73,6 +73,7 @@ if (NOT LIBOMPTARGET_PLUGINS_TO_LOAD)
 	check_plugin_target(cuda)
 	check_plugin_target(aarch64)
 	check_plugin_target(amdgpu)
+	check_plugin_target(s390x)
 endif()
 
 list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD PREPEND "\"libomptarget.rtl.")
diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 85fb08c00a9a7..c85f9868e37c2 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -16,6 +16,7 @@
 #include "rtl.h"
 
 #include "OpenMP/InternalTypes.h"
+#include "OpenMP/Mapping.h"
 #include "OpenMP/OMPT/Interface.h"
 #include "OpenMP/omp.h"
 #include "Shared/Profile.h"
@@ -27,6 +28,13 @@
 #include <cstring>
 #include <mutex>
 
+EXTERN void ompx_dump_mapping_tables() {
+  ident_t Loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
+  auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
+  for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
+    dumpTargetPointerMappings(&Loc, Device, true);
+}
+
 #ifdef OMPT_SUPPORT
 using namespace llvm::omp::target::ompt;
 #endif
diff --git a/openmp/libomptarget/src/OpenMP/Mapping.cpp b/openmp/libomptarget/src/OpenMP/Mapping.cpp
index 9c0b219b6f15f..c6ff3aa54dd66 100644
--- a/openmp/libomptarget/src/OpenMP/Mapping.cpp
+++ b/openmp/libomptarget/src/OpenMP/Mapping.cpp
@@ -16,28 +16,33 @@
 #include "device.h"
 
 /// Dump a table of all the host-target pointer pairs on failure
-void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device) {
+void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
+                               bool toStdOut) {
   MappingInfoTy::HDTTMapAccessorTy HDTTMap =
       Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
-  if (HDTTMap->empty())
+  if (HDTTMap->empty()) {
+    DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
+              "OpenMP Host-Device pointer mappings table empty\n");
     return;
+  }
 
   SourceInfo Kernel(Loc);
-  INFO(OMP_INFOTYPE_ALL, Device.DeviceID,
-       "OpenMP Host-Device pointer mappings after block at %s:%d:%d:\n",
-       Kernel.getFilename(), Kernel.getLine(), Kernel.getColumn());
-  INFO(OMP_INFOTYPE_ALL, Device.DeviceID, "%-18s %-18s %s %s %s %s\n",
-       "Host Ptr", "Target Ptr", "Size (B)", "DynRefCount", "HoldRefCount",
-       "Declaration");
+  DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
+            "OpenMP Host-Device pointer mappings after block at %s:%d:%d:\n",
+            Kernel.getFilename(), Kernel.getLine(), Kernel.getColumn());
+  DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
+            "%-18s %-18s %s %s %s %s\n", "Host Ptr", "Target Ptr", "Size (B)",
+            "DynRefCount", "HoldRefCount", "Declaration");
   for (const auto &It : *HDTTMap) {
     HostDataToTargetTy &HDTT = *It.HDTT;
     SourceInfo Info(HDTT.HstPtrName);
-    INFO(OMP_INFOTYPE_ALL, Device.DeviceID,
-         DPxMOD " " DPxMOD " %-8" PRIuPTR " %-11s %-12s %s at %s:%d:%d\n",
-         DPxPTR(HDTT.HstPtrBegin), DPxPTR(HDTT.TgtPtrBegin),
-         HDTT.HstPtrEnd - HDTT.HstPtrBegin, HDTT.dynRefCountToStr().c_str(),
-         HDTT.holdRefCountToStr().c_str(), Info.getName(), Info.getFilename(),
-         Info.getLine(), Info.getColumn());
+    DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
+              DPxMOD " " DPxMOD " %-8" PRIuPTR " %-11s %-12s %s at %s:%d:%d\n",
+              DPxPTR(HDTT.HstPtrBegin), DPxPTR(HDTT.TgtPtrBegin),
+              HDTT.HstPtrEnd - HDTT.HstPtrBegin,
+              HDTT.dynRefCountToStr().c_str(), HDTT.holdRefCountToStr().c_str(),
+              Info.getName(), Info.getFilename(), Info.getLine(),
+              Info.getColumn());
   }
 }
 
@@ -511,7 +516,8 @@ static void printCopyInfoImpl(int DeviceId, bool H2D, void *SrcPtrBegin,
        "Copying data from %s to %s, %sPtr=" DPxMOD ", %sPtr=" DPxMOD
        ", Size=%" PRId64 ", Name=%s\n",
        H2D ? "host" : "device", H2D ? "device" : "host", H2D ? "Hst" : "Tgt",
-       DPxPTR(SrcPtrBegin), H2D ? "Tgt" : "Hst", DPxPTR(DstPtrBegin), Size,
+       DPxPTR(H2D ? SrcPtrBegin : DstPtrBegin), H2D ? "Tgt" : "Hst",
+       DPxPTR(H2D ? DstPtrBegin : SrcPtrBegin), Size,
        (HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
                               : "unknown");
 }
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index d5432a9eed380..f95544ec8329c 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -35,6 +35,7 @@ VERS1.0 {
     __tgt_push_mapper_component;
     __kmpc_push_target_tripcount;
     __kmpc_push_target_tripcount_mapper;
+    ompx_dump_mapping_tables;
     omp_get_mapped_ptr;
     omp_get_num_devices;
     omp_get_device_num;
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 8b89bc3ff7124..b7f547f1ec3d5 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -21,6 +21,8 @@
 
 #include "Utils/ExponentialBackoff.h"
 
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+
 #include <cassert>
 #include <cstdint>
 #include <cstdio>
@@ -218,11 +220,19 @@ EXTERN void __tgt_target_data_update_nowait_mapper(
 static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs,
                                        KernelArgsTy &LocalKernelArgs,
                                        int32_t NumTeams, int32_t ThreadLimit) {
-  if (KernelArgs->Version > 2)
+  if (KernelArgs->Version > OMP_KERNEL_ARG_VERSION)
     DP("Unexpected ABI version: %u\n", KernelArgs->Version);
 
-  if (KernelArgs->Version == 1) {
-    LocalKernelArgs.Version = 2;
+  uint32_t UpgradedVersion = KernelArgs->Version;
+  if (KernelArgs->Version < OMP_KERNEL_ARG_VERSION) {
+    // The upgraded version will be based on the kernel launch environment.
+    if (KernelArgs->Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
+      UpgradedVersion = OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR - 1;
+    else
+      UpgradedVersion = OMP_KERNEL_ARG_VERSION;
+  }
+  if (UpgradedVersion != KernelArgs->Version) {
+    LocalKernelArgs.Version = UpgradedVersion;
     LocalKernelArgs.NumArgs = KernelArgs->NumArgs;
     LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs;
     LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs;
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 3b3e17f0f311e..5bbf3a455c72a 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -30,6 +30,7 @@
 
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/bit.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Object/ObjectFile.h"
 
 #include <cassert>
@@ -1752,7 +1753,7 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
 
   KernelArgsTy KernelArgs = {0};
-  KernelArgs.Version = 2;
+  KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
   KernelArgs.NumArgs = NumArgs;
   KernelArgs.Tripcount = LoopTripCount;
   KernelArgs.NumTeams[0] = NumTeams;
diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
index 5095a69a375c9..3fe75f24db3e6 100644
--- a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
+++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
@@ -10,6 +10,8 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/openmp/libomptarget/test/api/ompx_dump_mapping_tables.cpp b/openmp/libomptarget/test/api/ompx_dump_mapping_tables.cpp
new file mode 100644
index 0000000000000..c170c2d738733
--- /dev/null
+++ b/openmp/libomptarget/test/api/ompx_dump_mapping_tables.cpp
@@ -0,0 +1,35 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+#include <cstdio>
+#include <omp.h>
+
+#define N 10
+
+int main() {
+  int *a = new __int32_t[N];     // mapped and released from device 0
+  int *b = new __int32_t[2 * N]; // mapped to device 0
+
+  // clang-format off
+  // CHECK: Mapping tables after target enter data:
+  // CHECK-NEXT: omptarget device 0 info: OpenMP Host-Device pointer mappings after block
+  // CHECK-NEXT: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration
+  // CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} {{[48]}}0
+  // CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} {{[48]}}0
+#pragma omp target enter data device(0) map(to : a[ : N])
+#pragma omp target enter data device(0) map(to : b[ : 2*N])
+  // clang-format on
+  printf("Mapping tables after target enter data:\n");
+  ompx_dump_mapping_tables();
+
+  // clang-format off
+  // CHECK: Mapping tables after target exit data for a:
+  // CHECK-NEXT: omptarget device 0 info: OpenMP Host-Device pointer mappings after block
+  // CHECK-NEXT: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration
+  // CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} 80
+#pragma omp target exit data device(0) map(release : a[ : N])
+  // clang-format on
+  printf("\nMapping tables after target exit data for a:\n");
+  ompx_dump_mapping_tables();
+
+  return 0;
+}
diff --git a/openmp/libomptarget/test/jit/empty_kernel_lvl1.c b/openmp/libomptarget/test/jit/empty_kernel_lvl1.c
index c908c8ba06509..a0b8cd448837d 100644
--- a/openmp/libomptarget/test/jit/empty_kernel_lvl1.c
+++ b/openmp/libomptarget/test/jit/empty_kernel_lvl1.c
@@ -32,5 +32,7 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include "empty_kernel.inc"
diff --git a/openmp/libomptarget/test/jit/empty_kernel_lvl2.c b/openmp/libomptarget/test/jit/empty_kernel_lvl2.c
index 0b88d33292c57..81a04f55ce43d 100644
--- a/openmp/libomptarget/test/jit/empty_kernel_lvl2.c
+++ b/openmp/libomptarget/test/jit/empty_kernel_lvl2.c
@@ -92,5 +92,7 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include "empty_kernel.inc"
diff --git a/openmp/libomptarget/test/jit/type_punning.c b/openmp/libomptarget/test/jit/type_punning.c
index 23aa69bba7b95..10e3d2cef718b 100644
--- a/openmp/libomptarget/test/jit/type_punning.c
+++ b/openmp/libomptarget/test/jit/type_punning.c
@@ -12,6 +12,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 // Ensure that there is only the kernel function left, not any outlined
 // parallel regions.
diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
index 6f9d8c2b128c9..46641200bb56e 100644
--- a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
+++ b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
@@ -13,6 +13,8 @@
 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 // REQUIRES: unified_shared_memory
 
diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy_globals.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy_globals.cpp
index 4a13d270aeebe..55dfb2807ebc7 100644
--- a/openmp/libomptarget/test/mapping/auto_zero_copy_globals.cpp
+++ b/openmp/libomptarget/test/mapping/auto_zero_copy_globals.cpp
@@ -9,6 +9,8 @@
 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 // REQUIRES: unified_shared_memory
 
diff --git a/openmp/libomptarget/test/offloading/barrier_fence.c b/openmp/libomptarget/test/offloading/barrier_fence.c
index 5d1096478ed9e..b9a8ca27965a0 100644
--- a/openmp/libomptarget/test/offloading/barrier_fence.c
+++ b/openmp/libomptarget/test/offloading/barrier_fence.c
@@ -7,6 +7,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/openmp/libomptarget/test/offloading/bug49334.cpp b/openmp/libomptarget/test/offloading/bug49334.cpp
index a22d3fe9f6653..1f19dab378810 100644
--- a/openmp/libomptarget/test/offloading/bug49334.cpp
+++ b/openmp/libomptarget/test/offloading/bug49334.cpp
@@ -9,6 +9,8 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 // UNSUPPORTED: amdgcn-amd-amdhsa
 // UNSUPPORTED: nvptx64-nvidia-cuda
 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
diff --git a/openmp/libomptarget/test/offloading/default_thread_limit.c b/openmp/libomptarget/test/offloading/default_thread_limit.c
index d32e7df418cbb..4da02bbb152e6 100644
--- a/openmp/libomptarget/test/offloading/default_thread_limit.c
+++ b/openmp/libomptarget/test/offloading/default_thread_limit.c
@@ -9,6 +9,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 __attribute__((optnone)) int optnone() { return 1; }
 
diff --git a/openmp/libomptarget/test/offloading/fortran/target-nested-target-data.f90 b/openmp/libomptarget/test/offloading/fortran/target-nested-target-data.f90
new file mode 100644
index 0000000000000..a694f2464546c
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target-nested-target-data.f90
@@ -0,0 +1,31 @@
+! Offloading test for target nested inside
+! a target data region
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+   integer :: A(10), B(10), C(10)
+
+   do I = 1, 10
+      A(I) = 1
+      B(I) = 2
+   end do
+   !$omp target data map(to: A, B) map(alloc: C)
+   !$omp target map(from: C)
+   do I = 1, 10
+      C(I) = A(I) + B(I) ! assigns 3, A:1 + B:2
+   end do
+   !$omp end target
+   !$omp target update from(C) ! updates C device -> host
+   !$omp end target data
+
+   print *, C ! should be all 3's
+
+end program
+
+! CHECK: 3 3 3 3 3 3 3 3 3 3
diff --git a/openmp/libomptarget/test/offloading/ompx_bare.c b/openmp/libomptarget/test/offloading/ompx_bare.c
index fb3810bd1df12..3dabdcd15e0d8 100644
--- a/openmp/libomptarget/test/offloading/ompx_bare.c
+++ b/openmp/libomptarget/test/offloading/ompx_bare.c
@@ -1,10 +1,13 @@
 // RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
+// RUN:   %fcheck-generic
 //
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <assert.h>
 #include <ompx.h>
diff --git a/openmp/libomptarget/test/offloading/ompx_coords.c b/openmp/libomptarget/test/offloading/ompx_coords.c
index 61dad61f46405..5e4e14b4c6dae 100644
--- a/openmp/libomptarget/test/offloading/ompx_coords.c
+++ b/openmp/libomptarget/test/offloading/ompx_coords.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <omp.h>
 #include <ompx.h>
diff --git a/openmp/libomptarget/test/offloading/ompx_saxpy_mixed.c b/openmp/libomptarget/test/offloading/ompx_saxpy_mixed.c
index 440b694e3ac70..f479be8a484fc 100644
--- a/openmp/libomptarget/test/offloading/ompx_saxpy_mixed.c
+++ b/openmp/libomptarget/test/offloading/ompx_saxpy_mixed.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <math.h>
 #include <omp.h>
diff --git a/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
index 5303a9463f155..10e1b33f1ce40 100644
--- a/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
+++ b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
@@ -6,6 +6,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <iostream>
 #include <vector>
diff --git a/openmp/libomptarget/test/offloading/small_trip_count.c b/openmp/libomptarget/test/offloading/small_trip_count.c
index d8bef667607a1..65f094f157469 100644
--- a/openmp/libomptarget/test/offloading/small_trip_count.c
+++ b/openmp/libomptarget/test/offloading/small_trip_count.c
@@ -9,6 +9,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #define N 128
 
diff --git a/openmp/libomptarget/test/offloading/small_trip_count_thread_limit.cpp b/openmp/libomptarget/test/offloading/small_trip_count_thread_limit.cpp
index 9796c2dc11663..b7ae52a62c83b 100644
--- a/openmp/libomptarget/test/offloading/small_trip_count_thread_limit.cpp
+++ b/openmp/libomptarget/test/offloading/small_trip_count_thread_limit.cpp
@@ -7,6 +7,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 int main(int argc, char *argv[]) {
   constexpr const int block_size = 256;
diff --git a/openmp/libomptarget/test/offloading/spmdization.c b/openmp/libomptarget/test/offloading/spmdization.c
index 2cf30ff593b99..77913bec8342f 100644
--- a/openmp/libomptarget/test/offloading/spmdization.c
+++ b/openmp/libomptarget/test/offloading/spmdization.c
@@ -11,6 +11,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/openmp/libomptarget/test/offloading/target_critical_region.cpp b/openmp/libomptarget/test/offloading/target_critical_region.cpp
index 9a741bef6c591..495632bf76e17 100644
--- a/openmp/libomptarget/test/offloading/target_critical_region.cpp
+++ b/openmp/libomptarget/test/offloading/target_critical_region.cpp
@@ -6,6 +6,8 @@
 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 // UNSUPPORTED: amdgcn-amd-amdhsa
 
 #include <omp.h>
diff --git a/openmp/libomptarget/test/offloading/thread_limit.c b/openmp/libomptarget/test/offloading/thread_limit.c
index 65275a8f27958..a8cc51b651dc9 100644
--- a/openmp/libomptarget/test/offloading/thread_limit.c
+++ b/openmp/libomptarget/test/offloading/thread_limit.c
@@ -9,6 +9,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 int main() {
   int n = 1 << 20;
diff --git a/openmp/libomptarget/test/offloading/workshare_chunk.c b/openmp/libomptarget/test/offloading/workshare_chunk.c
new file mode 100644
index 0000000000000..c2248547605be
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/workshare_chunk.c
@@ -0,0 +1,252 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// RUN: %libomptarget-compileopt-run-and-check-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+// clang-format off
+
+#include <omp.h>
+#include <stdio.h>
+
+#define N 100
+#define BLOCK_SHIFT 8
+
+void print(int *A, int size) {
+  for (int i = 0; i < size; ++i) {
+    printf("B%dT%d ", A[i] >> BLOCK_SHIFT, A[i] % (1 << BLOCK_SHIFT));
+  }
+  printf("\n");
+}
+
+int main() {
+  int A[N];
+
+#pragma omp target parallel for map(from:A) num_threads(10) schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target parallel for thread chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute map(from:A) num_teams(10) \
+        dist_schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute block chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) dist_schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 2);
+  printf("thread chunk size default\n");
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) dist_schedule(static, 2) schedule(static, 3)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 2);
+  printf("thread chunk size %d\n", 3);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) dist_schedule(static, 3) schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 3);
+  printf("thread chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) dist_schedule(static, 5) schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 5);
+  printf("thread chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) num_teams(10) \
+        dist_schedule(static, 49) schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 49);
+  printf("thread chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) num_threads(10) dist_schedule(static, 29)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 29);
+  printf("thread chunk size default\n");
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) num_threads(10) dist_schedule(static, 101)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 101);
+  printf("thread chunk size default\n");
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(9) num_threads(10) schedule(static, 101)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for default block chunk size ");
+  printf("thread chunk size %d\n", 101);
+  print(A, N);
+  return 0;
+}
+//CHECK:      omp target parallel for thread chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+
+//CHECK:      omp target teams distribute block chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 2 thread chunk size default
+
+//CHECK-NEXT: B0T0 B0T1 B1T0 B1T1 B2T0 B2T1 B3T0 B3T1 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T1 B6T0 B6T1 B7T0 B7T1 B8T0 B8T1 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T1 B1T0 B1T1 B2T0 B2T1 B3T0 B3T1 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T1 B6T0 B6T1 B7T0 B7T1 B8T0 B8T1 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T1 B1T0 B1T1 B2T0 B2T1 B3T0 B3T1 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T1 B6T0 B6T1 B7T0 B7T1 B8T0 B8T1 B9T0 B9T1
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME  block chunk size 2 thread chunk size 3
+
+//CHECK-NEXT: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 3 thread chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1
+//CHECK-SAME: B3T0 B3T0 B3T1 B4T0 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T0 B5T1 B6T0 B6T0 B6T1 B7T0 B7T0 B7T1
+//CHECK-SAME: B8T0 B8T0 B8T1 B9T0 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1
+//CHECK-SAME: B3T0 B3T0 B3T1 B4T0 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T0 B5T1 B6T0 B6T0 B6T1 B7T0 B7T0 B7T1
+//CHECK-SAME: B8T0 B8T0 B8T1 B9T0 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1
+//CHECK-SAME: B3T0 B3T0 B3T1 B4T0 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T0 B5T1 B6T0 B6T0 B6T1 B7T0 B7T0 B7T1
+//CHECK-SAME: B8T0 B8T0 B8T1 B9T0 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1 B3T0
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 5 thread chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B1T0 B1T0 B1T1 B1T1 B1T2
+//CHECK-SAME: B2T0 B2T0 B2T1 B2T1 B2T2 B3T0 B3T0 B3T1 B3T1 B3T2
+//CHECK-SAME: B4T0 B4T0 B4T1 B4T1 B4T2 B5T0 B5T0 B5T1 B5T1 B5T2
+//CHECK-SAME: B6T0 B6T0 B6T1 B6T1 B6T2 B7T0 B7T0 B7T1 B7T1 B7T2
+//CHECK-SAME: B8T0 B8T0 B8T1 B8T1 B8T2 B9T0 B9T0 B9T1 B9T1 B9T2
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B1T0 B1T0 B1T1 B1T1 B1T2
+//CHECK-SAME: B2T0 B2T0 B2T1 B2T1 B2T2 B3T0 B3T0 B3T1 B3T1 B3T2
+//CHECK-SAME: B4T0 B4T0 B4T1 B4T1 B4T2 B5T0 B5T0 B5T1 B5T1 B5T2
+//CHECK-SAME: B6T0 B6T0 B6T1 B6T1 B6T2 B7T0 B7T0 B7T1 B7T1 B7T2
+//CHECK-SAME: B8T0 B8T0 B8T1 B8T1 B8T2 B9T0 B9T0 B9T1 B9T1 B9T2
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 49 thread chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4 B0T5 B0T5
+//CHECK-SAME: B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9 B0T10 B0T10 B0T11 B0T11
+//CHECK-SAME: B0T12 B0T12 B0T13 B0T13 B0T14 B0T14 B0T15 B0T15 B0T16 B0T16
+//CHECK-SAME: B0T17 B0T17 B0T18 B0T18 B0T19 B0T19 B0T20 B0T20 B0T21 B0T21
+//CHECK-SAME: B0T22 B0T22 B0T23 B0T23 B0T24
+//CHECK-SAME: B1T0 B1T0 B1T1 B1T1 B1T2 B1T2 B1T3 B1T3 B1T4 B1T4 B1T5 B1T5
+//CHECK-SAME: B1T6 B1T6 B1T7 B1T7 B1T8 B1T8 B1T9 B1T9 B1T10 B1T10 B1T11 B1T11
+//CHECK-SAME: B1T12 B1T12 B1T13 B1T13 B1T14 B1T14 B1T15 B1T15 B1T16 B1T16
+//CHECK-SAME: B1T17 B1T17 B1T18 B1T18 B1T19 B1T19 B1T20 B1T20 B1T21 B1T21
+//CHECK-SAME: B1T22 B1T22 B1T23 B1T23 B1T24
+//CHECK-SAME: B2T0 B2T0
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 29 thread chunk size default
+
+//CHECK-NEXT: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8
+//CHECK-SAME: B1T0 B1T1 B1T2 B1T3 B1T4 B1T5 B1T6 B1T7 B1T8 B1T9
+//CHECK-SAME: B1T0 B1T1 B1T2 B1T3 B1T4 B1T5 B1T6 B1T7 B1T8 B1T9
+//CHECK-SAME: B1T0 B1T1 B1T2 B1T3 B1T4 B1T5 B1T6 B1T7 B1T8
+//CHECK-SAME: B2T0 B2T1 B2T2 B2T3 B2T4 B2T5 B2T6 B2T7 B2T8 B2T9
+//CHECK-SAME: B2T0 B2T1 B2T2 B2T3 B2T4 B2T5 B2T6 B2T7 B2T8 B2T9
+//CHECK-SAME: B2T0 B2T1 B2T2 B2T3 B2T4 B2T5 B2T6 B2T7 B2T8
+//CHECK-SAME: B3T0 B3T1 B3T2 B3T3 B3T4 B3T5 B3T6 B3T7 B3T8 B3T9
+//CHECK-SAME: B3T0 B3T1 B3T2
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 101 thread chunk size default
+
+//CHECK-NEXT: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: default block chunk size thread chunk size 101
+
+//CHECK-NEXT: B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0
+//CHECK-SAME: B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0
+//CHECK-SAME: B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0
+//CHECK-SAME: B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0
+//CHECK-SAME: B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0
+//CHECK-SAME: B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0
+//CHECK-SAME: B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0
+//CHECK-SAME: B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0
+//CHECK-SAME: B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0
diff --git a/openmp/libomptarget/test/ompt/target_memcpy.c b/openmp/libomptarget/test/ompt/target_memcpy.c
index 80a8d6a4b32e5..3224c5ac4c916 100644
--- a/openmp/libomptarget/test/ompt/target_memcpy.c
+++ b/openmp/libomptarget/test/ompt/target_memcpy.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Verify that for the target OpenMP APIs, the return address is non-null and
diff --git a/openmp/libomptarget/test/ompt/target_memcpy_emi.c b/openmp/libomptarget/test/ompt/target_memcpy_emi.c
index 5347f38b87b6f..fd7da13cb05d0 100644
--- a/openmp/libomptarget/test/ompt/target_memcpy_emi.c
+++ b/openmp/libomptarget/test/ompt/target_memcpy_emi.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Verify all three data transfer directions: H2D, D2D and D2H
diff --git a/openmp/libomptarget/test/ompt/veccopy.c b/openmp/libomptarget/test/ompt/veccopy.c
index 80e71fd8a48cb..bc70f5a0b5b67 100644
--- a/openmp/libomptarget/test/ompt/veccopy.c
+++ b/openmp/libomptarget/test/ompt/veccopy.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that registers non-EMI callbacks
diff --git a/openmp/libomptarget/test/ompt/veccopy_data.c b/openmp/libomptarget/test/ompt/veccopy_data.c
index cef1de316a7a1..264b484b881f1 100644
--- a/openmp/libomptarget/test/ompt/veccopy_data.c
+++ b/openmp/libomptarget/test/ompt/veccopy_data.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that registers EMI callbacks.
diff --git a/openmp/libomptarget/test/ompt/veccopy_disallow_both.c b/openmp/libomptarget/test/ompt/veccopy_disallow_both.c
index 06293e413ba45..a011c21955bb2 100644
--- a/openmp/libomptarget/test/ompt/veccopy_disallow_both.c
+++ b/openmp/libomptarget/test/ompt/veccopy_disallow_both.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that shows that both EMI and non-EMI
diff --git a/openmp/libomptarget/test/ompt/veccopy_emi.c b/openmp/libomptarget/test/ompt/veccopy_emi.c
index b597d7be6aff6..8718a39b4af81 100644
--- a/openmp/libomptarget/test/ompt/veccopy_emi.c
+++ b/openmp/libomptarget/test/ompt/veccopy_emi.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that registers EMI callbacks
diff --git a/openmp/libomptarget/test/ompt/veccopy_emi_map.c b/openmp/libomptarget/test/ompt/veccopy_emi_map.c
index ce6f6e30d5815..2accba34034e6 100644
--- a/openmp/libomptarget/test/ompt/veccopy_emi_map.c
+++ b/openmp/libomptarget/test/ompt/veccopy_emi_map.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that shows that map-EMI callbacks are not supported.
diff --git a/openmp/libomptarget/test/ompt/veccopy_map.c b/openmp/libomptarget/test/ompt/veccopy_map.c
index 83f63a6e6049e..56a0dd48f5368 100644
--- a/openmp/libomptarget/test/ompt/veccopy_map.c
+++ b/openmp/libomptarget/test/ompt/veccopy_map.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that shows that map callbacks are not supported.
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index a1488ae9d21c6..eb3ab7778606a 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -156,6 +156,8 @@
     /* OpenMP 5.1 interop */
     typedef intptr_t omp_intptr_t;
 
+    extern void __KAI_KMPC_CONVENTION ompx_dump_mapping_tables(void);
+
     /* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined properties */
     typedef enum omp_interop_property {
         omp_ipr_fr_id = -1,
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 569a1ab9b477c..885d6636abe4a 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1402,19 +1402,9 @@ extern void __kmp_query_cpuid(kmp_cpuinfo_t *p);
 // subleaf is only needed for cache and topology discovery and can be set to
 // zero in most cases
 static inline void __kmp_x86_cpuid(int leaf, int subleaf, struct kmp_cpuid *p) {
-#if KMP_ARCH_X86 && (defined(__pic__) || defined(__PIC__))
-  // on i386 arch, the ebx reg. is used by pic, thus we need to preserve from
-  // being trashed beforehand
-  __asm__ __volatile__("mov %%ebx, %%edi\n"
-                       "cpuid\n"
-                       "xchg %%edi, %%ebx\n"
-                       : "=a"(p->eax), "=b"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
-                       : "a"(leaf), "c"(subleaf));
-#else
   __asm__ __volatile__("cpuid"
                        : "=a"(p->eax), "=b"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
                        : "a"(leaf), "c"(subleaf));
-#endif
 }
 // Load p into FPU control word
 static inline void __kmp_load_x87_fpu_control_word(const kmp_int16 *p) {
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index c3ee4de75a237..048bd174fc95a 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -327,6 +327,9 @@ void kmp_topology_t::_insert_windows_proc_groups() {
   KMP_CPU_FREE(mask);
   _insert_layer(KMP_HW_PROC_GROUP, ids);
   __kmp_free(ids);
+
+  // sort topology after adding proc groups
+  __kmp_topology->sort_ids();
 }
 #endif
 
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 3f831d6e2a8f9..d751a417331ce 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -2610,6 +2610,43 @@ int __kmp_get_load_balance(int max) {
       KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||            \
       KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC_XCOFF)
 
+// Because WebAssembly will use `call_indirect` to invoke the microtask and
+// WebAssembly indirect calls check that the called signature is a precise
+// match, we need to cast each microtask function pointer back from `void *` to
+// its original type.
+typedef void (*microtask_t0)(int *, int *);
+typedef void (*microtask_t1)(int *, int *, void *);
+typedef void (*microtask_t2)(int *, int *, void *, void *);
+typedef void (*microtask_t3)(int *, int *, void *, void *, void *);
+typedef void (*microtask_t4)(int *, int *, void *, void *, void *, void *);
+typedef void (*microtask_t5)(int *, int *, void *, void *, void *, void *,
+                             void *);
+typedef void (*microtask_t6)(int *, int *, void *, void *, void *, void *,
+                             void *, void *);
+typedef void (*microtask_t7)(int *, int *, void *, void *, void *, void *,
+                             void *, void *, void *);
+typedef void (*microtask_t8)(int *, int *, void *, void *, void *, void *,
+                             void *, void *, void *, void *);
+typedef void (*microtask_t9)(int *, int *, void *, void *, void *, void *,
+                             void *, void *, void *, void *, void *);
+typedef void (*microtask_t10)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *);
+typedef void (*microtask_t11)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *);
+typedef void (*microtask_t12)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *, void *);
+typedef void (*microtask_t13)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *, void *, void *);
+typedef void (*microtask_t14)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *, void *, void *, void *);
+typedef void (*microtask_t15)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *);
+
 // we really only need the case with 1 argument, because CLANG always build
 // a struct of pointers to shared variables referenced in the outlined function
 int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
@@ -2629,66 +2666,76 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
     fflush(stderr);
     exit(-1);
   case 0:
-    (*pkfn)(&gtid, &tid);
+    (*(microtask_t0)pkfn)(&gtid, &tid);
     break;
   case 1:
-    (*pkfn)(&gtid, &tid, p_argv[0]);
+    (*(microtask_t1)pkfn)(&gtid, &tid, p_argv[0]);
     break;
   case 2:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
+    (*(microtask_t2)pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
     break;
   case 3:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
+    (*(microtask_t3)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
     break;
   case 4:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
+    (*(microtask_t4)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3]);
     break;
   case 5:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
+    (*(microtask_t5)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4]);
     break;
   case 6:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5]);
+    (*(microtask_t6)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4], p_argv[5]);
     break;
   case 7:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6]);
+    (*(microtask_t7)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4], p_argv[5], p_argv[6]);
     break;
   case 8:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7]);
+    (*(microtask_t8)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                          p_argv[7]);
     break;
   case 9:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8]);
+    (*(microtask_t9)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7],
+                          p_argv[8]);
     break;
   case 10:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9]);
+    (*(microtask_t10)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9]);
     break;
   case 11:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
+    (*(microtask_t11)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
     break;
   case 12:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11]);
+    (*(microtask_t12)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+                           p_argv[11]);
     break;
   case 13:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11], p_argv[12]);
+    (*(microtask_t13)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+                           p_argv[11], p_argv[12]);
     break;
   case 14:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11], p_argv[12], p_argv[13]);
+    (*(microtask_t14)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+                           p_argv[11], p_argv[12], p_argv[13]);
     break;
   case 15:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
+    (*(microtask_t15)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+                           p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
     break;
   }
 
diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index 4a457f4cc41f7..a3456063c10fc 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -129,7 +129,7 @@ if config.operating_system == 'NetBSD':
 if config.operating_system == 'Darwin':
     config.available_features.add("darwin")
 
-if config.operating_system in ['Linux', 'Windows']:
+if config.operating_system in ['Windows', 'Linux', 'FreeBSD', 'NetBSD', 'DragonFly']:
     config.available_features.add('affinity')
 
 if config.operating_system in ['Linux']:
diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp
index a226cc2a1b250..8b2207ecbf362 100644
--- a/polly/lib/CodeGen/IslNodeBuilder.cpp
+++ b/polly/lib/CodeGen/IslNodeBuilder.cpp
@@ -1212,7 +1212,7 @@ bool IslNodeBuilder::preloadInvariantEquivClass(
   BasicBlock *EntryBB = &Builder.GetInsertBlock()->getParent()->getEntryBlock();
   auto *Alloca = new AllocaInst(AccInstTy, DL.getAllocaAddrSpace(),
                                 AccInst->getName() + ".preload.s2a",
-                                &*EntryBB->getFirstInsertionPt());
+                                EntryBB->getFirstInsertionPt());
   Builder.CreateStore(PreloadVal, Alloca);
   ValueMapT PreloadedPointer;
   PreloadedPointer[PreloadVal] = AccInst;
@@ -1308,10 +1308,11 @@ void IslNodeBuilder::allocateNewArrays(BBPair StartExitBlocks) {
       auto InstIt = Builder.GetInsertBlock()
                         ->getParent()
                         ->getEntryBlock()
-                        .getTerminator();
+                        .getTerminator()
+                        ->getIterator();
 
       auto *CreatedArray = new AllocaInst(NewArrayType, DL.getAllocaAddrSpace(),
-                                          SAI->getName(), &*InstIt);
+                                          SAI->getName(), InstIt);
       if (PollyTargetFirstLevelCacheLineSize)
         CreatedArray->setAlignment(Align(PollyTargetFirstLevelCacheLineSize));
       SAI->setBasePtr(CreatedArray);
diff --git a/polly/lib/CodeGen/LoopGenerators.cpp b/polly/lib/CodeGen/LoopGenerators.cpp
index 5c99515c54440..b4f8bb8948c28 100644
--- a/polly/lib/CodeGen/LoopGenerators.cpp
+++ b/polly/lib/CodeGen/LoopGenerators.cpp
@@ -225,7 +225,7 @@ ParallelLoopGenerator::storeValuesIntoStruct(SetVector<Value *> &Values) {
   // in the entry block of the function and use annotations to denote the actual
   // live span (similar to clang).
   BasicBlock &EntryBB = Builder.GetInsertBlock()->getParent()->getEntryBlock();
-  Instruction *IP = &*EntryBB.getFirstInsertionPt();
+  BasicBlock::iterator IP = EntryBB.getFirstInsertionPt();
   StructType *Ty = StructType::get(Builder.getContext(), Members);
   AllocaInst *Struct = new AllocaInst(Ty, DL.getAllocaAddrSpace(), nullptr,
                                       "polly.par.userContext", IP);
diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp
index afdb61f25b6c1..24c7011b06de9 100644
--- a/polly/lib/Support/ScopHelper.cpp
+++ b/polly/lib/Support/ScopHelper.cpp
@@ -328,8 +328,9 @@ struct ScopExpander final : SCEVVisitor<ScopExpander, const SCEV *> {
     Value *LHS = expandCodeFor(LHSScev, E->getType(), IP);
     Value *RHS = expandCodeFor(RHSScev, E->getType(), IP);
 
-    Inst = BinaryOperator::Create((Instruction::BinaryOps)Inst->getOpcode(),
-                                  LHS, RHS, Inst->getName() + Name, IP);
+    Inst =
+        BinaryOperator::Create((Instruction::BinaryOps)Inst->getOpcode(), LHS,
+                               RHS, Inst->getName() + Name, IP->getIterator());
     return SE.getSCEV(Inst);
   }
 
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clangd/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clangd/BUILD.bazel
new file mode 100644
index 0000000000000..f2aa64b639985
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clangd/BUILD.bazel
@@ -0,0 +1,71 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+)
+
+licenses(["notice"])
+
+# TODO: this is a shim to provide Features.inc as needed by Feature.h.
+# Replace this with something that parses Features.inc.in.
+genrule(
+    name = "gen_features_inc",
+    outs = ["Features.inc"],
+    cmd = "\n".join([
+        "echo '// IWYU pragma: private, include \"Feature.h\"' >> $@",
+        "echo '#define CLANGD_BUILD_XPC 0' >> $@",
+        "echo '#define CLANGD_ENABLE_REMOTE 1' >> $@",
+        "echo '#define ENABLE_GRPC_REFLECTION 0' >> $@",
+        "echo '#define CLANGD_MALLOC_TRIM 0' >> $@",
+        "echo '#define CLANGD_TIDY_CHECKS 1' >> $@",
+        "echo '#define CLANGD_DECISION_FOREST 1' >> $@",
+    ]),
+)
+
+# TODO: Pick up other files for more complete functionality, to match
+# clangd/CMakeLists.txt. This might look something like
+# glob(["*.cpp", "dir/**/*.cpp", ...]).
+cc_library(
+    name = "ClangDaemon",
+    srcs = [
+        "Feature.cpp",
+        "Features.inc",
+        "JSONTransport.cpp",
+        "Protocol.cpp",
+        "URI.cpp",
+        "index/SymbolID.cpp",
+        "support/Cancellation.cpp",
+        "support/Context.cpp",
+        "support/Logger.cpp",
+        "support/MemoryTree.cpp",
+        "support/Shutdown.cpp",
+        "support/ThreadCrashReporter.cpp",
+        "support/Trace.cpp",
+    ],
+    hdrs = [
+        "Feature.h",
+        "LSPBinder.h",
+        "Protocol.h",
+        "Transport.h",
+        "URI.h",
+        "index/SymbolID.h",
+        "support/Cancellation.h",
+        "support/Context.h",
+        "support/Function.h",
+        "support/Logger.h",
+        "support/MemoryTree.h",
+        "support/Shutdown.h",
+        "support/ThreadCrashReporter.h",
+        "support/Trace.h",
+    ],
+    includes = ["."],
+    deps = [
+        "//clang:basic",
+        "//clang:index",
+        "//llvm:Support",
+        "//llvm:TargetParser",
+    ],
+)
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index d6b124f9d8e4c..865cafbf50c63 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -62,6 +62,7 @@ gentbl(
         "CrossTU",
         "Driver",
         "Frontend",
+        "InstallAPI",
         "Lex",
         "Parse",
         "Refactoring",
@@ -2068,6 +2069,7 @@ cc_library(
         ":frontend",
         ":support",
         "//llvm:Core",
+        "//llvm:Demangle",
         "//llvm:Support",
         "//llvm:TextAPI",
     ],
@@ -2405,6 +2407,7 @@ cc_binary(
     stamp = 0,
     deps = [
         ":basic",
+        ":diagnostic_defs_gen",
         ":frontend",
         ":lex",
         ":sema",
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 073353a89c890..1d98b986f4818 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -62,7 +62,41 @@ config_setting(
     flag_values = {":mpfr": "system"},
 )
 
-############################## Support libraries #############################
+################################# Include Files ################################
+
+libc_support_library(
+    name = "llvm_libc_macros_math_macros",
+    hdrs = ["include/llvm-libc-macros/math-macros.h"],
+)
+
+libc_support_library(
+    name = "llvm_libc_macros_limits_macros",
+    hdrs = ["include/llvm-libc-macros/limits-macros.h"],
+)
+
+libc_support_library(
+    name = "llvm_libc_macros_float_macros",
+    hdrs = ["include/llvm-libc-macros/float-macros.h"],
+)
+
+libc_support_library(
+    name = "llvm_libc_macros_stdint_macros",
+    hdrs = ["include/llvm-libc-macros/stdint-macros.h"],
+)
+
+libc_support_library(
+    name = "llvm_libc_macros_stdfix_macros",
+    hdrs = ["include/llvm-libc-macros/stdfix-macros.h"],
+    deps = [":llvm_libc_macros_float_macros"],
+)
+
+libc_support_library(
+    name = "llvm_libc_types_float128",
+    hdrs = ["include/llvm-libc-types/float128.h"],
+    deps = [":llvm_libc_macros_float_macros"],
+)
+
+############################### Support libraries ##############################
 
 libc_support_library(
     name = "__support_macros_properties_architectures",
@@ -151,10 +185,20 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_cpp_iterator",
+    hdrs = ["src/__support/CPP/iterator.h"],
+    deps = [
+        ":__support_cpp_type_traits",
+        ":__support_macros_attributes",
+    ],
+)
+
 libc_support_library(
     name = "__support_cpp_array",
     hdrs = ["src/__support/CPP/array.h"],
     deps = [
+        ":__support_cpp_iterator",
         ":__support_macros_attributes",
     ],
 )
@@ -528,6 +572,7 @@ libc_support_library(
         ":__support_cpp_type_traits",
         ":__support_ctype_utils",
         ":__support_str_to_num_result",
+        ":__support_uint128",
         ":errno",
     ],
 )
@@ -670,6 +715,7 @@ libc_support_library(
         ":__support_macros_properties_architectures",
         ":__support_macros_sanitizer",
         ":errno",
+        ":llvm_libc_macros_math_macros",
     ],
 )
 
@@ -727,7 +773,10 @@ libc_support_library(
 libc_support_library(
     name = "__support_fputil_manipulation_functions",
     hdrs = ["src/__support/FPUtil/ManipulationFunctions.h"],
-    textual_hdrs = ["src/__support/FPUtil/x86_64/NextAfterLongDouble.h"],
+    textual_hdrs = [
+        "src/__support/FPUtil/x86_64/NextAfterLongDouble.h",
+        "src/__support/FPUtil/x86_64/NextUpDownLongDouble.h",
+    ],
     deps = [
         ":__support_common",
         ":__support_cpp_bit",
@@ -739,6 +788,7 @@ libc_support_library(
         ":__support_fputil_normal_float",
         ":__support_macros_optimization",
         ":__support_uint128",
+        ":llvm_libc_macros_math_macros",
     ],
 )
 
@@ -752,6 +802,7 @@ libc_support_library(
         ":__support_fputil_fp_bits",
         ":__support_fputil_rounding_mode",
         ":__support_macros_attributes",
+        ":llvm_libc_macros_math_macros",
     ],
 )
 
@@ -980,33 +1031,6 @@ libc_support_library(
     ],
 )
 
-libc_support_library(
-    name = "llvm_libc_macros_limits_macros",
-    hdrs = ["include/llvm-libc-macros/limits-macros.h"],
-)
-
-libc_support_library(
-    name = "llvm_libc_macros_float_macros",
-    hdrs = ["include/llvm-libc-macros/float-macros.h"],
-)
-
-libc_support_library(
-    name = "llvm_libc_macros_stdint_macros",
-    hdrs = ["include/llvm-libc-macros/stdint-macros.h"],
-)
-
-libc_support_library(
-    name = "llvm_libc_macros_stdfix_macros",
-    hdrs = ["include/llvm-libc-macros/stdfix-macros.h"],
-    deps = [":llvm_libc_macros_float_macros"],
-)
-
-libc_support_library(
-    name = "llvm_libc_types_float128",
-    hdrs = ["include/llvm-libc-types/float128.h"],
-    deps = [":llvm_libc_macros_float_macros"],
-)
-
 ############################### errno targets ################################
 
 libc_function(
@@ -1173,6 +1197,7 @@ libc_support_library(
         "__support_cpp_type_traits",
         ":__support_common",
         ":errno",
+        ":llvm_libc_macros_math_macros",
     ],
 )
 
@@ -1234,13 +1259,9 @@ libc_support_library(
     hdrs = ["src/math/generic/inv_trigf_utils.h"],
     deps = [
         ":__support_common",
-        ":__support_fputil_fenv_impl",
         ":__support_fputil_fma",
-        ":__support_fputil_fp_bits",
         ":__support_fputil_multiply_add",
-        ":__support_fputil_nearest_integer",
         ":__support_fputil_polyeval",
-        ":math_utils",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
index 4a94916e1205e..2a0c071f22868 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
@@ -84,6 +84,7 @@ libc_support_library(
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_fputil_fpbits_str",
         "//libc:__support_fputil_rounding_mode",
+        "//libc:llvm_libc_macros_math_macros",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
index 5434761a9b139..4f976122967c4 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
@@ -64,8 +64,8 @@ libc_test(
     name = "integer_to_string_test",
     srcs = ["integer_to_string_test.cpp"],
     deps = [
-        "//libc:__support_cpp_span",
         "//libc:__support_cpp_limits",
+        "//libc:__support_cpp_span",
         "//libc:__support_cpp_string_view",
         "//libc:__support_integer_literals",
         "//libc:__support_integer_to_string",
@@ -89,6 +89,7 @@ libc_test(
         "//libc:__support_cpp_optional",
         "//libc:__support_macros_properties_types",
         "//libc:__support_uint",
+        "//libc:llvm_libc_macros_math_macros",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel
index 63e18b8371091..15e367f0aca2d 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel
@@ -178,6 +178,7 @@ libc_support_library(
     deps = [
         "//libc:__support_fputil_basic_operations",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
         "//libc/utils/MPFRWrapper:mpfr_wrapper",
@@ -296,6 +297,7 @@ libc_support_library(
         "//libc:__support_cpp_limits",
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_fputil_manipulation_functions",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
     ],
 )
@@ -322,6 +324,7 @@ libc_support_library(
         "//libc:__support_fputil_basic_operations",
         "//libc:__support_fputil_fenv_impl",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
@@ -349,6 +352,7 @@ libc_support_library(
         "//libc:__support_cpp_limits",
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_fputil_normal_float",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
@@ -375,6 +379,7 @@ libc_support_library(
     deps = [
         "//libc:__support_fputil_fenv_impl",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
         "//libc/utils/MPFRWrapper:mpfr_wrapper",
@@ -411,6 +416,7 @@ libc_support_library(
     deps = [
         "//libc:__support_fputil_fenv_impl",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
         "//libc/utils/MPFRWrapper:mpfr_wrapper",
@@ -522,6 +528,7 @@ libc_support_library(
         "//libc:__support_cpp_type_traits",
         "//libc:__support_fputil_basic_operations",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
index aba259ba6401a..1a5868d242e80 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
@@ -34,6 +34,7 @@ def math_test(name, hdrs = [], deps = [], **kwargs):
             "//libc:__support_math_extras",
             "//libc:__support_uint128",
             "//libc/test/UnitTest:fp_test_helpers",
+            "//libc:llvm_libc_macros_math_macros",
         ] + deps,
         **kwargs
     )
diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
index 99b924626c2af..c8863af43c400 100644
--- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
@@ -312,6 +312,7 @@ cc_test(
         "//llvm:AllTargetsAsmParsers",
         "//llvm:AllTargetsCodeGens",
         "//llvm:AsmParser",
+        "//llvm:BinaryFormat",
         "//llvm:Core",
         "//llvm:ExecutionEngine",
         "//llvm:IRReader",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 28a69c7ffea1f..ba3f60380d340 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -1773,8 +1773,6 @@ cc_library(
         ":EmitCDialect",
         ":FuncDialect",
         ":IR",
-        ":MathDialect",
-        ":SCFDialect",
         ":Support",
         ":TranslateLib",
         "//llvm:Support",
@@ -1938,17 +1936,14 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
-        ":ArmNeonIncGen",
         ":ArmNeonDialect",
+        ":DialectUtils",
         ":FuncDialect",
         ":IR",
         ":LLVMDialect",
-        ":SideEffectInterfaces",
         ":Support",
-        ":VectorDialect",
         ":Transforms",
-        "//llvm:Core",
-        "//llvm:Support",
+        ":VectorDialect",
     ],
 )
 
@@ -2854,6 +2849,7 @@ cc_library(
         ":ArithDialect",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":DestinationStyleOpInterface",
         ":DialectUtils",
@@ -3127,6 +3123,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
@@ -3199,6 +3196,7 @@ cc_library(
         ":ArithDialect",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":ComplexDialect",
         ":DialectUtils",
@@ -3671,6 +3669,8 @@ td_library(
     deps = [
         ":BuiltinDialectTdFiles",
         ":OpBaseTdFiles",
+        ":ShapedOpInterfacesTdFiles",
+        ":ViewLikeInterfaceTdFiles",
     ],
 )
 
@@ -3754,7 +3754,10 @@ cc_library(
     hdrs = ["include/mlir/Dialect/XeGPU/IR/XeGPU.h"],
     includes = ["include"],
     deps = [
+        ":DialectUtils",
         ":IR",
+        ":ShapedOpInterfaces",
+        ":ViewLikeInterface",
         ":XeGPUIncGen",
         "//llvm:Core",
         "//llvm:Support",
@@ -3888,9 +3891,11 @@ cc_library(
         ":AffineMemoryOpInterfacesIncGen",
         ":AffineOpsIncGen",
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":ControlFlowInterfaces",
         ":DialectUtils",
         ":IR",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MemRefDialect",
         ":ShapedOpInterfaces",
@@ -4291,6 +4296,7 @@ cc_library(
     deps = [
         ":ArithDialect",
         ":ArithUtils",
+        ":BufferizationInterfaces",
         ":ControlFlowDialect",
         ":ControlFlowInterfaces",
         ":DestinationStyleOpInterface",
@@ -4298,6 +4304,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MemRefDialect",
         ":ParallelCombiningOpInterface",
@@ -4368,6 +4375,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "InliningUtils",
+    srcs = ["lib/Transforms/Utils/InliningUtils.cpp"],
+    hdrs = ["include/mlir/Transforms/InliningUtils.h"],
+    includes = ["include"],
+    deps = [
+        ":CallOpInterfaces",
+        ":IR",
+        "//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "LoopLikeInterface",
     srcs = ["lib/Interfaces/LoopLikeInterface.cpp"],
@@ -4538,6 +4557,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":CastInterfaces",
         ":ControlFlowInterfaces",
         ":Dialect",
@@ -4545,6 +4565,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MLIRShapeCanonicalizationIncGen",
         ":ShapeOpsIncGen",
         ":SideEffectInterfaces",
@@ -4623,6 +4644,7 @@ cc_library(
     deps = [
         ":ArithDialect",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":FuncDialect",
         ":IR",
@@ -4703,11 +4725,13 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":CommonFolders",
         ":ControlFlowInterfaces",
         ":ControlFlowOpsIncGen",
         ":ConvertToLLVMInterface",
         ":IR",
+        ":InliningUtils",
         ":SideEffectInterfaces",
         ":Support",
         "//llvm:Support",
@@ -4725,6 +4749,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":ControlFlowDialect",
         ":IR",
@@ -4744,10 +4769,11 @@ cc_library(
     hdrs = glob([
         "include/mlir/Dialect/Func/IR/*.h",
         "include/mlir/Dialect/Func/Utils/*.h",
-    ]) + ["include/mlir/Transforms/InliningUtils.h"],
+    ]),
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":CallOpInterfaces",
         ":CastInterfaces",
         ":CommonFolders",
@@ -4758,6 +4784,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":SideEffectInterfaces",
         ":Support",
         "//llvm:Support",
@@ -4774,6 +4801,7 @@ cc_library(
         ":FuncDialect",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MeshShardingInterface",
     ],
 )
@@ -4924,16 +4952,19 @@ cc_library(
         ":AffineDialect",
         ":ArithDialect",
         ":ArithUtils",
+        ":BufferizationInterfaces",
         ":ControlFlowInterfaces",
         ":DataLayoutInterfaces",
         ":DestinationStyleOpInterface",
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MaskableOpInterface",
         ":MaskingOpInterface",
         ":MemRefDialect",
         ":SideEffectInterfaces",
+        ":SubsetOpInterface",
         ":Support",
         ":TensorDialect",
         ":ValueBoundsOpInterface",
@@ -5036,6 +5067,7 @@ cc_library(
         ":ArithTransforms",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":DialectUtils",
         ":FuncDialect",
@@ -5289,6 +5321,14 @@ gentbl_cc_library(
             ["-gen-op-interface-defs"],
             "include/mlir/Dialect/LLVMIR/LLVMInterfaces.cpp.inc",
         ),
+        (
+            ["-gen-attr-interface-decls"],
+            "include/mlir/Dialect/LLVMIR/LLVMAttrInterfaces.h.inc",
+        ),
+        (
+            ["-gen-attr-interface-defs"],
+            "include/mlir/Dialect/LLVMIR/LLVMAttrInterfaces.cpp.inc",
+        ),
         (
             ["-gen-type-interface-decls"],
             "include/mlir/Dialect/LLVMIR/LLVMTypeInterfaces.h.inc",
@@ -5338,7 +5378,6 @@ cc_library(
             "include/mlir/Dialect/LLVMIR/*X86Vector*.h",
         ],
     ) + [
-        "include/mlir/Transforms/InliningUtils.h",
         "include/mlir/Transforms/Mem2Reg.h",
     ],
     includes = ["include"],
@@ -5349,6 +5388,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LLVMDialectInterfaceIncGen",
         ":LLVMIntrinsicOpsIncGen",
         ":LLVMOpsIncGen",
@@ -5546,6 +5586,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":ControlFlowInterfaces",
         ":DLTIDialect",
         ":FunctionInterfaces",
@@ -5555,6 +5596,7 @@ cc_library(
         ":IR",
         ":InferIntRangeInterface",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LLVMDialect",
         ":MemRefDialect",
         ":SCFDialect",
@@ -5643,6 +5685,7 @@ cc_library(
         ":AsmParser",
         ":AsyncDialect",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":ControlFlowDialect",
         ":DLTIDialect",
         ":DialectUtils",
@@ -6889,7 +6932,7 @@ cc_library(
     srcs = glob([
         "lib/Dialect/SPIRV/IR/*.cpp",
         "lib/Dialect/SPIRV/IR/*.h",
-    ]) + ["include/mlir/Transforms/InliningUtils.h"],
+    ]),
     hdrs = glob([
         "include/mlir/Dialect/SPIRV/IR/*.h",
     ]),
@@ -6901,6 +6944,7 @@ cc_library(
         ":GPUDialect",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":Parser",
         ":SPIRVAttrUtilsGen",
         ":SPIRVAttributesIncGen",
@@ -7305,7 +7349,6 @@ gentbl_cc_library(
 cc_library(
     name = "TensorDialect",
     srcs = [
-        "include/mlir/Transforms/InliningUtils.h",
         "lib/Dialect/Tensor/IR/TensorDialect.cpp",
         "lib/Dialect/Tensor/IR/TensorOps.cpp",
         "lib/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.cpp",
@@ -7319,6 +7362,7 @@ cc_library(
         ":AffineDialect",
         ":ArithDialect",
         ":ArithUtils",
+        ":BufferizationInterfaces",
         ":CastInterfaces",
         ":ComplexDialect",
         ":ControlFlowInterfaces",
@@ -7326,13 +7370,16 @@ cc_library(
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":ParallelCombiningOpInterface",
         ":ShapedOpInterfaces",
         ":SideEffectInterfaces",
+        ":SubsetOpInterface",
         ":Support",
         ":TensorOpsIncGen",
         ":TilingInterface",
+        ":TransformDialectInterfaces",
         ":ValueBoundsOpInterface",
         ":ViewLikeInterface",
         "//llvm:Support",
@@ -7426,6 +7473,7 @@ cc_library(
         ":ArithDialect",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":DialectUtils",
         ":FuncDialect",
@@ -7516,15 +7564,19 @@ cc_library(
 
 cc_library(
     name = "TransformUtils",
-    srcs = glob([
-        "lib/Transforms/Utils/*.cpp",
-        "lib/Transforms/Utils/*.h",
-    ]),
+    srcs = glob(
+        include = [
+            "lib/Transforms/Utils/*.cpp",
+            "lib/Transforms/Utils/*.h",
+        ],
+        exclude = ["lib/Transforms/Utils/InliningUtils.cpp"],
+    ),
     hdrs = glob(
-        [
-            "include/mlir/Transforms/*.h",
+        include = ["include/mlir/Transforms/*.h"],
+        exclude = [
+            "include/mlir/Transforms/InliningUtils.h",
+            "include/mlir/Transforms/Passes.h",
         ],
-        exclude = ["include/mlir/Transforms/Passes.h"],
     ),
     includes = ["include"],
     deps = [
@@ -7532,6 +7584,7 @@ cc_library(
         ":ControlFlowInterfaces",
         ":FunctionInterfaces",
         ":IR",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MemorySlotInterfaces",
         ":Pass",
@@ -7861,6 +7914,7 @@ cc_library(
         ":ControlFlowInterfaces",
         ":FunctionInterfaces",
         ":IR",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MemorySlotInterfaces",
         ":Pass",
@@ -10948,6 +11002,7 @@ cc_library(
         ":ArithUtils",
         ":AsmParser",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":ComplexDialect",
         ":ControlFlowInterfaces",
         ":CopyOpInterface",
@@ -10957,6 +11012,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LinalgEnumsIncGen",
         ":LinalgInterfacesIncGen",
         ":LinalgNamedStructuredOpsYamlIncGen",
@@ -10964,11 +11020,12 @@ cc_library(
         ":LinalgStructuredOpsIncGen",
         ":MathDialect",
         ":MemRefDialect",
+        ":MeshShardingInterface",
         ":Parser",
         ":SCFDialect",
-        ":MeshShardingInterface",
         ":SideEffectInterfaces",
         ":SparseTensorDialect",
+        ":SubsetOpInterface",
         ":Support",
         ":TensorDialect",
         ":TilingInterface",
@@ -11102,6 +11159,7 @@ cc_library(
         ":ArithTransforms",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":ComplexDialect",
         ":ControlFlowDialect",
@@ -11121,12 +11179,12 @@ cc_library(
         ":MemRefDialect",
         ":MemRefTransforms",
         ":MeshDialect",
+        ":MeshShardingInterface",
         ":MeshTransforms",
         ":Pass",
         ":SCFDialect",
         ":SCFTransforms",
         ":SCFUtils",
-        ":MeshShardingInterface",
         ":SparseTensorDialect",
         ":SubsetOpInterface",
         ":Support",
@@ -11703,6 +11761,7 @@ cc_library(
         ":FuncDialect",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MeshDialect",
         ":MeshShardingInterface",
@@ -12002,6 +12061,26 @@ gentbl_cc_library(
     deps = [":TransformDialectTdFiles"],
 )
 
+cc_library(
+    name = "TransformDialectInterfaces",
+    # FIXME: Change this once https://github.com/llvm/llvm-project/pull/85221 lands
+    hdrs = [
+        "include/mlir/Dialect/Transform/IR/TransformInterfaces.h",
+        "include/mlir/Dialect/Transform/IR/TransformTypes.h",
+    ],
+    deps = [
+        ":CastInterfaces",
+        ":IR",
+        ":Rewrite",
+        ":Support",
+        ":TransformDialectInterfacesIncGen",
+        ":TransformDialectUtils",
+        ":TransformTypesIncGen",
+        ":Transforms",
+        "//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "TransformDialect",
     srcs = glob(["lib/Dialect/Transform/IR/*.cpp"]),
@@ -12317,6 +12396,7 @@ cc_library(
         ":ConvertToLLVMInterface",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":SideEffectInterfaces",
         "//llvm:Support",
     ],
@@ -12562,13 +12642,15 @@ gentbl_cc_library(
 cc_library(
     name = "ArithDialect",
     srcs = [
+        "include/mlir/Interfaces/ValueBoundsOpInterface.h",
         "lib/Dialect/Arith/IR/ArithDialect.cpp",
         "lib/Dialect/Arith/IR/ArithOps.cpp",
         "lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp",
-    ],
+    ] + glob([
+        "include/mlir/Analysis/**/*.h",
+    ]),
     hdrs = [
         "include/mlir/Dialect/Arith/IR/Arith.h",
-        "include/mlir/Transforms/InliningUtils.h",
     ],
     includes = ["include"],
     deps = [
@@ -12576,15 +12658,21 @@ cc_library(
         ":ArithCanonicalizationIncGen",
         ":ArithOpsIncGen",
         ":ArithOpsInterfacesIncGen",
+        ":BufferizationInterfaces",
         ":CastInterfaces",
         ":CommonFolders",
+        ":ControlFlowInterfaces",
         ":ConvertToLLVMInterface",
+        ":DestinationStyleOpInterface",
         ":IR",
         ":InferIntRangeCommon",
         ":InferIntRangeInterface",
         ":InferTypeOpInterface",
+        ":InliningUtils",
+        ":Pass",
         ":Support",
         ":UBDialect",
+        ":ValueBoundsOpInterfaceIncGen",
         ":VectorInterfaces",
         "//llvm:Support",
     ],
@@ -12622,6 +12710,7 @@ cc_library(
         ":ArithPassIncGen",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":FuncDialect",
         ":FuncTransforms",
@@ -12744,6 +12833,7 @@ cc_library(
         ":ConvertToLLVMInterface",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MathBaseIncGen",
         ":MathOpsIncGen",
         ":SideEffectInterfaces",
@@ -12882,8 +12972,10 @@ cc_library(
     ],
     includes = ["include"],
     deps = [
+        ":AllocationOpInterface",
         ":ArithDialect",
         ":ArithUtils",
+        ":BufferizationInterfaces",
         ":CastInterfaces",
         ":ComplexDialect",
         ":ControlFlowInterfaces",
@@ -12892,9 +12984,11 @@ cc_library(
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MemRefBaseIncGen",
         ":MemRefOpsIncGen",
         ":MemorySlotInterfaces",
+        ":RuntimeVerifiableOpInterface",
         ":ShapedOpInterfaces",
         ":Support",
         ":ValueBoundsOpInterface",
@@ -13145,11 +13239,13 @@ cc_library(
         ":ControlFlowInterfaces",
         ":FunctionInterfaces",
         ":IR",
+        ":InliningUtils",
         ":MLProgramAttributesIncGen",
         ":MLProgramOpsIncGen",
         ":MLProgramTypesIncGen",
         ":Pass",
         ":Support",
+        ":Transforms",
         "//llvm:Support",
     ],
 )
@@ -13165,6 +13261,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":FuncDialect",
         ":IR",
         ":MLProgramDialect",
@@ -13448,6 +13545,7 @@ cc_library(
     deps = [
         ":BufferizationDialect",
         ":BufferizationEnumsIncGen",
+        ":BufferizationInterfaces",
         ":BufferizationTransformOpsIncGen",
         ":BufferizationTransforms",
         ":FunctionInterfaces",
@@ -13481,6 +13579,26 @@ gentbl_cc_library(
     ],
 )
 
+cc_library(
+    name = "BufferizationInterfaces",
+    srcs = [
+        "include/mlir/Analysis/Liveness.h",
+    ],
+    hdrs = [
+        "include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h",
+        "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h",
+    ],
+    includes = ["include"],
+    deps = [
+        ":BufferDeallocationOpInterfaceIncGen",
+        ":BufferizableOpInterfaceIncGen",
+        ":BufferizationEnumsIncGen",
+        ":IR",
+        ":Support",
+        "//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "BufferizationDialect",
     srcs = [
@@ -13491,8 +13609,6 @@ cc_library(
         "lib/Dialect/Bufferization/IR/UnstructuredControlFlow.cpp",
     ],
     hdrs = [
-        "include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h",
-        "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h",
         "include/mlir/Dialect/Bufferization/IR/Bufferization.h",
         "include/mlir/Dialect/Bufferization/IR/DstBufferizableOpInterfaceImpl.h",
         "include/mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h",
@@ -13506,7 +13622,7 @@ cc_library(
         ":BufferDeallocationOpInterfaceIncGen",
         ":BufferizableOpInterfaceIncGen",
         ":BufferizationBaseIncGen",
-        ":BufferizationEnumsIncGen",
+        ":BufferizationInterfaces",
         ":BufferizationOpsIncGen",
         ":ControlFlowInterfaces",
         ":CopyOpInterface",
@@ -13515,6 +13631,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MemRefDialect",
         ":SparseTensorDialect",
         ":SubsetOpInterface",
@@ -13555,7 +13672,7 @@ cc_library(
         ":Analysis",
         ":ArithDialect",
         ":BufferizationDialect",
-        ":BufferizationEnumsIncGen",
+        ":BufferizationInterfaces",
         ":BufferizationPassIncGen",
         ":ControlFlowDialect",
         ":ControlFlowInterfaces",
@@ -13607,6 +13724,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationToMemRef",
         ":BufferizationTransforms",
         ":FuncDialect",
@@ -13912,7 +14030,6 @@ gentbl_cc_library(
 cc_library(
     name = "UBDialect",
     srcs = [
-        "include/mlir/Transforms/InliningUtils.h",
         "lib/Dialect/UB/IR/UBOps.cpp",
     ],
     hdrs = ["include/mlir/Dialect/UB/IR/UBOps.h"],
@@ -13920,6 +14037,7 @@ cc_library(
     deps = [
         ":ConvertToLLVMInterface",
         ":IR",
+        ":InliningUtils",
         ":SideEffectInterfaces",
         ":UBOpsIncGen",
         ":UBOpsInterfacesIncGen",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 91706af935ac5..ccfef3f243409 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -399,6 +399,7 @@ cc_library(
         "//mlir:IR",
         "//mlir:InferIntRangeInterface",
         "//mlir:InferTypeOpInterface",
+        "//mlir:InliningUtils",
         "//mlir:LLVMDialect",
         "//mlir:LinalgDialect",
         "//mlir:LoopLikeInterface",
@@ -407,7 +408,6 @@ cc_library(
         "//mlir:SideEffectInterfaces",
         "//mlir:Support",
         "//mlir:TensorDialect",
-        "//mlir:TransformUtils",
         "//mlir:Transforms",
         "//mlir:ViewLikeInterface",
     ],
@@ -532,7 +532,7 @@ cc_library(
         "//mlir:PDLInterpDialect",
         "//mlir:Pass",
         "//mlir:Support",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -570,7 +570,7 @@ cc_library(
         "//mlir:SCFDialect",
         "//mlir:SPIRVDialect",
         "//mlir:Support",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -600,7 +600,7 @@ cc_library(
         "//mlir:Pass",
         "//mlir:SCFDialect",
         "//mlir:SCFTransforms",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -695,7 +695,6 @@ cc_library(
         "//mlir:SCFToControlFlow",
         "//mlir:SPIRVDialect",
         "//mlir:ToLLVMIRTranslation",
-        "//mlir:TransformUtils",
         "//mlir:Transforms",
         "//mlir:VectorDialect",
         "//mlir:VectorToLLVM",
@@ -728,7 +727,6 @@ cc_library(
         "//mlir:SCFTransforms",
         "//mlir:TensorDialect",
         "//mlir:TensorTransforms",
-        "//mlir:TransformUtils",
         "//mlir:Transforms",
         "//mlir:VectorDialect",
         "//mlir:VectorToSCF",
@@ -770,7 +768,7 @@ cc_library(
         "//mlir:MathTransforms",
         "//mlir:Pass",
         "//mlir:SCFDialect",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
         "//mlir:VectorDialect",
         "//mlir:X86VectorDialect",
     ],
@@ -786,7 +784,7 @@ cc_library(
         "//mlir:IR",
         "//mlir:MathDialect",
         "//mlir:Pass",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
         "//mlir:VCIXDialect",
         "//mlir:VectorDialect",
     ],
@@ -850,7 +848,7 @@ cc_library(
         "//mlir:Pass",
         "//mlir:SCFDialect",
         "//mlir:Support",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -869,7 +867,7 @@ cc_library(
         "//mlir:SCFDialect",
         "//mlir:SCFTransforms",
         "//mlir:SCFUtils",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -994,7 +992,7 @@ cc_library(
         "//mlir:FuncTransforms",
         "//mlir:IR",
         "//mlir:Pass",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -1033,7 +1031,7 @@ cc_library(
         "//mlir:SCFDialect",
         "//mlir:Support",
         "//mlir:TensorDialect",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
         "//mlir:VectorDialect",
         "//mlir:VectorToSCF",
         "//mlir:VectorTransforms",
@@ -1119,6 +1117,6 @@ cc_library(
         "//mlir:Parser",
         "//mlir:Pass",
         "//mlir:Support",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
diff --git a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
index a749a11096c37..252b9ec951f6b 100644
--- a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
@@ -61,8 +61,8 @@ cc_test(
     deps = [
         "//llvm:Support",
         "//llvm:TestingSupport",
-        "//mlir:BytecodeReader",
         "//mlir:ArithDialect",
+        "//mlir:BytecodeReader",
         "//mlir:ControlFlowInterfaces",
         "//mlir:DLTIDialect",
         "//mlir:DataLayoutInterfaces",